diff --git a/Makefile b/Makefile
index d5741435..69e24d41 100644
--- a/Makefile
+++ b/Makefile
@@ -40,8 +40,8 @@ LLVM_CONFIG=$(shell which llvm-config)
 CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir)
 
 # Enable ARM by request
-# To enable: make ARM_ENABLED=ON
-ARM_ENABLED=OFF
+# To enable: make ARM_ENABLED=1
+ARM_ENABLED=0
 
 # Add llvm bin to the path so any scripts run will go to the right llvm-config
 LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir)
@@ -65,7 +65,7 @@ LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker
 ifeq ($(shell $(LLVM_CONFIG) --components |grep -c option), 1)
   LLVM_COMPONENTS+=option
 endif
-ifeq ($(ARM_ENABLED), ON)
+ifneq ($(ARM_ENABLED), 0)
   LLVM_COMPONENTS+=arm
 endif
 LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs $(LLVM_COMPONENTS))
@@ -79,6 +79,10 @@ CLANG_LIBS = -lclangFrontend -lclangDriver \
 ISPC_LIBS=$(shell $(LLVM_CONFIG) --ldflags) $(CLANG_LIBS) $(LLVM_LIBS) \
 	-lpthread
 
+ifeq ($(LLVM_VERSION),LLVM_3_4)
+  ISPC_LIBS += -lcurses
+endif
+
 ifeq ($(ARCH_OS),Linux)
   ISPC_LIBS += -ldl
 endif
@@ -111,7 +115,7 @@ OPT=-O2
 CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE) \
 	-Wall $(LLVM_VERSION_DEF) \
 	-DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\""
-ifeq ($(ARM_ENABLED), ON)
+ifneq ($(ARM_ENABLED), 0)
   CXXFLAGS+=-DISPC_ARM_ENABLED
 endif
 
@@ -132,10 +136,11 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
 	type.cpp util.cpp
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
-TARGETS=avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \
+TARGETS=avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
+	sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \
 	generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
-ifeq ($(ARM_ENABLED), ON)
-  TARGETS+=neon
+ifneq ($(ARM_ENABLED), 0)
+  TARGETS+=neon-32 neon-16 neon-8
 endif
 
 # These files need to be compiled in two versions - 32 and 64 bits.
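The `ARM_ENABLED != 0` branch above both adds the `arm` LLVM component and passes `-DISPC_ARM_ENABLED` to the C++ build, which is what gates the NEON code paths in builtins.cpp later in this patch. As a rough illustration of that pattern (the `lIsNeonTarget()` helper below is hypothetical and not part of the patch; `Target`, `getISA()`, and the `Target::NEON8/16/32` enumerators are the names used elsewhere in this diff):

```cpp
// Hypothetical sketch only -- not code from this patch.
// Assumes ispc's own Target declaration (class Target in ispc.h).
#include "ispc.h"

#ifdef ISPC_ARM_ENABLED  // defined by the Makefile when built with `make ARM_ENABLED=1`
static bool lIsNeonTarget(const Target *target) {
    // This patch splits the single NEON ISA into three mask-width variants
    // (neon-8, neon-16, neon-32), so NEON-specific checks now test all three.
    return target->getISA() == Target::NEON32 ||
           target->getISA() == Target::NEON16 ||
           target->getISA() == Target::NEON8;
}
#endif // ISPC_ARM_ENABLED
```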
BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) @@ -145,12 +150,12 @@ BUILTINS_OBJS_32=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_TARGET:.ll=-32bi BUILTINS_OBJS_64=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_TARGET:.ll=-64bit.o))) BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC_COMMON:.ll=.o))) \ $(BUILTINS_OBJS_32) $(BUILTINS_OBJS_64) \ - builtins-c-32.cpp builtins-c-64.cpp + builtins-c-32.cpp builtins-c-64.cpp BISON_SRC=parse.yy FLEX_SRC=lex.ll OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \ - stdlib_generic_ispc.o stdlib_x86_ispc.o \ + stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \ $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o)) default: ispc @@ -256,12 +261,22 @@ objs/builtins-c-64.cpp: builtins/builtins.c @echo Creating C++ source from builtins definition file $< @$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | python bitcode2cpp.py c 64 > $@ -objs/stdlib_generic_ispc.cpp: stdlib.ispc - @echo Creating C++ source from $< for generic - @$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \ - python stdlib2cpp.py generic > $@ +objs/stdlib_mask1_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask1 + @$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask1 > $@ -objs/stdlib_x86_ispc.cpp: stdlib.ispc - @echo Creating C++ source from $< for x86 - @$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \ - python stdlib2cpp.py x86 > $@ +objs/stdlib_mask8_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask8 + @$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask8 > $@ + +objs/stdlib_mask16_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask16 + @$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask16 > $@ + +objs/stdlib_mask32_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask32 + @$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + python stdlib2cpp.py mask32 > $@ diff --git a/builtins.cpp b/builtins.cpp index 7d99abf9..f3a0cf59 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -112,10 +112,7 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) { return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64; // varying - if (LLVMTypes::MaskType != LLVMTypes::Int32VectorType && - t == LLVMTypes::MaskType) - return AtomicType::VaryingBool; - else if (t == LLVMTypes::Int8VectorType) + if (t == LLVMTypes::Int8VectorType) return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8; else if (t == LLVMTypes::Int16VectorType) return intAsUnsigned ? AtomicType::VaryingUInt16 : AtomicType::VaryingInt16; @@ -127,6 +124,8 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) { return AtomicType::VaryingDouble; else if (t == LLVMTypes::Int64VectorType) return intAsUnsigned ? 
AtomicType::VaryingUInt64 : AtomicType::VaryingInt64; + else if (t == LLVMTypes::MaskType) + return AtomicType::VaryingBool; // pointers to uniform else if (t == LLVMTypes::Int8PointerType) @@ -488,7 +487,6 @@ lSetInternalFunctions(llvm::Module *module) { "__num_cores", "__packed_load_active", "__packed_store_active", - "__pause", "__popcnt_int32", "__popcnt_int64", "__prefetch_read_uniform_1", @@ -502,6 +500,8 @@ lSetInternalFunctions(llvm::Module *module) { "__rdrand_i64", "__reduce_add_double", "__reduce_add_float", + "__reduce_add_int8", + "__reduce_add_int16", "__reduce_add_int32", "__reduce_add_int64", "__reduce_equal_double", @@ -581,15 +581,6 @@ lSetInternalFunctions(llvm::Module *module) { "__stdlib_sinf", "__stdlib_tan", "__stdlib_tanf", - "__svml_sin", - "__svml_cos", - "__svml_sincos", - "__svml_tan", - "__svml_atan", - "__svml_atan2", - "__svml_exp", - "__svml_log", - "__svml_pow", "__undef_uniform", "__undef_varying", "__vec4_add_float", @@ -656,7 +647,9 @@ AddBitcodeToModule(const unsigned char *bitcode, int length, // the values for an ARM target. This maybe won't cause problems // in the generated code, since bulitins.c doesn't do anything too // complex w.r.t. struct layouts, etc. - if (g->target->getISA() != Target::NEON) + if (g->target->getISA() != Target::NEON32 && + g->target->getISA() != Target::NEON16 && + g->target->getISA() != Target::NEON8) #endif // !__arm__ { Assert(bcTriple.getArch() == llvm::Triple::UnknownArch || @@ -819,13 +812,32 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // Next, add the target's custom implementations of the various needed // builtin functions (e.g. __masked_store_32(), etc). switch (g->target->getISA()) { + #ifdef ISPC_ARM_ENABLED - case Target::NEON: { + case Target::NEON8: { if (runtime32) { - EXPORT_MODULE(builtins_bitcode_neon_32bit); + EXPORT_MODULE(builtins_bitcode_neon_8_32bit); } else { - EXPORT_MODULE(builtins_bitcode_neon_64bit); + EXPORT_MODULE(builtins_bitcode_neon_8_64bit); + } + break; + } + case Target::NEON16: { + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_neon_16_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_neon_16_64bit); + } + break; + } + case Target::NEON32: { + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_neon_32_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_neon_32_64bit); } break; } @@ -865,10 +877,31 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod break; case 8: if (runtime32) { - EXPORT_MODULE(builtins_bitcode_sse4_x2_32bit); + if (g->target->getMaskBitCount() == 16) { + EXPORT_MODULE(builtins_bitcode_sse4_16_32bit); + } + else { + Assert(g->target->getMaskBitCount() == 32); + EXPORT_MODULE(builtins_bitcode_sse4_x2_32bit); + } } else { - EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit); + if (g->target->getMaskBitCount() == 16) { + EXPORT_MODULE(builtins_bitcode_sse4_16_64bit); + } + else { + Assert(g->target->getMaskBitCount() == 32); + EXPORT_MODULE(builtins_bitcode_sse4_x2_64bit); + } + } + break; + case 16: + Assert(g->target->getMaskBitCount() == 8); + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_sse4_8_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_sse4_8_64bit); } break; default: @@ -1017,8 +1050,6 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod symbolTable); lDefineConstantInt("__math_lib_ispc_fast", (int)Globals::Math_ISPCFast, module, symbolTable); - lDefineConstantInt("__math_lib_svml", (int)Globals::Math_SVML, module, - symbolTable); 
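The SSE4 dispatch above now keys on both the vector width and the mask width: an 8-wide program with a 16-bit mask gets the new `sse4_16` builtins, an 8-wide program with a 32-bit mask keeps the existing `sse4_x2` builtins, and a 16-wide program (8-bit mask) gets the new `sse4_8` builtins. A condensed sketch of that mapping for the 32-bit runtime case follows; the helper function and its string return are hypothetical, while the module names are the ones passed to `EXPORT_MODULE()` above.

```cpp
// Hypothetical summary of the SSE4 builtins selection added above
// (32-bit runtime case); this helper itself is not part of the patch.
static const char *lSSE4BuiltinsModule32(int vectorWidth, int maskBitCount) {
    if (vectorWidth == 8)
        return (maskBitCount == 16) ? "builtins_bitcode_sse4_16_32bit"   // sse4-16 target
                                    : "builtins_bitcode_sse4_x2_32bit";  // 32-bit mask (sse4-x2)
    if (vectorWidth == 16)
        return "builtins_bitcode_sse4_8_32bit";                          // sse4-8 target, 8-bit mask
    return nullptr;                                                      // other widths unchanged by this patch
}
```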
lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module, symbolTable); lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, @@ -1040,16 +1071,30 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // If the user wants the standard library to be included, parse the // serialized version of the stdlib.ispc file to get its // definitions added. + extern char stdlib_mask1_code[], stdlib_mask8_code[]; + extern char stdlib_mask16_code[], stdlib_mask32_code[]; if (g->target->getISA() == Target::GENERIC && - g->target->getVectorWidth() != 1) { // 1 wide uses x86 stdlib - extern char stdlib_generic_code[]; - yy_scan_string(stdlib_generic_code); - yyparse(); + g->target->getVectorWidth() == 1) { // 1 wide uses 32 stdlib + yy_scan_string(stdlib_mask32_code); } else { - extern char stdlib_x86_code[]; - yy_scan_string(stdlib_x86_code); - yyparse(); + switch (g->target->getMaskBitCount()) { + case 1: + yy_scan_string(stdlib_mask1_code); + break; + case 8: + yy_scan_string(stdlib_mask8_code); + break; + case 16: + yy_scan_string(stdlib_mask16_code); + break; + case 32: + yy_scan_string(stdlib_mask32_code); + break; + default: + FATAL("Unhandled mask bit size for stdlib.ispc"); + } } + yyparse(); } } diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index dcbe0a66..1d317713 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -277,3 +277,9 @@ define double @__max_uniform_double(double, double) nounwind readnone alwaysinli sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1) ret double %ret } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index 8c6b7753..8fb2e427 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -134,23 +134,6 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always ret <16 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; svml - -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones 4x with our 16-wide -; vectors... 
- -declare <16 x float> @__svml_sin(<16 x float>) -declare <16 x float> @__svml_cos(<16 x float>) -declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) -declare <16 x float> @__svml_tan(<16 x float>) -declare <16 x float> @__svml_atan(<16 x float>) -declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) -declare <16 x float> @__svml_exp(<16 x float>) -declare <16 x float> @__svml_log(<16 x float>) -declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max @@ -271,6 +254,33 @@ reduce_equal(16) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int32 ops +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline { + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <16 x i16> @__add_varying_i16(<16 x i16>, + <16 x i16>) nounwind readnone alwaysinline { + %r = add <16 x i16> %0, %1 + ret <16 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline { + reduce16(i16, @__add_varying_i16, @__add_uniform_i16) +} + define <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) nounwind readnone alwaysinline { %s = add <16 x i32> %0, %1 diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index e6ab3a4b..adaed9ba 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -134,23 +134,6 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; svml - -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones twice with our 8-wide -; vectors... 
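The `__reduce_add_int8` implementations added to the AVX targets above lean on `llvm.x86.sse2.psad.bw`: a PSADBW against an all-zero vector sums the byte lanes into two 64-bit halves, which are then added together and truncated to `i16`. A scalar reference sketch of what that computes (an assumption-labeled sketch, not part of the patch; it treats the lanes as unsigned bytes, which is how PSADBW operates):

```cpp
#include <cstdint>

// Scalar reference for the psad.bw-based __reduce_add_int8 above:
// sum 16 byte lanes into a 16-bit result.
static uint16_t reduce_add_int8_reference(const uint8_t lane[16]) {
    uint64_t lo = 0, hi = 0;
    for (int i = 0; i < 8; ++i)  lo += lane[i];   // psadbw: low 8 bytes -> low qword
    for (int i = 8; i < 16; ++i) hi += lane[i];   // psadbw: high 8 bytes -> high qword
    return (uint16_t)(lo + hi);                   // add the halves, truncate to i16
}
```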
- -declare <8 x float> @__svml_sin(<8 x float>) -declare <8 x float> @__svml_cos(<8 x float>) -declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) -declare <8 x float> @__svml_tan(<8 x float>) -declare <8 x float> @__svml_atan(<8 x float>) -declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) -declare <8 x float> @__svml_exp(<8 x float>) -declare <8 x float> @__svml_log(<8 x float>) -declare <8 x float> @__svml_pow(<8 x float>, <8 x float>) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max @@ -217,7 +200,6 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline { ret float %sum } - define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline { reduce8(float, @__min_varying_float, @__min_uniform_float) } @@ -229,6 +211,42 @@ define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline { reduce_equal(8) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int8 ops + +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; horizontal int16 ops + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal int32 ops @@ -257,20 +275,14 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__max_varying_int32, @__max_uniform_int32) } - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;; horizontal uint32 ops - define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32) } - define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline { reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32) } - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; horizontal double ops @@ -329,9 +341,6 @@ define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline { } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;;; horizontal uint64 ops - define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline { reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) } diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 9b747e2e..3472c207 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -471,6 +471,15 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline { ret i64 %call } +define i8 @__reduce_add_int8(<1 x i8> %v) nounwind readonly alwaysinline { + %r = extractelement <1 x i8> %v, i32 0 + ret i8 %r +} + +define 
i16 @__reduce_add_int16(<1 x i16> %v) nounwind readonly alwaysinline { + %r = extractelement <1 x i16> %v, i32 0 + ret i16 %r +} define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline { %r = extractelement <1 x float> %v, i32 0 @@ -638,104 +647,6 @@ define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw } - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm.sin.f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - unary1to1(float,@llvm.sin.f32) - -} - -define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm.cos.f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - unary1to1(float, @llvm.cos.f32) - -} - -define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline { -; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0) -; store <1 x float> %s, <1 x float> * %1 -; ret void - %sin = call <1 x float> @__svml_sin (<1 x float> %0) - %cos = call <1 x float> @__svml_cos (<1 x float> %0) - store <1 x float> %sin, <1 x float> * %1 - store <1 x float> %cos, <1 x float> * %2 - ret void -} - -define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0) - ;ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm_tan_f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - ;unasry1to1(float, @llvm.tan.f32) - ; UNSUPPORTED! - ret <1 x float > %0 -} - -define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { -; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0) -; ret <1 x float> %ret - ;%r = extractelement <1 x float> %0, i32 0 - ;%s = call float @llvm_atan_f32(float %r) - ;%rv = insertelement <1 x float> undef, float %r, i32 0 - ;ret <1 x float> %rv - ;unsary1to1(float,@llvm.atan.f32) - ;UNSUPPORTED! - ret <1 x float > %0 - -} - -define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1) - ;ret <1 x float> %ret - ;%y = extractelement <1 x float> %0, i32 0 - ;%x = extractelement <1 x float> %1, i32 0 - ;%q = fdiv float %y, %x - ;%a = call float @llvm.atan.f32 (float %q) - ;%rv = insertelement <1 x float> undef, float %a, i32 0 - ;ret <1 x float> %rv - ; UNSUPPORTED! 
- ret <1 x float > %0 -} - -define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0) - ;ret <1 x float> %ret - unary1to1(float, @llvm.exp.f32) -} - -define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0) - ;ret <1 x float> %ret - unary1to1(float, @llvm.log.f32) -} - -define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline { - ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1) - ;ret <1 x float> %ret - %r = extractelement <1 x float> %0, i32 0 - %e = extractelement <1 x float> %1, i32 0 - %s = call float @llvm.pow.f32(float %r,float %e) - %rv = insertelement <1 x float> undef, float %s, i32 0 - ret <1 x float> %rv - -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max @@ -953,3 +864,9 @@ declare float @__half_to_float_uniform(i16 %v) nounwind readnone declare @__half_to_float_varying( %v) nounwind readnone declare i16 @__float_to_half_uniform(float %v) nounwind readnone declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index bbf1b842..c683ff45 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -202,22 +202,6 @@ declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone declare i32 @__count_leading_zeros_i32(i32) nounwind readnone declare i64 @__count_leading_zeros_i64(i64) nounwind readnone -;; svml - -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones twice with our 8-wide -; vectors... 
- -declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions @@ -226,14 +210,16 @@ declare i1 @__any() nounwind readnone declare i1 @__all() nounwind readnone declare i1 @__none() nounwind readnone +declare i16 @__reduce_add_int8() nounwind readnone +declare i32 @__reduce_add_int16() nounwind readnone + declare float @__reduce_add_float() nounwind readnone declare float @__reduce_min_float() nounwind readnone declare float @__reduce_max_float() nounwind readnone -declare i32 @__reduce_add_int32() nounwind readnone +declare i64 @__reduce_add_int32() nounwind readnone declare i32 @__reduce_min_int32() nounwind readnone declare i32 @__reduce_max_int32() nounwind readnone - declare i32 @__reduce_min_uint32() nounwind readnone declare i32 @__reduce_max_uint32() nounwind readnone @@ -244,7 +230,6 @@ declare double @__reduce_max_double() nounwind readnone declare i64 @__reduce_add_int64() nounwind readnone declare i64 @__reduce_min_int64() nounwind readnone declare i64 @__reduce_max_int64() nounwind readnone - declare i64 @__reduce_min_uint64() nounwind readnone declare i64 @__reduce_max_uint64() nounwind readnone @@ -379,3 +364,8 @@ declare void @__prefetch_read_uniform_2(i8 * nocapture) nounwind declare void @__prefetch_read_uniform_3(i8 * nocapture) nounwind declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-neon-16.ll b/builtins/target-neon-16.ll new file mode 100644 index 00000000..a0575927 --- /dev/null +++ b/builtins/target-neon-16.ll @@ -0,0 +1,517 @@ +;; +;; target-neon-16.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
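The target-generic-common.ll declarations above also widen the horizontal-add result types (`__reduce_add_int8` returns `i16`, `__reduce_add_int16` returns `i32`, `__reduce_add_int32` now returns `i64`). A minimal sketch of that convention, under the assumption that the wider return type is there so the lane sum cannot overflow the element type (the `lane`/`width` parameters are illustrative, not part of the patch):

```cpp
#include <cstdint>

// Sketch of the widened-reduction convention for the int32 case:
// i32 lanes accumulate into an i64 result.
static int64_t reduce_add_int32_reference(const int32_t *lane, int width) {
    int64_t sum = 0;
    for (int i = 0; i < width; ++i)
        sum += lane[i];
    return sum;
}
```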
+ +define(`WIDTH',`8') +define(`MASK',`i16') + +include(`util.m4') +include(`target-neon-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone { + unary4to8conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v) + ret <8 x float> %r +} + +define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone { + unary4to8conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v) + ret <8 x i16> %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast <8 x float> %0 to <8 x i32> + %bitop.i.i = and <8 x i32> %float_to_int_bitcast.i.i.i.i, + + %bitop.i = xor <8 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %int_to_float_bitcast.i.i40.i, + + %binop21.i = fadd <8 x float> %binop.i, + + %float_to_int_bitcast.i.i.i = bitcast <8 x float> %binop21.i to <8 x i32> + %bitop31.i = xor <8 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop31.i to <8 x float> + ret <8 x float> %int_to_float_bitcast.i.i.i +} + +define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind + %bincmp.i = fcmp ogt <8 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32> + %bitop.i = and <8 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <8 x float> %binop.i +} + +define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind + %bincmp.i = fcmp olt <8 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32> + %bitop.i = and <8 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <8 x float> %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare @__round_varying_double() nounwind readnone +declare @__floor_varying_double() nounwind readnone +declare @__ceil_varying_double() nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__max_varying_float(, + ) nounwind readnone { + binary4to8(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1) + ret %r +} + +define @__min_varying_float(, + ) nounwind readnone { + binary4to8(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1) + ret %r +} + +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 
x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define @__min_varying_int32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1) + ret %r +} + +define @__max_varying_int32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1) + ret %r +} + +define @__min_varying_uint32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1) + ret %r +} + +define @__max_varying_uint32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1) + ret %r +} + +;; sqrt/rsqrt/rcp + +declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rcp_varying_float( %d) nounwind readnone { + unary4to8(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d) + binary4to8(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0) + %x1 = fmul %x0, %x0_nr + binary4to8(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %d) nounwind readnone { + unary4to8(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d) + %x0_2 = fmul %x0, %x0 + binary4to8(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2) + %x1 = fmul %x0, %x0_nr + %x1_2 = fmul %x1, %x1 + binary4to8(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +define float @__rsqrt_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + %vr = call <8 x float> @__rsqrt_varying_float(<8 x float> %vs) + %r = extractelement <8 x float> %vr, i32 0 + ret float %r +} + +define float @__rcp_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + %vr = call <8 x float> @__rcp_varying_float(<8 x float> %vs) + %r = extractelement <8 x float> %vr, i32 0 + ret float %r +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define @__sqrt_varying_float() nounwind readnone { + unary4to8(result, float, @llvm.sqrt.v4f32, %0) +;; this returns nan for v=0, which is undesirable.. 
+;; %rsqrt = call @__rsqrt_varying_float( %0) +;; %result = fmul <4 x float> %rsqrt, %0 + ret <8 x float> %result +} + +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) + +define @__sqrt_varying_double() nounwind readnone { + unary4to8(r, double, @llvm.sqrt.v4f64, %0) + ret %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reductions + +define i64 @__movmsk() nounwind readnone { + %and_mask = and %0, + + %v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %and_mask) + %v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4) + %va = extractelement <2 x i64> %v2, i32 0 + %vb = extractelement <2 x i64> %v2, i32 1 + %v = or i64 %va, %vb + ret i64 %v +} + +define i1 @__any() nounwind readnone alwaysinline { + v8tov4(MASK, %0, %v0123, %v4567) + %vor = or <4 x MASK> %v0123, %v4567 + %v0 = extractelement <4 x MASK> %vor, i32 0 + %v1 = extractelement <4 x MASK> %vor, i32 1 + %v2 = extractelement <4 x MASK> %vor, i32 2 + %v3 = extractelement <4 x MASK> %vor, i32 3 + %v01 = or MASK %v0, %v1 + %v23 = or MASK %v2, %v3 + %v = or MASK %v01, %v23 + %cmp = icmp ne MASK %v, 0 + ret i1 %cmp +} + +define i1 @__all() nounwind readnone alwaysinline { + v8tov4(MASK, %0, %v0123, %v4567) + %vand = and <4 x MASK> %v0123, %v4567 + %v0 = extractelement <4 x MASK> %vand, i32 0 + %v1 = extractelement <4 x MASK> %vand, i32 1 + %v2 = extractelement <4 x MASK> %vand, i32 2 + %v3 = extractelement <4 x MASK> %vand, i32 3 + %v01 = and MASK %v0, %v1 + %v23 = and MASK %v2, %v3 + %v = and MASK %v01, %v23 + %cmp = icmp ne MASK %v, 0 + ret i1 %cmp +} + +define i1 @__none() nounwind readnone alwaysinline { + %any = call i1 @__any( %0) + %none = icmp eq i1 %any, 0 + ret i1 %none +} + +;; $1: scalar type +;; $2: vector/vector reduce function (2 x -> ) +;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>) +;; $4: scalar reduce function + +define(`neon_reduce', ` + v8tov4($1, %0, %v0123, %v4567) + %v0123_8 = shufflevector <4 x $1> %v0123, <4 x $1> undef, + <8 x i32> + %v4567_8 = shufflevector <4 x $1> %v4567, <4 x $1> undef, + <8 x i32> + %vfirst = call <8 x $1> $2(<8 x $1> %v0123_8, <8 x $1> %v4567_8) + %vfirst_4 = shufflevector <8 x $1> %vfirst, <8 x $1> undef, + <4 x i32> + v4tov2($1, %vfirst_4, %v0, %v1) + %vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1) + %vh0 = extractelement <2 x $1> %vh, i32 0 + %vh1 = extractelement <2 x $1> %vh, i32 1 + %r = call $1 $4($1 %vh0, $1 %vh1) + ret $1 %r +') + +declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @add_f32(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define internal @__add_varying_float(, ) { + %r = fadd %0, %1 + ret %r +} + +define float @__reduce_add_float() nounwind readnone { + neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @min_f32(float, float) { + %cmp = fcmp olt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_min_float() nounwind readnone { + neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @max_f32(float, float) { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_max_float() nounwind 
readnone { + neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) +} + +declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone + +define i16 @__reduce_add_int8() nounwind readnone { + %a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %0) + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 + %r16 = trunc i32 %r to i16 + ret i16 %r16 +} + +declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16() + +define i64 @__reduce_add_int16() nounwind readnone { + %a1 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16( %0) + %a2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a1) + %aa = extractelement <2 x i64> %a2, i32 0 + %ab = extractelement <2 x i64> %a2, i32 1 + %r = add i64 %aa, %ab + ret i64 %r +} + +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone + +define i64 @__reduce_add_int32() nounwind readnone { + v8tov4(i32, %0, %va, %vb) + %pa = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va) + %pb = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb) + %psum = add <2 x i64> %pa, %pb + %a0 = extractelement <2 x i64> %psum, i32 0 + %a1 = extractelement <2 x i64> %psum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_si32(i32, i32) { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_int32() nounwind readnone { + neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_si32(i32, i32) { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_int32() nounwind readnone { + neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_ui32(i32, i32) { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_uint32() nounwind readnone { + neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpmins.v2i32, @min_ui32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_ui32(i32, i32) { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_uint32() nounwind readnone { + neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxs.v2i32, @max_ui32) +} + +define double @__reduce_add_double() nounwind readnone { + v8tov2(double, %0, %v0, %v1, %v2, %v3) + %v01 = fadd <2 x double> %v0, %v1 + %v23 = fadd <2 x double> %v2, %v3 + %sum = fadd <2 x double> %v01, %v23 + %e0 = extractelement <2 x double> %sum, i32 0 + %e1 = extractelement <2 x double> %sum, i32 1 + %m = fadd double %e0, %e1 + ret double %m +} + +define double @__reduce_min_double() nounwind readnone { + reduce8(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double() nounwind readnone { + reduce8(double, @__max_varying_double, @__max_uniform_double) +} + +define i64 
@__reduce_add_int64() nounwind readnone { + v8tov2(i64, %0, %v0, %v1, %v2, %v3) + %v01 = add <2 x i64> %v0, %v1 + %v23 = add <2 x i64> %v2, %v3 + %sum = add <2 x i64> %v01, %v23 + %e0 = extractelement <2 x i64> %sum, i32 0 + %e1 = extractelement <2 x i64> %sum, i32 1 + %m = add i64 %e0, %e1 + ret i64 %m +} + +define i64 @__reduce_min_int64() nounwind readnone { + reduce8(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64() nounwind readnone { + reduce8(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64() nounwind readnone { + reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64() nounwind readnone { + reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 + +declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_int8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_down_uint8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone + +define <8 x i8> @__avg_down_int8(<8 x i8>, <8 x i8>) nounwind readnone { + %r = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %0, <8 x i8> %1) + ret <8 x i8> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_int16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_down_uint16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_down_int16(<8 x i16>, <8 x i16>) nounwind readnone { + %r = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} diff --git a/builtins/target-neon.ll b/builtins/target-neon-32.ll similarity index 60% rename from builtins/target-neon.ll rename to builtins/target-neon-32.ll index e70b774b..30b062c9 100644 --- a/builtins/target-neon.ll +++ b/builtins/target-neon-32.ll @@ -1,5 +1,5 @@ ;; -;; target-neon.ll +;; target-neon-32.ll ;; ;; Copyright(c) 2012-2013 Matt Pharr ;; Copyright(c) 2013 Google, Inc. @@ -34,52 +34,20 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
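The `__avg_up_*` / `__avg_down_*` builtins added above map directly onto NEON's rounding-halving-add (`vrhadd`) and halving-add (`vhadd`) instructions. A reference sketch of their scalar semantics, assuming the usual definitions `avg_up = (a + b + 1) >> 1` and `avg_down = (a + b) >> 1` computed in a wider intermediate so the sum cannot overflow:

```cpp
#include <cstdint>

// Reference semantics for the vrhadd/vhadd-based averages above (sketch only).
static uint8_t avg_up_uint8(uint8_t a, uint8_t b)   { return (uint8_t)(((uint16_t)a + b + 1) >> 1); } // vrhaddu
static uint8_t avg_down_uint8(uint8_t a, uint8_t b) { return (uint8_t)(((uint16_t)a + b) >> 1); }     // vhaddu
static int16_t avg_up_int16(int16_t a, int16_t b)   { return (int16_t)(((int32_t)a + b + 1) >> 1); }  // vrhadds
static int16_t avg_down_int16(int16_t a, int16_t b) { return (int16_t)(((int32_t)a + b) >> 1); }      // vhadds
```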
-target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32" - define(`WIDTH',`4') - define(`MASK',`i32') include(`util.m4') - -stdlib_core() -scans() -reduce_equal(WIDTH) -rdrand_decls() -define_shuffles() -aossoa() -ctlztz() +include(`target-neon-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines -declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone -declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone - -define float @__half_to_float_uniform(i16 %v) nounwind readnone { - %v1 = bitcast i16 %v to <1 x i16> - %vec = shufflevector <1 x i16> %v1, <1 x i16> undef, - <4 x i32> - %h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec) - %r = extractelement <4 x float> %h, i32 0 - ret float %r -} - define <4 x float> @__half_to_float_varying(<4 x i16> %v) nounwind readnone { %r = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %v) ret <4 x float> %r } -define i16 @__float_to_half_uniform(float %v) nounwind readnone { - %v1 = bitcast float %v to <1 x float> - %vec = shufflevector <1 x float> %v1, <1 x float> undef, - <4 x i32> - %h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec) - %r = extractelement <4 x i16> %h, i32 0 - ret i16 %r -} - - define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone { %r = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %v) ret <4 x i16> %r @@ -88,48 +56,11 @@ define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; math -define void @__fastmath() nounwind { - ret void -} - ;; round/floor/ceil ;; FIXME: grabbed these from the sse2 target, which does not have native ;; instructions for these. Is there a better approach for NEON? 
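The `__round_varying_float` bodies carried over from the SSE2 target (here and in the new NEON files) use the classic bit trick: strip the sign, add and subtract 2^23 (8.388608e+06) so the mantissa snaps to an integer, then restore the sign; `__floor_varying_float` and `__ceil_varying_float` then nudge the result by -1.0 or +1.0 when the rounding overshot. A scalar reference of the rounding step, as a sketch that assumes round-to-nearest-even FP mode and |x| < 2^23 (it is not part of the patch):

```cpp
#include <cstdint>
#include <cstring>

// Scalar reference for the bit-trick __round_varying_float above.
static float round_reference(float x) {
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof bits);
    uint32_t sign = bits & 0x80000000u;          // isolate the sign bit
    uint32_t abits = bits ^ sign;                // bits of |x|
    float ax;
    std::memcpy(&ax, &abits, sizeof ax);
    float r = (ax + 8388608.0f) - 8388608.0f;    // +/- 2^23 snaps |x| to an integer
    uint32_t rbits;
    std::memcpy(&rbits, &r, sizeof rbits);
    rbits ^= sign;                               // re-apply the original sign
    std::memcpy(&r, &rbits, sizeof r);
    return r;
}
```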
-define float @__round_uniform_float(float) nounwind readonly alwaysinline { - %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32 - %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648 - %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i - %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06 - %binop21.i = fadd float %binop.i, -8.388608e+06 - %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32 - %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float - ret float %int_to_float_bitcast.i.i.i -} - -define float @__floor_uniform_float(float) nounwind readonly alwaysinline { - %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind - %bincmp.i = fcmp ogt float %calltmp.i, %0 - %selectexpr.i = sext i1 %bincmp.i to i32 - %bitop.i = and i32 %selectexpr.i, -1082130432 - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i - ret float %binop.i -} - -define float @__ceil_uniform_float(float) nounwind readonly alwaysinline { - %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind - %bincmp.i = fcmp olt float %calltmp.i, %0 - %selectexpr.i = sext i1 %bincmp.i to i32 - %bitop.i = and i32 %selectexpr.i, 1065353216 - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i - ret float %binop.i -} - define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline { %float_to_int_bitcast.i.i.i.i = bitcast <4 x float> %0 to <4 x i32> %bitop.i.i = and <4 x i32> %float_to_int_bitcast.i.i.i.i, @@ -164,10 +95,6 @@ define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysin } ;; FIXME: rounding doubles and double vectors needs to be implemented -declare double @__round_uniform_double(double) nounwind readnone -declare double @__floor_uniform_double(double) nounwind readnone -declare double @__ceil_uniform_double(double) nounwind readnone - declare @__round_varying_double() nounwind readnone declare @__floor_varying_double() nounwind readnone declare @__ceil_varying_double() nounwind readnone @@ -175,78 +102,6 @@ declare @__ceil_varying_double() nounwind readn ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; min/max -define float @__max_uniform_float(float, float) nounwind readnone { - %cmp = fcmp ugt float %0, %1 - %r = select i1 %cmp, float %0, float %1 - ret float %r -} - -define float @__min_uniform_float(float, float) nounwind readnone { - %cmp = fcmp ult float %0, %1 - %r = select i1 %cmp, float %0, float %1 - ret float %r -} - -define i32 @__min_uniform_int32(i32, i32) nounwind readnone { - %cmp = icmp slt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__max_uniform_int32(i32, i32) nounwind readnone { - %cmp = icmp sgt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__min_uniform_uint32(i32, i32) nounwind readnone { - %cmp = icmp ult i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__max_uniform_uint32(i32, i32) nounwind readnone { - %cmp = icmp ugt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i64 @__min_uniform_int64(i64, i64) nounwind readnone { - %cmp = icmp slt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__max_uniform_int64(i64, 
i64) nounwind readnone { - %cmp = icmp sgt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__min_uniform_uint64(i64, i64) nounwind readnone { - %cmp = icmp ult i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__max_uniform_uint64(i64, i64) nounwind readnone { - %cmp = icmp ugt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define double @__min_uniform_double(double, double) nounwind readnone { - %cmp = fcmp olt double %0, %1 - %r = select i1 %cmp, double %0, double %1 - ret double %r -} - -define double @__max_uniform_double(double, double) nounwind readnone { - %cmp = fcmp ogt double %0, %1 - %r = select i1 %cmp, double %0, double %1 - ret double %r -} - declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone @@ -287,44 +142,6 @@ define @__max_varying_uint32(, ) nounwin ret <4 x i32> %r } -define @__min_varying_int64(, ) nounwind readnone { - %m = icmp slt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_int64(, ) nounwind readnone { - %m = icmp sgt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__min_varying_uint64(, ) nounwind readnone { - %m = icmp ult %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_uint64(, ) nounwind readnone { - %m = icmp ugt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__min_varying_double(, - ) nounwind readnone { - %m = fcmp olt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_double(, - ) nounwind readnone { - %m = fcmp ogt %0, %1 - %r = select %m, %0, %1 - ret %r -} - ;; sqrt/rsqrt/rcp declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone @@ -371,13 +188,6 @@ define float @__rcp_uniform_float(float) nounwind readnone { ret float %r } -declare float @llvm.sqrt.f32(float) - -define float @__sqrt_uniform_float(float) nounwind readnone { - %r = call float @llvm.sqrt.f32(float %0) - ret float %r -} - declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) define @__sqrt_varying_float() nounwind readnone { @@ -388,13 +198,6 @@ define @__sqrt_varying_float() nounwind readnone ret <4 x float> %result } -declare double @llvm.sqrt.f64(double) - -define double @__sqrt_uniform_double(double) nounwind readnone { - %r = call double @llvm.sqrt.f64(double %0) - ret double %r -} - declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) define @__sqrt_varying_double() nounwind readnone { @@ -402,21 +205,6 @@ define @__sqrt_varying_double() nounwind readno ret <4 x double> %r } -;; bit ops - -declare i32 @llvm.ctpop.i32(i32) nounwind readnone -declare i64 @llvm.ctpop.i64(i64) nounwind readnone - -define i32 @__popcnt_int32(i32) nounwind readnone { - %v = call i32 @llvm.ctpop.i32(i32 %0) - ret i32 %v -} - -define i64 @__popcnt_int64(i64) nounwind readnone { - %v = call i64 @llvm.ctpop.i64(i64 %0) - ret i64 %v -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions @@ -509,15 +297,38 @@ define float @__reduce_max_float(<4 x float>) nounwind readnone { neon_reduce(float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) } -define internal i32 @add_i32(i32, i32) { - %r = add i32 %0, %1 +declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone + +define i16 @__reduce_add_int8() nounwind readnone { + %v8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <8 x i32> + %a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> 
%v8) + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 + %r16 = trunc i32 %r to i16 + ret i16 %r16 +} + +declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone + +define i32 @__reduce_add_int16() nounwind readnone { + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %0) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 ret i32 %r } -declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone -define i32 @__reduce_add_int32() nounwind readnone { - neon_reduce(i32, @llvm.arm.neon.vpadd.v2i32, @add_i32) +define i64 @__reduce_add_int32() nounwind readnone { + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %0) + %a0 = extractelement <2 x i64> %a64, i32 0 + %a1 = extractelement <2 x i64> %a64, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r } declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone @@ -617,90 +428,60 @@ define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone { } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; unaligned loads/loads+broadcasts +;; int8/int16 -masked_load(i8, 1) -masked_load(i16, 2) -masked_load(i32, 4) -masked_load(float, 4) -masked_load(i64, 8) -masked_load(double, 8) +declare <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone -gen_masked_store(i8) -gen_masked_store(i16) -gen_masked_store(i32) -gen_masked_store(i64) -masked_store_float_double() - -define void @__masked_store_blend_i8(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i8> %new, <4 x i8> %old - store %result, * %ptr - ret void +define <4 x i8> @__avg_up_uint8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vrhaddu.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r } -define void @__masked_store_blend_i16(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i16> %new, <4 x i16> %old - store %result, * %ptr - ret void +declare <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8>, <4 x i8>) nounwind readnone + +define <4 x i8> @__avg_up_int8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vrhadds.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r } -define void @__masked_store_blend_i32(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i32> %new, <4 x i32> %old - store %result, * %ptr - ret void +declare <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8>, <4 x i8>) nounwind readnone + +define <4 x i8> @__avg_down_uint8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vhaddu.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r } -define void @__masked_store_blend_i64(* nocapture %ptr, - %new, %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i64> %new, <4 x i64> %old - store %result, * %ptr - ret void +declare <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8>, <4 x i8>) 
nounwind readnone + +define <4 x i8> @__avg_down_int8(<4 x i8>, <4 x i8>) nounwind readnone { + %r = call <4 x i8> @llvm.arm.neon.vhadds.v4i8(<4 x i8> %0, <4 x i8> %1) + ret <4 x i8> %r } -;; yuck. We need declarations of these, even though we shouldnt ever -;; actually generate calls to them for the NEON target... +declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) +define <4 x i16> @__avg_up_uint16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; gather +declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double) +define <4 x i16> @__avg_up_int16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} -gen_scatter(i8) -gen_scatter(i16) -gen_scatter(i32) -gen_scatter(float) -gen_scatter(i64) -gen_scatter(double) +declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -packed_load_and_store(4) +define <4 x i16> @__avg_down_uint16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; prefetch +declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -define_prefetches() +define <4 x i16> @__avg_down_int16(<4 x i16>, <4 x i16>) nounwind readnone { + %r = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %0, <4 x i16> %1) + ret <4 x i16> %r +} diff --git a/builtins/target-neon-8.ll b/builtins/target-neon-8.ll new file mode 100644 index 00000000..2accfe53 --- /dev/null +++ b/builtins/target-neon-8.ll @@ -0,0 +1,583 @@ +;; +;; target-neon-8.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`16') +define(`MASK',`i8') + +include(`util.m4') +include(`target-neon-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone { + unary4to16conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v) + ret <16 x float> %r +} + +define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone { + unary4to16conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v) + ret <16 x i16> %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast <16 x float> %0 to <16 x i32> + %bitop.i.i = and <16 x i32> %float_to_int_bitcast.i.i.i.i, + + %bitop.i = xor <16 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %int_to_float_bitcast.i.i40.i, + + %binop21.i = fadd <16 x float> %binop.i, + + %float_to_int_bitcast.i.i.i = bitcast <16 x float> %binop21.i to <16 x i32> + %bitop31.i = xor <16 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop31.i to <16 x float> + ret <16 x float> %int_to_float_bitcast.i.i.i +} + +define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind + %bincmp.i = fcmp ogt <16 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32> + %bitop.i = and <16 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <16 x float> %binop.i +} + +define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind + %bincmp.i = fcmp olt <16 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32> + %bitop.i = and <16 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <16 x float> %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare @__round_varying_double() nounwind readnone +declare @__floor_varying_double() nounwind readnone +declare @__ceil_varying_double() nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind 
readnone +declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__max_varying_float(, + ) nounwind readnone { + binary4to16(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1) + ret %r +} + +define @__min_varying_float(, + ) nounwind readnone { + binary4to16(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1) + ret %r +} + +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define @__min_varying_int32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1) + ret %r +} + +define @__max_varying_int32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1) + ret %r +} + +define @__min_varying_uint32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1) + ret %r +} + +define @__max_varying_uint32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1) + ret %r +} + +;; sqrt/rsqrt/rcp + +declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rcp_varying_float( %d) nounwind readnone { + unary4to16(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d) + binary4to16(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0) + %x1 = fmul %x0, %x0_nr + binary4to16(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %d) nounwind readnone { + unary4to16(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d) + %x0_2 = fmul %x0, %x0 + binary4to16(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2) + %x1 = fmul %x0, %x0_nr + %x1_2 = fmul %x1, %x1 + binary4to16(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +define float @__rsqrt_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <16 x i32> + %vr = call <16 x float> @__rsqrt_varying_float(<16 x float> %vs) + %r = extractelement <16 x float> %vr, i32 0 + ret float %r +} + +define float @__rcp_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <16 x i32> + %vr = call <16 x float> @__rcp_varying_float(<16 x float> %vs) + %r = extractelement <16 x float> %vr, i32 0 + ret float %r +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define @__sqrt_varying_float() nounwind readnone { + unary4to16(result, float, @llvm.sqrt.v4f32, %0) +;; this returns nan for v=0, which is undesirable.. 
+;; %rsqrt = call @__rsqrt_varying_float( %0) +;; %result = fmul <4 x float> %rsqrt, %0 + ret <16 x float> %result +} + +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) + +define @__sqrt_varying_double() nounwind readnone { + unary4to16(r, double, @llvm.sqrt.v4f64, %0) + ret %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reductions + +define i64 @__movmsk() nounwind readnone { + %and_mask = and %0, + + %v8 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %and_mask) + %v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %v8) + %v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4) + %va = extractelement <2 x i64> %v2, i32 0 + %vb = extractelement <2 x i64> %v2, i32 1 + %vbshift = shl i64 %vb, 8 + %v = or i64 %va, %vbshift + ret i64 %v +} + +define i1 @__any() nounwind readnone alwaysinline { + v16tov8(MASK, %0, %v8a, %v8b) + %vor8 = or <8 x MASK> %v8a, %v8b + %v16 = sext <8 x i8> %vor8 to <8 x i16> + v8tov4(i16, %v16, %v16a, %v16b) + %vor16 = or <4 x i16> %v16a, %v16b + %v32 = sext <4 x i16> %vor16 to <4 x i32> + v4tov2(i32, %v32, %v32a, %v32b) + %vor32 = or <2 x i32> %v32a, %v32b + %v0 = extractelement <2 x i32> %vor32, i32 0 + %v1 = extractelement <2 x i32> %vor32, i32 1 + %v = or i32 %v0, %v1 + %cmp = icmp ne i32 %v, 0 + ret i1 %cmp +} + +define i1 @__all() nounwind readnone alwaysinline { + v16tov8(MASK, %0, %v8a, %v8b) + %vand8 = and <8 x MASK> %v8a, %v8b + %v16 = sext <8 x i8> %vand8 to <8 x i16> + v8tov4(i16, %v16, %v16a, %v16b) + %vand16 = and <4 x i16> %v16a, %v16b + %v32 = sext <4 x i16> %vand16 to <4 x i32> + v4tov2(i32, %v32, %v32a, %v32b) + %vand32 = and <2 x i32> %v32a, %v32b + %v0 = extractelement <2 x i32> %vand32, i32 0 + %v1 = extractelement <2 x i32> %vand32, i32 1 + %v = and i32 %v0, %v1 + %cmp = icmp ne i32 %v, 0 + ret i1 %cmp +} + +define i1 @__none() nounwind readnone alwaysinline { + %any = call i1 @__any( %0) + %none = icmp eq i1 %any, 0 + ret i1 %none +} + +;; $1: scalar type +;; $2: vector/vector reduce function (2 x -> ) +;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>) +;; $4: scalar reduce function + +define(`neon_reduce', ` + v16tov8($1, %0, %va, %vb) + %va_16 = shufflevector <8 x $1> %va, <8 x $1> undef, + <16 x i32> + %vb_16 = shufflevector <8 x $1> %vb, <8 x $1> undef, + <16 x i32> + %v8 = call <16 x $1> $2(<16 x $1> %va_16, <16 x $1> %vb_16) + + %v8a = shufflevector <16 x $1> %v8, <16 x $1> undef, + <16 x i32> + %v8b = shufflevector <16 x $1> %v8, <16 x $1> undef, + <16 x i32> + + %v4 = call <16 x $1> $2(<16 x $1> %v8a, <16 x $1> %v8b) + + %vfirst_4 = shufflevector <16 x $1> %v4, <16 x $1> undef, + <4 x i32> + v4tov2($1, %vfirst_4, %v0, %v1) + %vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1) + %vh0 = extractelement <2 x $1> %vh, i32 0 + %vh1 = extractelement <2 x $1> %vh, i32 1 + %r = call $1 $4($1 %vh0, $1 %vh1) + ret $1 %r +') + +declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @add_f32(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define internal @__add_varying_float(, ) { + %r = fadd %0, %1 + ret %r +} + +define float @__reduce_add_float() nounwind readnone { + neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @min_f32(float, float) { + %cmp = fcmp olt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret 
float %r +} + +define float @__reduce_min_float() nounwind readnone { + neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @max_f32(float, float) { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_max_float() nounwind readnone { + neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) +} + +declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone + +define i64 @__reduce_add_int8() nounwind readnone { + %a16 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %0) + %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a16) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32) + %a0 = extractelement <2 x i64> %a64, i32 0 + %a1 = extractelement <2 x i64> %a64, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +define i64 @__reduce_add_int16() nounwind readnone { + v16tov8(i16, %0, %va, %vb) + %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %va) + %b32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %vb) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32) + %b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %b32) + %sum = add <2 x i64> %a64, %b64 + %a0 = extractelement <2 x i64> %sum, i32 0 + %a1 = extractelement <2 x i64> %sum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +define i64 @__reduce_add_int32() nounwind readnone { + v16tov4(i32, %0, %va, %vb, %vc, %vd) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va) + %b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb) + %c64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vc) + %d64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vd) + %ab = add <2 x i64> %a64, %b64 + %cd = add <2 x i64> %c64, %d64 + %sum = add <2 x i64> %ab, %cd + %a0 = extractelement <2 x i64> %sum, i32 0 + %a1 = extractelement <2 x i64> %sum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_si32(i32, i32) { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_int32() nounwind readnone { + neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_si32(i32, i32) { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_int32() nounwind readnone { + neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_ui32(i32, i32) { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_uint32() nounwind readnone { + neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpmins.v2i32, @min_ui32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 
@max_ui32(i32, i32) { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_uint32() nounwind readnone { + neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxs.v2i32, @max_ui32) +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define internal @__add_varying_double(, ) { + %r = fadd %0, %1 + ret %r +} + +define double @__reduce_add_double() nounwind readnone { + reduce16(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double() nounwind readnone { + reduce16(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double() nounwind readnone { + reduce16(double, @__max_varying_double, @__max_uniform_double) +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define internal @__add_varying_int64(, ) { + %r = add %0, %1 + ret %r +} + +define i64 @__reduce_add_int64() nounwind readnone { + reduce16(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64() nounwind readnone { + reduce16(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64() nounwind readnone { + reduce16(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64() nounwind readnone { + reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64() nounwind readnone { + reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_up_int8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_down_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_down_int8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_int16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> 
@llvm.arm.neon.vrhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_down_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_down_int16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %a0, <8 x i16> %a1) + %r1 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll new file mode 100644 index 00000000..f892a0a1 --- /dev/null +++ b/builtins/target-neon-common.ll @@ -0,0 +1,338 @@ +;; +;; target-neon-common.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
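;; For reference: the v16tov8()/v8tov16() helpers used by the 16-wide
;; __avg_up_*/__avg_down_* definitions above (target-neon-8.ll) are util.m4
;; shufflevector wrappers. A hand-written sketch of the split/call/join
;; pattern they produce -- hypothetical function name, not an exact macro
;; expansion:

declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i16> @avg_up_uint16_sketch(<16 x i16>, <16 x i16>) nounwind readnone {
  ;; split each 16-wide input into two 8-wide halves (what v16tov8 does)
  %a0 = shufflevector <16 x i16> %0, <16 x i16> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %b0 = shufflevector <16 x i16> %0, <16 x i16> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %a1 = shufflevector <16 x i16> %1, <16 x i16> undef,
          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %b1 = shufflevector <16 x i16> %1, <16 x i16> undef,
          <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ;; one native 128-bit NEON op per half
  %r0 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  %r1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %b0, <8 x i16> %b1)
  ;; concatenate the halves back to the 16-wide program width (what v8tov16 does)
  %r = shufflevector <8 x i16> %r0, <8 x i16> %r1,
         <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
                     i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %r
}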
+ +target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32" + +stdlib_core() +scans() +reduce_equal(WIDTH) +rdrand_decls() +define_shuffles() +aossoa() +ctlztz() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone + +define float @__half_to_float_uniform(i16 %v) nounwind readnone { + %v1 = bitcast i16 %v to <1 x i16> + %vec = shufflevector <1 x i16> %v1, <1 x i16> undef, + <4 x i32> + %h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec) + %r = extractelement <4 x float> %h, i32 0 + ret float %r +} + +define i16 @__float_to_half_uniform(float %v) nounwind readnone { + %v1 = bitcast float %v to <1 x float> + %vec = shufflevector <1 x float> %v1, <1 x float> undef, + <4 x i32> + %h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec) + %r = extractelement <4 x i16> %h, i32 0 + ret i16 %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +define void @__fastmath() nounwind { + ret void +} + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define float @__round_uniform_float(float) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32 + %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648 + %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i + %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06 + %binop21.i = fadd float %binop.i, -8.388608e+06 + %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32 + %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float + ret float %int_to_float_bitcast.i.i.i +} + +define float @__floor_uniform_float(float) nounwind readonly alwaysinline { + %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind + %bincmp.i = fcmp ogt float %calltmp.i, %0 + %selectexpr.i = sext i1 %bincmp.i to i32 + %bitop.i = and i32 %selectexpr.i, -1082130432 + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i + ret float %binop.i +} + +define float @__ceil_uniform_float(float) nounwind readonly alwaysinline { + %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind + %bincmp.i = fcmp olt float %calltmp.i, %0 + %selectexpr.i = sext i1 %bincmp.i to i32 + %bitop.i = and i32 %selectexpr.i, 1065353216 + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i + ret float %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare double @__round_uniform_double(double) nounwind readnone +declare double @__floor_uniform_double(double) nounwind readnone +declare double @__ceil_uniform_double(double) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +define float @__max_uniform_float(float, float) nounwind readnone { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__min_uniform_float(float, 
float) nounwind readnone { + %cmp = fcmp ult float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define i32 @__min_uniform_int32(i32, i32) nounwind readnone { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__max_uniform_int32(i32, i32) nounwind readnone { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__min_uniform_uint32(i32, i32) nounwind readnone { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__max_uniform_uint32(i32, i32) nounwind readnone { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i64 @__min_uniform_int64(i64, i64) nounwind readnone { + %cmp = icmp slt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__max_uniform_int64(i64, i64) nounwind readnone { + %cmp = icmp sgt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__min_uniform_uint64(i64, i64) nounwind readnone { + %cmp = icmp ult i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__max_uniform_uint64(i64, i64) nounwind readnone { + %cmp = icmp ugt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define double @__min_uniform_double(double, double) nounwind readnone { + %cmp = fcmp olt double %0, %1 + %r = select i1 %cmp, double %0, double %1 + ret double %r +} + +define double @__max_uniform_double(double, double) nounwind readnone { + %cmp = fcmp ogt double %0, %1 + %r = select i1 %cmp, double %0, double %1 + ret double %r +} + +define @__min_varying_int64(, ) nounwind readnone { + %m = icmp slt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_int64(, ) nounwind readnone { + %m = icmp sgt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__min_varying_uint64(, ) nounwind readnone { + %m = icmp ult %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_uint64(, ) nounwind readnone { + %m = icmp ugt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__min_varying_double(, + ) nounwind readnone { + %m = fcmp olt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_double(, + ) nounwind readnone { + %m = fcmp ogt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +;; sqrt/rsqrt/rcp + +declare float @llvm.sqrt.f32(float) + +define float @__sqrt_uniform_float(float) nounwind readnone { + %r = call float @llvm.sqrt.f32(float %0) + ret float %r +} + +declare double @llvm.sqrt.f64(double) + +define double @__sqrt_uniform_double(double) nounwind readnone { + %r = call double @llvm.sqrt.f64(double %0) + ret double %r +} + +;; bit ops + +declare i32 @llvm.ctpop.i32(i32) nounwind readnone +declare i64 @llvm.ctpop.i64(i64) nounwind readnone + +define i32 @__popcnt_int32(i32) nounwind readnone { + %v = call i32 @llvm.ctpop.i32(i32 %0) + ret i32 %v +} + +define i64 @__popcnt_int64(i64) nounwind readnone { + %v = call i64 @llvm.ctpop.i64(i64 %0) + ret i64 %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) +masked_store_float_double() + +define void @__masked_store_blend_i8(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc 
%mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i16(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i32(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i64(* nocapture %ptr, + %new, %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + +packed_load_and_store(4) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefetch + +define_prefetches() diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index c6a3afe2..ad1d88bc 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -269,4 +269,8 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline { ret i64 %val } +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 73361720..057ea98f 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -102,92 +102,6 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_sinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_cosf4, %0) - ret <8 x float> %ret -} - -define void @__svml_sincos(<8 x float>, <8 x float> *, - <8 x float> *) nounwind readnone alwaysinline { - ; call svml_sincosf4 two times with the two 4-wide sub-vectors - %a = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - %b = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - - %cospa = alloca <4 x float> - %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) - - %cospb = alloca <4 x float> - %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) - - %sin = shufflevector <4 x float> 
%sa, <4 x float> %sb, - <8 x i32> - store <8 x float> %sin, <8 x float> * %1 - - %cosa = load <4 x float> * %cospa - %cosb = load <4 x float> * %cospb - %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, - <8 x i32> - store <8 x float> %cos, <8 x float> * %2 - - ret void -} - -define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_tanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_atanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan2(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_atan2f4, %0, %1) - ret <8 x float> %ret -} - -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_expf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_logf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_pow(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_powf4, %0, %1) - ret <8 x float> %ret -} - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max @@ -367,6 +281,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + define <4 x float> @__vec4_add_float(<4 x float> %v0, <4 x float> %v1) nounwind readnone alwaysinline { %v = fadd <4 x float> %v0, %v1 diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index 2bb06391..e0a5c3d5 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -267,6 +267,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <4 x i16> @__add_varying_i16(<4 x i16>, + <4 x i16>) nounwind readnone alwaysinline { + %r = add <4 x i16> %0, %1 + ret <4 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline { + 
reduce4(i16, @__add_varying_i16, @__add_uniform_i16) +} + define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline { %v1 = shufflevector <4 x float> %v, <4 x float> undef, <4 x i32> @@ -463,66 +493,6 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin ret <4 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_expf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_logf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll new file mode 100644 index 00000000..b4772552 --- /dev/null +++ b/builtins/target-sse4-16.ll @@ -0,0 +1,482 @@ +;; Copyright (c) 2013, Google, Inc. +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
+;; +;; * Neither the name of Google, Inc. nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Define common 4-wide stuff +define(`WIDTH',`8') +define(`MASK',`i16') +include(`util.m4') + +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare float @__half_to_float_uniform(i16 %v) nounwind readnone +declare @__half_to_float_varying( %v) nounwind readnone +declare i16 @__float_to_half_uniform(float %v) nounwind readnone +declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define @__rcp_varying_float() nounwind readonly alwaysinline { + unary4to8(call, float, @llvm.x86.sse.rcp.ps, %0) + ; do one N-R iteration to improve precision + ; float iv = __rcp_v(v); + ; return iv * (2. - v * iv); + %v_iv = fmul <8 x float> %0, %call + %two_minus = fsub <8 x float> , %v_iv + %iv_mul = fmul <8 x float> %call, %two_minus + ret <8 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; rsqrt + +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + unary4to8(is, float, @llvm.x86.sse.rsqrt.ps, %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. 
- (v * is) * is); + %v_is = fmul <8 x float> %v, %is + %v_is_is = fmul <8 x float> %v_is, %is + %three_sub = fsub <8 x float> , %v_is_is + %is_mul = fmul <8 x float> %is, %three_sub + %half_scale = fmul <8 x float> , %is_mul + ret <8 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; sqrt + +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysinline { + unary4to8(call, float, @llvm.x86.sse.sqrt.ps, %0) + ret <8 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +define <8 x double> @__sqrt_varying_double(<8 x double>) nounwind +alwaysinline { + unary2to8(ret, double, @llvm.x86.sse2.sqrt.pd, %0) + ret <8 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding floats + +declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone + +define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 + round4to8(%0, 8) +} + +define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + round4to8(%0, 9) +} + +define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline { + ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + round4to8(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone + +define <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 8) +} + +define <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 9) +} + +define <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline { + round2to8double(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define <8 x float> @__max_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline { + binary4to8(call, float, @llvm.x86.sse.max.ps, %0, %1) + ret <8 x float> %call +} + +define <8 x float> @__min_varying_float(<8 x float>, <8 x float>) nounwind readonly alwaysinline { + binary4to8(call, float, @llvm.x86.sse.min.ps, %0, %1) + ret <8 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 min/max + +define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <8 x i32> %call +} + +define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <8 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unsigned int min/max + +define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + 
binary4to8(call, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <8 x i32> %call +} + +define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(call, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <8 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define <8 x double> @__min_varying_double(<8 x double>, <8 x double>) nounwind readnone { + binary2to8(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ret <8 x double> %ret +} + +define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind readnone { + binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ret <8 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops / reductions + +declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone + +define i64 @__movmsk(<8 x MASK>) nounwind readnone alwaysinline { + %m8 = trunc <8 x MASK> %0 to <8 x i8> + %mask8 = shufflevector <8 x i8> %m8, <8 x i8> zeroinitializer, + <16 x i32> + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %mask8) + %m64 = zext i32 %m to i64 + ret i64 %m64 +} + +define i1 @__any(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %mne = icmp ne i64 %m, 0 + ret i1 %mne +} + +define i1 @__all(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %meq = icmp eq i64 %m, ALL_ON_MASK + ret i1 %meq +} + +define i1 @__none(<8 x MASK>) nounwind readnone alwaysinline { + %m = call i64 @__movmsk(<8 x MASK> %0) + %meq = icmp eq i64 %m, 0 + ret i1 %meq +} + +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + +define internal <8 x float> @__add_varying_float(<8 x float>, <8 x float>) { + %r = fadd <8 x float> %0, %1 + ret <8 x float> %r +} + +define internal float @__add_uniform_float(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline { + reduce8(float, @__add_varying_float, @__add_uniform_float) +} + +define float @__reduce_min_float(<8 x float>) nounwind readnone { + reduce8(float, @__min_varying_float, @__min_uniform_float) +} + +define float @__reduce_max_float(<8 x float>) nounwind readnone { + reduce8(float, @__max_varying_float, @__max_uniform_float) +} + +define internal <8 x i32> @__add_varying_int32(<8 x i32>, <8 x i32>) { + %r = add <8 x i32> %0, %1 + ret <8 x i32> %r +} + +define internal i32 
@__add_uniform_int32(i32, i32) { + %r = add i32 %0, %1 + ret i32 %r +} + +define i32 @__reduce_add_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__add_varying_int32, @__add_uniform_int32) +} + +define i32 @__reduce_min_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__min_varying_int32, @__min_uniform_int32) +} + +define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone { + reduce8(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone { + reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone { + reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32) +} + +define internal <8 x double> @__add_varying_double(<8 x double>, <8 x double>) { + %r = fadd <8 x double> %0, %1 + ret <8 x double> %r +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define double @__reduce_add_double(<8 x double>) nounwind readnone { + reduce8(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double(<8 x double>) nounwind readnone { + reduce8(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double(<8 x double>) nounwind readnone { + reduce8(double, @__max_varying_double, @__max_uniform_double) +} + +define internal <8 x i64> @__add_varying_int64(<8 x i64>, <8 x i64>) { + %r = add <8 x i64> %0, %1 + ret <8 x i64> %r +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define i64 @__reduce_add_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone { + reduce8(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone { + reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone { + reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +reduce_equal(8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +define void @__masked_store_blend_i64(<8 x i64>* nocapture, <8 x i64>, + <8 x MASK> %mask) nounwind + alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i64>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i64> %1, <8 x i64> %old + store <8 x i64> %blend, <8 x i64>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i32(<8 x i32>* nocapture, <8 x i32>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i32>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i32> %1, <8 x i32> %old + store <8 x i32> %blend, <8 x i32>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i16(<8 x i16>* nocapture, <8 x i16>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 x i16>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i16> %1, <8 x i16> %old + store <8 x i16> %blend, <8 x i16>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>, + <8 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <8 x MASK> %mask to <8 x i1> + %old = load <8 
x i8>* %0, align 4 + %blend = select <8 x i1> %mask_as_i1, <8 x i8> %1, <8 x i8> %old + store <8 x i8> %blend, <8 x i8>* %0, align 4 + ret void +} + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) + +masked_store_float_double() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +; define these with the macros from stdlib.m4 + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <8 x i8> @__avg_up_uint8(<8 x i8>, <8 x i8>) { + %v0 = shufflevector <8 x i8> %0, <8 x i8> undef, + <16 x i32> + %v1 = shufflevector <8 x i8> %1, <8 x i8> undef, + <16 x i32> + %r16 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %v0, <16 x i8> %v1) + %r = shufflevector <16 x i8> %r16, <16 x i8> undef, + <8 x i32> + ret <8 x i8> %r +} + +declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @__avg_up_uint16(<8 x i16>, <8 x i16>) { + %r = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %0, <8 x i16> %1) + ret <8 x i16> %r +} + +define_avg_up_int8() +define_avg_up_int16() +define_down_avgs() diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll new file mode 100644 index 00000000..a75d8e3a --- /dev/null +++ b/builtins/target-sse4-8.ll @@ -0,0 +1,483 @@ +;; Copyright (c) 2013, Google, Inc. +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Google, Inc. nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
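;; For reference: llvm.x86.sse2.pavg.b / pavg.w compute the unsigned rounded
;; average (a + b + 1) >> 1 per lane with no intermediate overflow, which is
;; the __avg_up_uint8/__avg_up_uint16 contract. In the 8-wide target above
;; (target-sse4-16.ll), __avg_up_uint8 pads its <8 x i8> inputs to the full
;; 16-byte register with undef lanes before calling pavg.b, while
;; __avg_up_uint16 maps to pavg.w directly. A scalar sketch of the per-lane
;; math -- hypothetical helper, for illustration only:

define internal i8 @avg_up_uint8_scalar_sketch(i8 %a, i8 %b) nounwind readnone {
  ;; widen so a + b + 1 cannot overflow
  %wa = zext i8 %a to i16
  %wb = zext i8 %b to i16
  %sum = add i16 %wa, %wb
  %sum1 = add i16 %sum, 1
  ;; rounded average, then narrow back to i8
  %avg = lshr i16 %sum1, 1
  %r = trunc i16 %avg to i8
  ret i8 %r
}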
+ + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; Define common 4-wide stuff +define(`WIDTH',`16') +define(`MASK',`i8') +include(`util.m4') + +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare float @__half_to_float_uniform(i16 %v) nounwind readnone +declare @__half_to_float_varying( %v) nounwind readnone +declare i16 @__float_to_half_uniform(float %v) nounwind readnone +declare @__float_to_half_varying( %v) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rcp + +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + +define @__rcp_varying_float() nounwind readonly alwaysinline { + unary4to16(call, float, @llvm.x86.sse.rcp.ps, %0) + ; do one N-R iteration to improve precision + ; float iv = __rcp_v(v); + ; return iv * (2. - v * iv); + %v_iv = fmul <16 x float> %0, %call + %two_minus = fsub <16 x float> , %v_iv + %iv_mul = fmul <16 x float> %call, %two_minus + ret <16 x float> %iv_mul +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; rsqrt + +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + +define <16 x float> @__rsqrt_varying_float(<16 x float> %v) nounwind readonly alwaysinline { + ; float is = __rsqrt_v(v); + unary4to16(is, float, @llvm.x86.sse.rsqrt.ps, %v) + ; Newton-Raphson iteration to improve precision + ; return 0.5 * is * (3. - (v * is) * is); + %v_is = fmul <16 x float> %v, %is + %v_is_is = fmul <16 x float> %v_is, %is + %three_sub = fsub <16 x float> , %v_is_is + %is_mul = fmul <16 x float> %is, %three_sub + %half_scale = fmul <16 x float> , %is_mul + ret <16 x float> %half_scale +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; sqrt + +declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone + +define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly alwaysinline { + unary4to16(call, float, @llvm.x86.sse.sqrt.ps, %0) + ret <16 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision sqrt + +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +define <16 x double> @__sqrt_varying_double(<16 x double>) nounwind +alwaysinline { + unary2to16(ret, double, @llvm.x86.sse2.sqrt.pd, %0) + ret <16 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding floats + +declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone + +define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8 + round4to16(%0, 8) +} + +define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9 + round4to16(%0, 9) +} + +define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline { + ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 + round4to16(%0, 10) +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rounding doubles + +declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone + +define <16 x double> @__round_varying_double(<16 x 
double>) nounwind readonly alwaysinline { +; XXXround2to4double(%0, 8) + ; FIXME: need round2to16double in util.m4... + ret <16 x double> undef +} + +define <16 x double> @__floor_varying_double(<16 x double>) nounwind readonly alwaysinline { + ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 +; XXXround2to4double(%0, 9) + ret <16 x double> undef +} + +define <16 x double> @__ceil_varying_double(<16 x double>) nounwind readonly alwaysinline { + ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 +; XXXround2to4double(%0, 10) + ret <16 x double> undef +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float min/max + +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + +define <16 x float> @__max_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline { + binary4to16(call, float, @llvm.x86.sse.max.ps, %0, %1) + ret <16 x float> %call +} + +define <16 x float> @__min_varying_float(<16 x float>, <16 x float>) nounwind readonly alwaysinline { + binary4to16(call, float, @llvm.x86.sse.min.ps, %0, %1) + ret <16 x float> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 min/max + +define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <16 x i32> %call +} + +define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <16 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unsigned int min/max + +define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <16 x i32> %call +} + +define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(call, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <16 x i32> %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double precision min/max + +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define <16 x double> @__min_varying_double(<16 x double>, <16 x double>) nounwind readnone { + binary2to16(ret, double, @llvm.x86.sse2.min.pd, %0, %1) + ret <16 x double> %ret +} + +define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwind readnone { + binary2to16(ret, double, @llvm.x86.sse2.max.pd, %0, %1) + ret <16 x double> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; horizontal ops / reductions + +declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone + +define i64 @__movmsk(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %m64 = zext i32 %m to i64 + ret i64 %m64 +} + +define i1 @__any(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %mne = icmp ne i32 %m, 0 + ret i1 %mne +} + +define i1 @__all(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %meq = icmp eq i32 %m, ALL_ON_MASK + ret i1 %meq +} + +define i1 
@__none(<16 x i8>) nounwind readnone alwaysinline { + %m = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %0) + %meq = icmp eq i32 %m, 0 + ret i1 %meq +} + +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline { + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <16 x i16> @__add_varying_i16(<16 x i16>, + <16 x i16>) nounwind readnone alwaysinline { + %r = add <16 x i16> %0, %1 + ret <16 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline { + reduce16(i16, @__add_varying_i16, @__add_uniform_i16) +} + +define internal <16 x float> @__add_varying_float(<16 x float>, <16 x float>) { + %r = fadd <16 x float> %0, %1 + ret <16 x float> %r +} + +define internal float @__add_uniform_float(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define float @__reduce_add_float(<16 x float>) nounwind readonly alwaysinline { + reduce16(float, @__add_varying_float, @__add_uniform_float) +} + +define float @__reduce_min_float(<16 x float>) nounwind readnone { + reduce16(float, @__min_varying_float, @__min_uniform_float) +} + +define float @__reduce_max_float(<16 x float>) nounwind readnone { + reduce16(float, @__max_varying_float, @__max_uniform_float) +} + +define internal <16 x i32> @__add_varying_int32(<16 x i32>, <16 x i32>) { + %r = add <16 x i32> %0, %1 + ret <16 x i32> %r +} + +define internal i32 @__add_uniform_int32(i32, i32) { + %r = add i32 %0, %1 + ret i32 %r +} + +define i32 @__reduce_add_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__add_varying_int32, @__add_uniform_int32) +} + +define i32 @__reduce_min_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__min_varying_int32, @__min_uniform_int32) +} + +define i32 @__reduce_max_int32(<16 x i32>) nounwind readnone { + reduce16(i32, @__max_varying_int32, @__max_uniform_int32) +} + +define i32 @__reduce_min_uint32(<16 x i32>) nounwind readnone { + reduce16(i32, @__min_varying_uint32, @__min_uniform_uint32) +} + +define i32 @__reduce_max_uint32(<16 x i32>) nounwind readnone { + reduce16(i32, @__max_varying_uint32, @__max_uniform_uint32) +} + +define internal <16 x double> @__add_varying_double(<16 x double>, <16 x double>) { + %r = fadd <16 x double> %0, %1 + ret <16 x double> %r +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define double @__reduce_add_double(<16 x double>) nounwind readnone { + reduce16(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double(<16 x double>) nounwind readnone { + reduce16(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double(<16 x double>) nounwind readnone { + reduce16(double, @__max_varying_double, @__max_uniform_double) +} + +define internal <16 x i64> @__add_varying_int64(<16 x i64>, <16 x i64>) { + %r = add <16 x i64> %0, %1 + ret <16 x i64> %r +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define i64 @__reduce_add_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__add_varying_int64, @__add_uniform_int64) +} + 
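A note on the byte reduction above: __reduce_add_int8 uses psad.bw against an all-zero vector. A sum of absolute differences with zero is simply the sum of the bytes, computed separately for each 8-byte half, so adding the two 64-bit halves gives the full horizontal sum. A scalar C sketch of what that computes (names are illustrative only, not the vector implementation):

    #include <stdint.h>
    #include <stdio.h>

    /* What psad.bw computes for one 8-byte group when the second operand is
       all zeros: sum of |b[i] - 0|, i.e. the sum of the bytes. */
    static uint64_t sad8_vs_zero(const uint8_t b[8]) {
        uint64_t s = 0;
        for (int i = 0; i < 8; ++i)
            s += b[i];
        return s;
    }

    /* Mirror of __reduce_add_int8: add the two per-half results and
       truncate to 16 bits. */
    static uint16_t reduce_add_int8(const uint8_t v[16]) {
        return (uint16_t)(sad8_vs_zero(v) + sad8_vs_zero(v + 8));
    }

    int main(void) {
        uint8_t v[16];
        for (int i = 0; i < 16; ++i)
            v[i] = (uint8_t)(i + 1);
        printf("%u\n", (unsigned)reduce_add_int8(v));  /* 136 = 1 + 2 + ... + 16 */
        return 0;
    }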
+define i64 @__reduce_min_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64(<16 x i64>) nounwind readnone { + reduce16(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64(<16 x i64>) nounwind readnone { + reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64(<16 x i64>) nounwind readnone { + reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64) +} + +reduce_equal(16) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +define void @__masked_store_blend_i64(<16 x i64>* nocapture, <16 x i64>, + <16 x i8> %mask) nounwind + alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i64>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i64> %1, <16 x i64> %old + store <16 x i64> %blend, <16 x i64>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i32(<16 x i32>* nocapture, <16 x i32>, + <16 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i32>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i32> %1, <16 x i32> %old + store <16 x i32> %blend, <16 x i32>* %0, align 4 + ret void +} + +define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, + <16 x MASK> %mask) nounwind alwaysinline { + %mask_as_i1 = trunc <16 x MASK> %mask to <16 x i1> + %old = load <16 x i16>* %0, align 4 + %blend = select <16 x i1> %mask_as_i1, <16 x i16> %1, <16 x i16> %old + store <16 x i16> %blend, <16 x i16>* %0, align 4 + ret void +} + +declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone + +define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>, + <16 x MASK> %mask) nounwind alwaysinline { + %old = load <16 x i8>* %0, align 4 + %blend = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %old, <16 x i8> %1, + <16 x i8> %mask) + store <16 x i8> %blend, <16 x i8>* %0, align 4 + ret void +} + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) + +masked_store_float_double() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +; define these with the macros from stdlib.m4 + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @__avg_up_uint8(<16 x i8>, <16 x i8>) nounwind readnone { + %r = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %0, <16 x i8> %1) + ret <16 x i8> %r +} + +declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i16> @__avg_up_uint16(<16 x i16>, <16 x i16>) nounwind readnone { + v16tov8(i16, %0, %a0, %b0) + v16tov8(i16, %1, %a1, %b1) + %r0 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) + %r1 
= call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %b0, <8 x i16> %b1) + v8tov16(i16, %r0, %r1, %r) + ret <16 x i16> %r +} + +define_avg_up_int8() +define_avg_up_int16() +define_down_avgs() diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index ccae4d51..897a09eb 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -102,92 +102,6 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ret <8 x float> %call } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_sinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_cosf4, %0) - ret <8 x float> %ret -} - -define void @__svml_sincos(<8 x float>, <8 x float> *, - <8 x float> *) nounwind readnone alwaysinline { - ; call svml_sincosf4 two times with the two 4-wide sub-vectors - %a = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - %b = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - - %cospa = alloca <4 x float> - %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) - - %cospb = alloca <4 x float> - %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) - - %sin = shufflevector <4 x float> %sa, <4 x float> %sb, - <8 x i32> - store <8 x float> %sin, <8 x float> * %1 - - %cosa = load <4 x float> * %cospa - %cosb = load <4 x float> * %cospb - %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, - <8 x i32> - store <8 x float> %cos, <8 x float> * %2 - - ret void -} - -define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_tanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_atanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan2(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_atan2f4, %0, %1) - ret <8 x float> %ret -} - -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_expf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_logf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_pow(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_powf4, %0, %1) - ret <8 x float> %ret -} - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max @@ -309,6 +223,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, 
<16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <8 x i16> @__add_varying_i16(<8 x i16>, + <8 x i16>) nounwind readnone alwaysinline { + %r = add <8 x i16> %0, %1 + ret <8 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline { + reduce8(i16, @__add_varying_i16, @__add_uniform_i16) +} + define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline { reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float) } @@ -629,3 +573,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r binary2to8(ret, double, @llvm.x86.sse2.max.pd, %0, %1) ret <8 x double> %ret } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index f622b839..5429b461 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -206,66 +206,6 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r ret <4 x double> %ret } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; svml stuff - -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_expf4(<4 x float> %0) - ret <4 x float> %ret -} 
- -define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_logf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions @@ -299,6 +239,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline { ret i1 %cmp } +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline { + %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer, + <16 x i32> + %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8, + <16 x i8> zeroinitializer) + %r0 = extractelement <2 x i64> %rv, i32 0 + %r1 = extractelement <2 x i64> %rv, i32 1 + %r = add i64 %r0, %r1 + %r16 = trunc i64 %r to i16 + ret i16 %r16 +} + +define internal <4 x i16> @__add_varying_i16(<4 x i16>, + <4 x i16>) nounwind readnone alwaysinline { + %r = add <4 x i16> %0, %1 + ret <4 x i16> %r +} + +define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline { + %r = add i16 %0, %1 + ret i16 %r +} + +define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline { + reduce4(i16, @__add_varying_i16, @__add_uniform_i16) +} + declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline { @@ -503,3 +473,9 @@ gen_scatter(i32) gen_scatter(float) gen_scatter(i64) gen_scatter(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + diff --git a/builtins/util.m4 b/builtins/util.m4 index c19d4930..95e3844d 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -49,6 +49,63 @@ define(`MASK_HIGH_BIT_ON', ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; vector deconstruction utilities +;; split 8-wide vector into 2 4-wide vectors +;; +;; $1: vector element type +;; $2: 8-wide vector +;; $3: first 4-wide vector +;; $4: second 4-wide vector + +define(`v8tov4', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> +') + +define(`v16tov8', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> +') + +define(`v4tov2', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> + $4 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> +') + +define(`v8tov2', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $5 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $6 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> +') + +define(`v16tov4', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $5 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $6 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; vector assembly: wider vector from two narrower vectors +;; +;; $1: vector element type +;; $2: first n-wide vector +;; $3: second n-wide vector +;; $4: result 2*n-wide vector 
+define(`v8tov16', ` + $4 = shufflevector <8 x $1> $2, <8 x $1> $3, + <16 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Helper macro for calling various SSE instructions for scalar values ;; but where the instruction takes a vector parameter. ;; $1 : name of variable to put the final value in @@ -156,10 +213,7 @@ define(`reduce16', ` ;; the final reduction define(`reduce8by4', ` - %v1 = shufflevector <8 x $1> %0, <8 x $1> undef, - <4 x i32> - %v2 = shufflevector <8 x $1> %0, <8 x $1> undef, - <4 x i32> + v8tov4($1, %0, %v1, %v2) %m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %v2) %v3 = shufflevector <4 x $1> %m1, <4 x $1> undef, <4 x i32> @@ -266,30 +320,66 @@ define(`binary2to4', ` ;; $4: 8-wide operand value define(`unary4to8', ` - %$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> - %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0) - %$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> - %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1) - %$1 = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, + %__$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %$1 = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, + <8 x i32> +' +) + +;; $1: name of variable into which the final result should go +;; $2: scalar type of the input vector elements +;; $3: scalar type of the result vector elements +;; $4: 4-wide unary vector function to apply +;; $5: 8-wide operand value + +define(`unary4to8conv', ` + %$1_0 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1 = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, <8 x i32> ' ) define(`unary4to16', ` - %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0) - %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1) - %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_2 = call <4 x $2> $3(<4 x $2> %$1_2) - %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> - %v$1_3 = call <4 x $2> $3(<4 x $2> %$1_3) + %__$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %__$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_2 = call <4 x $2> $3(<4 x $2> %__$1_2) + %__$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_3 = call <4 x $2> $3(<4 x $2> %__$1_3) - %$1a = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, + %__$1a = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, <8 x i32> - %$1b = shufflevector <4 x $2> %v$1_2, <4 x $2> %v$1_3, + %__$1b = shufflevector <4 x $2> %__v$1_2, <4 x $2> %__v$1_3, <8 x i32> - %$1 = shufflevector <8 x $2> %$1a, <8 x $2> %$1b, + %$1 = shufflevector <8 x $2> %__$1a, <8 x $2> %__$1b, + <16 x i32> +' +) + +define(`unary4to16conv', ` + %$1_0 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_2 = call <4 
x $3> $4(<4 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_3 = call <4 x $3> $4(<4 x $2> %$1_3) + + %$1a = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, + <8 x i32> + %$1b = shufflevector <4 x $3> %v$1_2, <4 x $3> %v$1_3, + <8 x i32> + %$1 = shufflevector <8 x $3> %$1a, <8 x $3> %$1b, <16 x i32> ' @@ -411,6 +501,42 @@ define(`unary2to8', ` ' ) +define(`unary2to16', ` + %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3) + %$1_4 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4) + %$1_5 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5) + %$1_6 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6) + %$1_7 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> +' +) + ;; Maps an 2-wide binary function to two 8-wide vector operands ;; $1: name of variable into which the final result should go ;; $2: scalar type of the vector elements @@ -432,12 +558,58 @@ define(`binary2to8', ` %$1_3b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> +' +) + +define(`binary2to16', ` + %$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b) + %$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b) + %$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b) + %$1_3a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + %$1_4a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_4b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4a, <2 x $2> %$1_4b) + %$1_5a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_5b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5a, <2 x $2> %$1_5b) + %$1_6a = shufflevector <16 x $2> $4, <16 x $2> 
undef, <2 x i32> + %$1_6b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6a, <2 x $2> %$1_6b) + %$1_7a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_7b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7a, <2 x $2> %$1_7b) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, <4 x i32> %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, <4 x i32> - %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, <8 x i32> + + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> ' ) @@ -460,6 +632,26 @@ ret <8 x float> %ret ' ) +define(`round4to16', ` +%v0 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v1 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v2 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v3 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2) +%r1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2) +%r2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v2, i32 $2) +%r3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v3, i32 $2) +%ret01 = shufflevector <4 x float> %r0, <4 x float> %r1, + <8 x i32> +%ret23 = shufflevector <4 x float> %r2, <4 x float> %r3, + <8 x i32> +%ret = shufflevector <8 x float> %ret01, <8 x float> %ret23, + <16 x i32> +ret <16 x float> %ret +' +) + define(`round8to16', ` %v0 = shufflevector <16 x float> $1, <16 x float> undef, <8 x i32> @@ -690,6 +882,75 @@ shuffles(i64, 8) ;; $4: return type of the LLVM atomic type, in ispc naming paralance (e.g. int32) ;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...) 
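The mask_converts helpers defined just below let global_atomic_associative turn the execution mask into a vector whose element width matches the values being operated on. Sign extension is the key property: an "on" lane (all ones, i.e. -1) stays all ones at any element width, and an "off" lane stays zero, so the subsequent and/or bit tricks work unchanged. A scalar C sketch of that property, for illustration only:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        int8_t on8 = -1, off8 = 0;          /* an 8-bit mask lane: 0xFF or 0x00 */
        int32_t on32 = (int32_t)on8;        /* sign-extends to 0xFFFFFFFF */
        int32_t off32 = (int32_t)off8;      /* stays 0x00000000 */

        int32_t val = 42;
        /* the "zero out any lanes that are off" step: val & mask */
        printf("%d %d\n", val & on32, val & off32);  /* prints: 42 0 */
        return 0;
    }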
+define(`mask_converts', ` +define internal <$1 x i8> @convertmask_i1_i8_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i1_i16_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i1_i32_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i1_i64_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i8_i8_$1(<$1 x i8>) { + ret <$1 x i8> %0 +} +define internal <$1 x i16> @convertmask_i8_i16_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i8_i32_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i8_i64_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i16_i8_$1(<$1 x i16>) { + %r = trunc <$1 x i16> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i16_i16_$1(<$1 x i16>) { + ret <$1 x i16> %0 +} +define internal <$1 x i32> @convertmask_i16_i32_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i16_i64_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i32_i8_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i32_i16_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i32_i32_$1(<$1 x i32>) { + ret <$1 x i32> %0 +} +define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) { + %r = sext <$1 x i32> %0 to <$1 x i64> + ret <$1 x i64> %r +} +') + +mask_converts(WIDTH) + define(`global_atomic_associative', ` define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, @@ -697,17 +958,10 @@ define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, ; first, for any lanes where the mask is off, compute a vector where those lanes ; hold the identity value.. - ; for the bit tricks below, we need the mask to be sign extended to be - ; the size of the element type. - ifelse( - MASK,i1,`%mask = sext <$1 x MASK> %m to <$1 x $3>', - $3,i64, `%mask = sext <$1 x MASK> %m to <$1 x i64>', - $3,i32, ` - ; silly workaround to do %mask = %m, which is not possible directly.. - %maskmem = alloca <$1 x i32> - store <$1 x i32> %m, <$1 x i32> * %maskmem - %mask = load <$1 x i32> * %maskmem' - ) + ; for the bit tricks below, we need the mask to have + ; the same element size as the element type. + %mask = call <$1 x $3> @convertmask_`'MASK`'_$3_$1(<$1 x MASK> %m) + ; zero out any lanes that are off %valoff = and <$1 x $3> %val, %mask @@ -1551,11 +1805,6 @@ declare i1 @__is_compile_time_constant_mask( %mask) declare i1 @__is_compile_time_constant_uniform_int32(i32) declare i1 @__is_compile_time_constant_varying_int32() -define void @__pause() nounwind readnone { - call void asm sideeffect "pause", "~{dirflag},~{fpsr},~{flags}"() nounwind - ret void -} - ; This function declares placeholder masked store functions for the ; front-end to use.
; @@ -2440,13 +2689,12 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline { } define @__sext_varying_bool() nounwind readnone alwaysinline { - ifelse(MASK,i1, ` - %se = sext %0 to - ret %se - ', ` - ret %0') + ifelse(MASK,i32, `ret %0', + `%se = sext %0 to + ret %se') } + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; memcpy/memmove/memset @@ -2830,17 +3078,11 @@ m4exit(`1') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; read hw clock +declare i64 @llvm.readcyclecounter() + define i64 @__clock() nounwind { -entry: - tail call void asm sideeffect "xorl %eax,%eax \0A cpuid", "~{rax},~{rbx},~{rcx},~{rdx},~{dirflag},~{fpsr},~{flags}"() nounwind - %0 = tail call { i32, i32 } asm sideeffect "rdtsc", "={ax},={dx},~{dirflag},~{fpsr},~{flags}"() nounwind - %asmresult = extractvalue { i32, i32 } %0, 0 - %asmresult1 = extractvalue { i32, i32 } %0, 1 - %conv = zext i32 %asmresult1 to i64 - %shl = shl nuw i64 %conv, 32 - %conv2 = zext i32 %asmresult to i64 - %or = or i64 %shl, %conv2 - ret i64 %or + %r = call i64 @llvm.readcyclecounter() + ret i64 %r } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3201,8 +3443,8 @@ return: ;; $1: llvm type of elements (and suffix for function name) define(`gen_masked_store', ` -define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { - per_lane(WIDTH, %2, ` +define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { + per_lane(WIDTH, %2, ` %ptr_LANE_ID = getelementptr * %0, i32 0, i32 LANE %storeval_LANE_ID = extractelement %1, i32 LANE store $1 %storeval_LANE_ID, $1 * %ptr_LANE_ID') @@ -3378,10 +3620,10 @@ define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, define(`packed_load_and_store', ` define i32 @__packed_load_active(i32 * %startptr, * %val_ptr, - %full_mask) nounwind alwaysinline { + %full_mask) nounwind alwaysinline { entry: - %mask = call i64 @__movmsk( %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) + %mask = call i64 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: @@ -3432,10 +3674,10 @@ done: } define i32 @__packed_store_active(i32 * %startptr, %vals, - %full_mask) nounwind alwaysinline { + %full_mask) nounwind alwaysinline { entry: - %mask = call i64 @__movmsk( %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) + %mask = call i64 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: @@ -3544,10 +3786,10 @@ check_neighbors: %castvr = call <$1 x $4> @__rotate_i$6(<$1 x $4> %castvec, i32 1) %vr = bitcast <$1 x $4> %castvr to <$1 x $2> %eq = $5 $7 <$1 x $2> %vec, %vr - ifelse(MASK,i32, ` - %eq32 = sext <$1 x i1> %eq to <$1 x i32> - %eqmm = call i64 @__movmsk(<$1 x i32> %eq32)', ` - %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)') + ifelse(MASK,i1, ` + %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)', + `%eqm = sext <$1 x i1> %eq to <$1 x MASK> + %eqmm = call i64 @__movmsk(<$1 x MASK> %eqm)') %alleq = icmp eq i64 %eqmm, ALL_ON_MASK br i1 %alleq, label %all_equal, label %not_all_equal ', ` @@ -3722,9 +3964,9 @@ pl_done: define(`gen_gather_general', ` ; fully general 32-bit gather, takes array of pointers encoded as vector of i32s define @__gather32_$1( %ptrs, - %vecmask) nounwind readonly alwaysinline { + %vecmask) 
nounwind readonly alwaysinline { %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` + per_lane(WIDTH, %vecmask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * %val_LANE_ID = load $1 * %ptr_LANE_ID @@ -3738,9 +3980,9 @@ define @__gather32_$1( %ptrs, ; fully general 64-bit gather, takes array of pointers encoded as vector of i32s define @__gather64_$1( %ptrs, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` + per_lane(WIDTH, %vecmask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * %val_LANE_ID = load $1 * %ptr_LANE_ID @@ -3804,7 +4046,7 @@ define @__gather_elt64_$1(i8 * %ptr, %offsets, i32 %o define @__gather_factored_base_offsets32_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always ; legal to read from (and we do indeed require that, given the benefits!) @@ -3813,13 +4055,13 @@ define @__gather_factored_base_offsets32_$1(i8 * %ptr, store zeroinitializer, * %offsetsPtr call void @__masked_store_blend_i32( * %offsetsPtr, %offsets, - %vecmask) + %vecmask) %newOffsets = load * %offsetsPtr %deltaPtr = alloca store zeroinitializer, * %deltaPtr call void @__masked_store_blend_i32( * %deltaPtr, %offset_delta, - %vecmask) + %vecmask) %newDelta = load * %deltaPtr %ret0 = call @__gather_elt32_$1(i8 * %ptr, %newOffsets, @@ -3835,7 +4077,7 @@ define @__gather_factored_base_offsets32_$1(i8 * %ptr, @__gather_factored_base_offsets64_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing ; to require that the 0th element of the array being gathered from is always ; legal to read from (and we do indeed require that, given the benefits!) 
@@ -3844,13 +4086,13 @@ define @__gather_factored_base_offsets64_$1(i8 * %ptr, store zeroinitializer, * %offsetsPtr call void @__masked_store_blend_i64( * %offsetsPtr, %offsets, - %vecmask) + %vecmask) %newOffsets = load * %offsetsPtr %deltaPtr = alloca store zeroinitializer, * %deltaPtr call void @__masked_store_blend_i64( * %deltaPtr, %offset_delta, - %vecmask) + %vecmask) %newDelta = load * %deltaPtr %ret0 = call @__gather_elt64_$1(i8 * %ptr, %newOffsets, @@ -3876,27 +4118,27 @@ gen_gather_factored($1) define @__gather_base_offsets32_$1(i8 * %ptr, i32 %offset_scale, %offsets, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %scale_vec = bitcast i32 %offset_scale to <1 x i32> %smear_scale = shufflevector <1 x i32> %scale_vec, <1 x i32> undef, < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > %scaled_offsets = mul %smear_scale, %offsets %v = call @__gather_factored_base_offsets32_$1(i8 * %ptr, %scaled_offsets, i32 1, - zeroinitializer, %vecmask) + zeroinitializer, %vecmask) ret %v } define @__gather_base_offsets64_$1(i8 * %ptr, i32 %offset_scale, %offsets, - %vecmask) nounwind readonly alwaysinline { + %vecmask) nounwind readonly alwaysinline { %scale64 = zext i32 %offset_scale to i64 %scale_vec = bitcast i64 %scale64 to <1 x i64> %smear_scale = shufflevector <1 x i64> %scale_vec, <1 x i64> undef, < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > %scaled_offsets = mul %smear_scale, %offsets %v = call @__gather_factored_base_offsets64_$1(i8 * %ptr, %scaled_offsets, - i32 1, zeroinitializer, %vecmask) + i32 1, zeroinitializer, %vecmask) ret %v } @@ -3955,9 +4197,9 @@ define void @__scatter_elt64_$1(i8 * %ptr, %offsets, i32 %offset_s define void @__scatter_factored_base_offsets32_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, - %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... - per_lane(WIDTH, %mask, ` + per_lane(WIDTH, %mask, ` call void @__scatter_elt32_$1(i8 * %base, %offsets, i32 %offset_scale, %offset_delta, %values, i32 LANE)') ret void @@ -3965,9 +4207,9 @@ define void @__scatter_factored_base_offsets32_$1(i8* %base, %offs define void @__scatter_factored_base_offsets64_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, - %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... 
- per_lane(WIDTH, %mask, ` + per_lane(WIDTH, %mask, ` call void @__scatter_elt64_$1(i8 * %base, %offsets, i32 %offset_scale, %offset_delta, %values, i32 LANE)') ret void @@ -3975,8 +4217,8 @@ define void @__scatter_factored_base_offsets64_$1(i8* %base, %offs ; fully general 32-bit scatter, takes array of pointers encoded as vector of i32s define void @__scatter32_$1( %ptrs, %values, - %mask) nounwind alwaysinline { - per_lane(WIDTH, %mask, ` + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * %val_LANE_ID = extractelement %values, i32 LANE @@ -3987,8 +4229,8 @@ define void @__scatter32_$1( %ptrs, %values, ; fully general 64-bit scatter, takes array of pointers encoded as vector of i64s define void @__scatter64_$1( %ptrs, %values, - %mask) nounwind alwaysinline { - per_lane(WIDTH, %mask, ` + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` %iptr_LANE_ID = extractelement %ptrs, i32 LANE %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * %val_LANE_ID = extractelement %values, i32 LANE @@ -4044,3 +4286,109 @@ define i1 @__rdrand_i64(i64 * %ptr) { ret i1 %good } ') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define(`define_avg_up_uint8', ` +define @__avg_up_uint8(, ) { + %a16 = zext %0 to + %b16 = zext %1 to + %sum1 = add %a16, %b16 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_int8', ` +define @__avg_up_int8(, ) { + %a16 = sext %0 to + %b16 = sext %1 to + %sum1 = add %a16, %b16 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_uint16', ` +define @__avg_up_uint16(, ) { + %a32 = zext %0 to + %b32 = zext %1 to + %sum1 = add %a32, %b32 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_int16', ` +define @__avg_up_int16(, ) { + %a32 = sext %0 to + %b32 = sext %1 to + %sum1 = add %a32, %b32 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_uint8', ` +define @__avg_down_uint8(, ) { + %a16 = zext %0 to + %b16 = zext %1 to + %sum = add %a16, %b16 + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_int8', ` +define @__avg_down_int8(, ) { + %a16 = sext %0 to + %b16 = sext %1 to + %sum = add %a16, %b16 + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_uint16', ` +define @__avg_down_uint16(, ) { + %a32 = zext %0 to + %b32 = zext %1 to + %sum = add %a32, %b32 + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_int16', ` +define @__avg_down_int16(, ) { + %a32 = sext %0 to + %b32 = sext %1 to + %sum = add %a32, %b32 + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_up_avgs', ` +define_avg_up_uint8() +define_avg_up_int8() 
+define_avg_up_uint16() +define_avg_up_int16() +') + +define(`define_down_avgs', ` +define_avg_down_uint8() +define_avg_down_int8() +define_avg_down_uint16() +define_avg_down_int16() +') + +define(`define_avgs', ` +define_up_avgs() +define_down_avgs() +') diff --git a/cbackend.cpp b/cbackend.cpp index d23bcc20..7d4b4cfc 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -3704,6 +3704,7 @@ void CWriter::lowerIntrinsics(llvm::Function &F) { case llvm::Intrinsic::sadd_with_overflow: case llvm::Intrinsic::trap: case llvm::Intrinsic::objectsize: + case llvm::Intrinsic::readcyclecounter: // We directly implement these intrinsics break; default: @@ -4056,6 +4057,9 @@ bool CWriter::visitBuiltinCall(llvm::CallInst &I, llvm::Intrinsic::ID ID, return true; case llvm::Intrinsic::objectsize: return true; + case llvm::Intrinsic::readcyclecounter: + Out << "__clock()"; + return true; } } diff --git a/ctx.cpp b/ctx.cpp index 1e79c97b..c50d22f9 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -1456,13 +1456,13 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) { for (unsigned int i = 0; i < at->getNumElements(); ++i) { llvm::Value *elt = ExtractInst(b, i); llvm::Value *sext = SExtInst(elt, LLVMTypes::BoolVectorType, - LLVMGetName(elt, "_to_boolvec32")); + LLVMGetName(elt, "_to_boolvec")); ret = InsertInst(ret, sext, i); } return ret; } else - return SExtInst(b, LLVMTypes::BoolVectorType, LLVMGetName(b, "_to_i32")); + return SExtInst(b, LLVMTypes::BoolVectorType, LLVMGetName(b, "_to_boolvec")); } @@ -2781,6 +2781,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, // Figure out if we need a 8, 16, 32 or 64-bit masked store. llvm::Function *maskedStoreFunc = NULL; + llvm::Type *llvmValueType = value->getType(); const PointerType *pt = CastType(valueType); if (pt != NULL) { @@ -2809,8 +2810,7 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, else maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64"); } - else if (Type::Equal(valueType, AtomicType::VaryingBool) && - g->target->getMaskBitCount() == 1) { + else if (llvmValueType == LLVMTypes::Int1VectorType) { llvm::Value *notMask = BinaryOperator(llvm::Instruction::Xor, mask, LLVMMaskAllOn, "~mask"); llvm::Value *old = LoadInst(ptr); @@ -2823,28 +2823,22 @@ FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr, StoreInst(final, ptr); return; } - else if (Type::Equal(valueType, AtomicType::VaryingDouble)) { + else if (llvmValueType == LLVMTypes::DoubleVectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_double"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt64) || - Type::Equal(valueType, AtomicType::VaryingUInt64)) { + else if (llvmValueType == LLVMTypes::Int64VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i64"); } - else if (Type::Equal(valueType, AtomicType::VaryingFloat)) { + else if (llvmValueType == LLVMTypes::FloatVectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_float"); } - else if (Type::Equal(valueType, AtomicType::VaryingBool) || - Type::Equal(valueType, AtomicType::VaryingInt32) || - Type::Equal(valueType, AtomicType::VaryingUInt32) || - CastType(valueType) != NULL) { + else if (llvmValueType == LLVMTypes::Int32VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i32"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt16) || - Type::Equal(valueType, AtomicType::VaryingUInt16)) { + else if (llvmValueType == LLVMTypes::Int16VectorType) { maskedStoreFunc 
= m->module->getFunction("__pseudo_masked_store_i16"); } - else if (Type::Equal(valueType, AtomicType::VaryingInt8) || - Type::Equal(valueType, AtomicType::VaryingUInt8)) { + else if (llvmValueType == LLVMTypes::Int8VectorType) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_i8"); } AssertPos(currentPos, maskedStoreFunc != NULL); diff --git a/docs/build.sh b/docs/build.sh index a13f3231..4f4fbfe4 100755 --- a/docs/build.sh +++ b/docs/build.sh @@ -1,14 +1,16 @@ #!/bin/bash +rst2html=rst2html.py + for i in ispc perfguide faq; do - rst2html --template=template.txt --link-stylesheet \ + $rst2html --template=template.txt --link-stylesheet \ --stylesheet-path=css/style.css $i.rst > $i.html done -rst2html --template=template-news.txt --link-stylesheet \ +$rst2html --template=template-news.txt --link-stylesheet \ --stylesheet-path=css/style.css news.rst > news.html -rst2html --template=template-perf.txt --link-stylesheet \ +$rst2html --template=template-perf.txt --link-stylesheet \ --stylesheet-path=css/style.css perf.rst > perf.html #rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex diff --git a/docs/ispc.rst b/docs/ispc.rst index c6c63172..26cf6be3 100755 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -467,45 +467,100 @@ There are three options that affect the compilation target: ``--arch``, which sets the target architecture, ``--cpu``, which sets the target CPU, and ``--target``, which sets the target instruction set. -By default, the ``ispc`` compiler generates code for the 64-bit x86-64 -architecture (i.e. ``--arch=x86-64``.) To compile to a 32-bit x86 target, -supply ``--arch=x86`` on the command line: +If none of these options is specified, ``ispc`` generates code for the +architecture of the system the compiler is running on (i.e. 64-bit x86-64 +(``--arch=x86-64``) on x86 systems and ARM NEON on ARM systems. + +To compile to a 32-bit x86 target, for example, supply ``--arch=x86`` on +the command line: :: ispc foo.ispc -o foo.obj --arch=x86 -No other architectures are currently supported. +Currently-supported architectures are ``x86-64``, ``x86``, and ``arm``. The target CPU determines both the default instruction set used as well as which CPU architecture the code is tuned for. ``ispc --help`` provides a -list of a number of the supported CPUs. By default, the CPU type of the -system on which you're running ``ispc`` is used to determine the target -CPU. +list of all of the supported CPUs. By default, the CPU type of the system +on which you're running ``ispc`` is used to determine the target CPU. :: ispc foo.ispc -o foo.obj --cpu=corei7-avx -Finally, ``--target`` selects between the SSE2, SSE4, and AVX, and AVX2 -instruction sets. (As general context, SSE2 was first introduced in -processors that shipped in 2001, SSE4 was introduced in 2007, and -processors with AVX were introduced in 2010. AVX2 will be supported on -future CPUs based on Intel's "Haswell" architecture. Consult your CPU's -manual for specifics on which vector instruction set it supports.) +Finally, ``--target`` selects the target instruction set. The target +string is of the form ``[ISA]-i[mask size]x[gang size]``. For example, +``--target=avx2-i32x16`` specifies a target with the AVX2 instruction set, +a mask size of 32 bits, and a gang size of 16. 
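For example, reusing the placeholder file names from the earlier examples, the new 8-bit-mask, 16-wide SSE4 target added by this change would be selected with:

::

   ispc foo.ispc -o foo.obj --target=sse4-i8x16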
+ +The following target ISAs are supported: + +============ ========================================== +Target Description +------------ ------------------------------------------ +avx, avx1 AVX (2010-2011 era Intel CPUs) +avx1.1 AVX 1.1 (2012 era "Ivybridge" Intel CPUs) +avx2 AVX 2 target (2013- Intel "Haswell" CPUs) +neon ARM NEON +sse2 SSE2 (early 2000s era x86 CPUs) +sse4 SSE4 (generally 2008-2010 Intel CPUs) +============ ========================================== + +Consult your CPU's manual for specifics on which vector instruction set it +supports. + +The mask size may be 8, 16, or 32 bits, though not all combinations of ISAs +and mask sizes are supported. For best performance, the best general +approach is to choose a mask size equal to the size of the most common +datatype in your programs. For example, if most of your computation is on +32-bit floating-point values, an ``i32`` target is appropriate. However, +if you're mostly doing computation on 8-bit images, ``i8`` is a better choice. + +See `Basic Concepts: Program Instances and Gangs of Program Instances`_ for +more discussion of the "gang size" and its implications for program +execution. + +Running ``ispc --help`` and looking at the output for the ``--target`` +option gives the most up-to-date documentation about which targets your +compiler binary supports. + +The naming scheme for compilation targets changed in August 2013; the +following table shows the relationship between names in the old scheme and +in the new scheme: + +============= =========== +Target Former Name +------------- ----------- +avx1-i32x8 avx, avx1 +avx1-i32x16 avx-x2 +avx1.1-i32x8 avx1.1 +avx1.1-i32x16 avx1.1-x2 +avx2-i32x8 avx2 +avx2-i32x16 avx2-x2 +neon-8 n/a +neon-16 n/a +neon-32 n/a +sse2-i32x4 sse2 +sse2-i32x8 sse2-x2 +sse4-i32x4 sse4 +sse4-i32x8 sse4-x2 +sse4-i8x16 n/a +sse4-i16x8 n/a +============= =========== By default, the target instruction set is chosen based on the most capable one supported by the system on which you're running ``ispc``. You can override this choice with the ``--target`` flag; for example, to select -Intel® SSE2, use ``--target=sse2``. (As with the other options in this -section, see the output of ``ispc --help`` for a full list of supported -targets.) +Intel® SSE2 with a 32-bit mask and 4 program instances in a gang, use +``--target=sse2-i32x4``. (As with the other options in this section, see +the output of ``ispc --help`` for a full list of supported targets.) Generating Generic C++ Output ----------------------------- In addition to generating object files or assembly output for specific -targets like SSE2, SSE4, and AVX, ``ispc`` provides an option to generate +targets like NEON, SSE2, SSE4, and AVX, ``ispc`` provides an option to generate "generic" C++ output. This As an example, consider the following simple ``ispc`` program: @@ -659,7 +714,7 @@ preprocessor runs: * - ISPC - 1 - Detecting that the ``ispc`` compiler is processing the file - * - ISPC_TARGET_{SSE2,SSE4,AVX,AVX2} + * - ISPC_TARGET_{NEON_8,NEON_16,NEON_32,SSE2,SSE4,AVX,AVX11,AVX2,GENERIC} - 1 - One of these will be set, depending on the compilation target. * - ISPC_POINTER_SIZE @@ -3278,9 +3333,6 @@ for this argument. approximately 1.45e-6 over the range -10pi to 10pi.) * ``fast``: more efficient but lower accuracy versions of the default ``ispc`` implementations. -* ``svml``: use Intel "Short Vector Math Library". Use - ``icc`` to link your final executable so that the appropriate libraries - are linked. * ``system``: use the system's math library. 
On many systems, these functions are more accurate than both of ``ispc``'s implementations. Using these functions may be quite @@ -3365,6 +3417,31 @@ The ``isnan()`` functions test whether the given value is a floating-point uniform bool isnan(uniform double v) + +A number of functions are also available for performing operations on 8- and +16-bit quantities; these map to specialized instructions that perform these +operations on targets that support them. ``avg_up()`` computes the average +of the two values, rounding up if their average is halfway between two +integers (i.e., it computes ``(a+b+1)/2``). + +:: + + int8 avg_up(int8 a, int8 b) + unsigned int8 avg_up(unsigned int8 a, unsigned int8 b) + int16 avg_up(int16 a, int16 b) + unsigned int16 avg_up(unsigned int16 a, unsigned int16 b) + + +``avg_down()`` computes the average of the two values, rounding down (i.e., +it computes ``(a+b)/2``). + +:: + + int8 avg_down(int8 a, int8 b) + unsigned int8 avg_down(unsigned int8 a, unsigned int8 b) + int16 avg_down(int16 a, int16 b) + unsigned int16 avg_down(unsigned int16 a, unsigned int16 b) + + Transcendental Functions ------------------------ @@ -3711,29 +3788,44 @@ instances are added together by the ``reduce_add()`` function. :: - uniform float reduce_add(float x) - uniform int reduce_add(int x) - uniform unsigned int reduce_add(unsigned int x) + uniform int16 reduce_add(int8 x) + uniform unsigned int16 reduce_add(unsigned int8 x) + uniform int32 reduce_add(int16 x) + uniform unsigned int32 reduce_add(unsigned int16 x) + uniform int64 reduce_add(int32 x) + uniform unsigned int64 reduce_add(unsigned int32 x) + uniform int64 reduce_add(int64 x) + uniform unsigned int64 reduce_add(unsigned int64 x) -You can also use functions to compute the minimum and maximum value of the -given value across all of the currently-executing program instances. + uniform float reduce_add(float x) + uniform double reduce_add(double x) + +You can also use functions to compute the minimum value of the given value +across all of the currently-executing program instances. :: - uniform float reduce_min(float a) uniform int32 reduce_min(int32 a) uniform unsigned int32 reduce_min(unsigned int32 a) - uniform double reduce_min(double a) uniform int64 reduce_min(int64 a) uniform unsigned int64 reduce_min(unsigned int64 a) - uniform float reduce_max(float a) + uniform float reduce_min(float a) + uniform double reduce_min(double a) + +Equivalent functions are available to compute the maximum of the given +varying variable over the active program instances. + +:: + uniform int32 reduce_max(int32 a) uniform unsigned int32 reduce_max(unsigned int32 a) - uniform double reduce_max(double a) uniform int64 reduce_max(int64 a) uniform unsigned int64 reduce_max(unsigned int64 a) + uniform float reduce_max(float a) + uniform double reduce_max(double a) + Finally, you can check to see if a particular value has the same value in all of the currently-running program instances: @@ -3741,9 +3833,10 @@ all of the currently-running program instances: uniform bool reduce_equal(int32 v) uniform bool reduce_equal(unsigned int32 v) - uniform bool reduce_equal(float v) uniform bool reduce_equal(int64 v) uniform bool reduce_equal(unsigned int64 v) + + uniform bool reduce_equal(float v) uniform bool reduce_equal(double) There are also variants of these functions that return the value as a @@ -3758,10 +3851,11 @@ performance in the `Performance Guide`_.
uniform bool reduce_equal(int32 v, uniform int32 * uniform sameval) uniform bool reduce_equal(unsigned int32 v, uniform unsigned int32 * uniform sameval) - uniform bool reduce_equal(float v, uniform float * uniform sameval) uniform bool reduce_equal(int64 v, uniform int64 * uniform sameval) uniform bool reduce_equal(unsigned int64 v, uniform unsigned int64 * uniform sameval) + + uniform bool reduce_equal(float v, uniform float * uniform sameval) uniform bool reduce_equal(double, uniform double * uniform sameval) If called when none of the program instances are running, diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 828c1ab4..d81101f7 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1162,19 +1162,20 @@ REDUCE_ADD(double, __vec16_d, __reduce_add_double) REDUCE_MINMAX(double, __vec16_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec16_d, __reduce_max_double, >) -REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_int32) +REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec16_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec16_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >) @@ -1758,3 +1759,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} + +#endif // !WIN32 + diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 64b82cb1..531ed215 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -408,15 +408,15 @@ static FORCEINLINE uint64_t __movmsk(__vec32_i1 mask) { return (uint64_t)mask.v; } -static FORCEINLINE __vec32_i1 __any(__vec32_i1 mask) { +static FORCEINLINE bool __any(__vec32_i1 mask) { return (mask.v!=0); } -static FORCEINLINE __vec32_i1 __all(__vec32_i1 mask) { - return (mask.v==0xFFFFFFFF); +static FORCEINLINE bool __all(__vec32_i1 mask) { + return (mask.v==0xFFFFFFFFul); } -static FORCEINLINE __vec32_i1 __none(__vec32_i1 mask) { +static FORCEINLINE bool __none(__vec32_i1 mask) { return (mask.v==0); } @@ -1231,19 +1231,20 @@ REDUCE_ADD(double, __vec32_d, __reduce_add_double) REDUCE_MINMAX(double, __vec32_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec32_d, __reduce_max_double, >) -REDUCE_ADD(uint32_t, __vec32_i32, __reduce_add_int32) +//REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +//REDUCE_ADD(int32_t, 
__vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec32_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec32_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec32_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec32_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec32_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_max_uint64, >) @@ -1826,3 +1827,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // WIN32 + +#undef FORCEINLINE diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index 7869faa5..bbeb007a 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -533,15 +533,15 @@ static FORCEINLINE uint64_t __movmsk(__vec64_i1 mask) { return (uint64_t)mask.v; } -static FORCEINLINE __vec64_i1 __any(__vec64_i1 mask) { +static FORCEINLINE bool __any(__vec64_i1 mask) { return (mask.v!=0); } -static FORCEINLINE __vec64_i1 __all(__vec64_i1 mask) { - return (mask.v==0xFFFFFFFFFFFFFFFF); +static FORCEINLINE bool __all(__vec64_i1 mask) { + return (mask.v==0xFFFFFFFFFFFFFFFFull); } -static FORCEINLINE __vec64_i1 __none(__vec64_i1 mask) { +static FORCEINLINE bool __none(__vec64_i1 mask) { return (mask.v==0); } @@ -1364,19 +1364,20 @@ REDUCE_ADD(double, __vec64_d, __reduce_add_double) REDUCE_MINMAX(double, __vec64_d, __reduce_min_double, <) REDUCE_MINMAX(double, __vec64_d, __reduce_max_double, >) -REDUCE_ADD(uint32_t, __vec64_i32, __reduce_add_int32) +//REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8) +//REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16) + +REDUCE_ADD(int64_t, __vec64_i32, __reduce_add_int32) REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_min_int32, <) REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_max_int32, >) -REDUCE_ADD(uint32_t, __vec64_i32, __reduce_add_uint32) REDUCE_MINMAX(uint32_t, __vec64_i32, __reduce_min_uint32, <) REDUCE_MINMAX(uint32_t, __vec64_i32, __reduce_max_uint32, >) -REDUCE_ADD(uint64_t, __vec64_i64, __reduce_add_int64) +REDUCE_ADD(int64_t, __vec64_i64, __reduce_add_int64) REDUCE_MINMAX(int64_t, __vec64_i64, __reduce_min_int64, <) REDUCE_MINMAX(int64_t, __vec64_i64, __reduce_max_int64, >) -REDUCE_ADD(uint64_t, __vec64_i64, __reduce_add_uint64) REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_min_uint64, <) REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_max_uint64, >) @@ -1959,3 +1960,23 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, return __sync_val_compare_and_swap(p, cmpval, newval); #endif } + +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t 
__clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif + +#undef FORCEINLINE diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index bf383c88..8baef8cb 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1511,6 +1511,22 @@ static FORCEINLINE int64_t __count_trailing_zeros_i64(const __vec1_i64 mask) { // reductions /////////////////////////////////////////////////////////////////////////// +static FORCEINLINE int16_t __reduce_add_i8(__vec16_i8 v) { + // TODO: improve this! + int16_t ret = 0; + for (int i = 0; i < 16; ++i) + ret += v.v[i]; + return ret; +} + +static FORCEINLINE int32_t __reduce_add_i16(__vec16_i16 v) { + // TODO: improve this! + int32_t ret = 0; + for (int i = 0; i < 16; ++i) + ret += v.v[i]; + return ret; +} + static FORCEINLINE uint32_t __reduce_add_i32(__vec16_i32 v) { return _mm512_reduce_add_epi32(v); } @@ -2105,9 +2121,24 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, #endif } +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE #undef PRE_ALIGN #undef POST_ALIGN - - - diff --git a/examples/intrinsics/knc2x.h b/examples/intrinsics/knc2x.h index 0041a6c9..a1b1fc9d 100644 --- a/examples/intrinsics/knc2x.h +++ b/examples/intrinsics/knc2x.h @@ -1607,6 +1607,9 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { /////////////////////////////////////////////////////////////////////////// // reductions +REDUCE_ADD(int16_t, __vec32_i8, __reduce_add_int8) +REDUCE_ADD(int32_t, __vec32_i16, __reduce_add_int16) + static FORCEINLINE float __reduce_add_float(__vec32_f v) { return _mm512_reduce_add_ps(v.v1) + _mm512_reduce_add_ps(v.v2); } @@ -2052,7 +2055,24 @@ static FORCEINLINE void __aos_to_soa4_float(float *ptr, __vec32_f *out0, __vec32 } */ +#ifdef WIN32 +#include +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE #undef PRE_ALIGN #undef POST_ALIGN - diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index d4739d61..ff00d920 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -2528,6 +2528,22 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) { /////////////////////////////////////////////////////////////////////////// // reductions +static FORCEINLINE int16_t __reduce_add_int8(__vec4_i8 v) { + // TODO: improve + int16_t ret = 0; + for (int i = 0; i < 4; ++i) + ret += __extract_element(v, i); + return ret; +} + 
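The ``__clock()`` helpers added to these intrinsics headers pair a serializing ``cpuid`` with ``rdtsc`` so that earlier instructions cannot be reordered past the timestamp read. A minimal usage sketch follows (a hypothetical timing harness, GCC/Clang on x86-64 only; it is not part of the patch):

    #include <cstdint>
    #include <cstdio>

    // Same cpuid + rdtsc pattern as the __clock() helpers above.
    static inline uint64_t read_cycles() {
        uint32_t low, high;
        __asm__ __volatile__("xorl %%eax,%%eax \n cpuid"
                             ::: "%rax", "%rbx", "%rcx", "%rdx");
        __asm__ __volatile__("rdtsc" : "=a"(low), "=d"(high));
        return ((uint64_t)high << 32) | low;
    }

    int main() {
        uint64_t start = read_cycles();
        volatile double sink = 0.;
        for (int i = 0; i < 1000000; ++i)
            sink += i * 0.5;                       // region being timed
        uint64_t end = read_cycles();
        printf("elapsed: %llu cycles\n", (unsigned long long)(end - start));
        return 0;
    }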
+static FORCEINLINE int32_t __reduce_add_int16(__vec4_i16 v) { + // TODO: improve + int32_t ret = 0; + for (int i = 0; i < 4; ++i) + ret += __extract_element(v, i); + return ret; +} + static FORCEINLINE float __reduce_add_float(__vec4_f v) { float r = bits_as_float(_mm_extract_ps(v.v, 0)); r += bits_as_float(_mm_extract_ps(v.v, 1)); @@ -3984,6 +4000,22 @@ static FORCEINLINE uint64_t __atomic_cmpxchg(uint64_t *p, uint64_t cmpval, #endif } +#ifdef WIN32 +#include <intrin.h> +#define __clock __rdtsc +#else // WIN32 +static FORCEINLINE uint64_t __clock() { + uint32_t low, high; +#ifdef __x86_64 + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); +#else + __asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid" + ::: "%eax", "%ebx", "%ecx", "%edx" ); +#endif + __asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; +} +#endif // !WIN32 + #undef FORCEINLINE - - diff --git a/expr.cpp b/expr.cpp index fc3d295a..614cb5e5 100644 --- a/expr.cpp +++ b/expr.cpp @@ -1911,6 +1911,40 @@ lEmitLogicalOp(BinaryExpr::Op op, Expr *arg0, Expr *arg1, } +/* Returns true if shifting right by the given amount will lead to + inefficient code. (Assumes x86 target. May also warn inaccurately if + later optimizations simplify the shift amount more than we are able to + see at this point.) */ +static bool +lIsDifficultShiftAmount(Expr *expr) { + // Uniform shifts (of uniform values) are no problem. + if (expr->GetType()->IsVaryingType() == false) + return false; + + ConstExpr *ce = dynamic_cast<ConstExpr *>(expr); + if (ce) { + // If the shift is by a constant amount, *and* it's the same amount + // in all vector lanes, we're in good shape. + uint32_t amount[ISPC_MAX_NVEC]; + int count = ce->GetValues(amount); + for (int i = 1; i < count; ++i) + if (amount[i] != amount[0]) + return true; + return false; + } + + TypeCastExpr *tce = dynamic_cast<TypeCastExpr *>(expr); + if (tce && tce->expr) { + // Finally, if the shift amount is given by a uniform value that's + // been smeared out into a varying, we have the same shift for all + // lanes and are also in good shape. + return (tce->expr->GetType()->IsUniformType() == false); + } + + return true; +} + + llvm::Value * BinaryExpr::GetValue(FunctionEmitContext *ctx) const { if (!arg0 || !arg1) { @@ -1951,9 +1985,8 @@ BinaryExpr::GetValue(FunctionEmitContext *ctx) const { case BitAnd: case BitXor: case BitOr: { - if (op == Shr && arg1->GetType()->IsVaryingType() && - dynamic_cast<ConstExpr *>(arg1) == NULL) - PerformanceWarning(pos, "Shift right is extremely inefficient for " + if (op == Shr && lIsDifficultShiftAmount(arg1)) + PerformanceWarning(pos, "Shift right is inefficient for " "varying shift amounts."); return lEmitBinaryBitOp(op, value0, value1, arg0->GetType()->IsUnsignedType(), ctx); @@ -2207,6 +2240,49 @@ lConstFoldBinaryIntOp(ConstExpr *constArg0, ConstExpr *constArg1, } +/* Returns true if the given arguments (which are assumed to be the + operands of a divide) represent a divide that can be performed by one of + the __fast_idiv functions. + */ +static bool +lCanImproveVectorDivide(Expr *arg0, Expr *arg1, int *divisor) { + const Type *type = arg0->GetType(); + if (!type) + return false; + + // The value being divided must be an int8/16/32. 
+ if (!(Type::EqualIgnoringConst(type, AtomicType::VaryingInt8) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt8) || + Type::EqualIgnoringConst(type, AtomicType::VaryingInt16) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt16) || + Type::EqualIgnoringConst(type, AtomicType::VaryingInt32) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt32))) + return false; + + // The divisor must be the same compile-time constant value for all of + // the vector lanes. + ConstExpr *ce = dynamic_cast<ConstExpr *>(arg1); + if (!ce) + return false; + int64_t div[ISPC_MAX_NVEC]; + int count = ce->GetValues(div); + for (int i = 1; i < count; ++i) + if (div[i] != div[0]) + return false; + *divisor = div[0]; + + // And finally, the divisor must be >= 2 and <128 (for 8-bit divides), + // and <256 otherwise. + if (*divisor < 2) + return false; + if (Type::EqualIgnoringConst(type, AtomicType::VaryingInt8) || + Type::EqualIgnoringConst(type, AtomicType::VaryingUInt8)) + return *divisor < 128; + else + return *divisor < 256; +} + + Expr * BinaryExpr::Optimize() { if (arg0 == NULL || arg1 == NULL) @@ -2269,6 +2345,32 @@ BinaryExpr::Optimize() { } } + int divisor; + if (op == Div && lCanImproveVectorDivide(arg0, arg1, &divisor)) { + Debug(pos, "Improving vector divide by constant %d", divisor); + + std::vector<Symbol *> idivFuns; + m->symbolTable->LookupFunction("__fast_idiv", &idivFuns); + if (idivFuns.size() == 0) { + Warning(pos, "Couldn't find __fast_idiv to optimize integer divide. " + "Are you compiling with --nostdlib?"); + return this; + } + + Expr *idivSymExpr = new FunctionSymbolExpr("__fast_idiv", idivFuns, pos); + ExprList *args = new ExprList(arg0, pos); + args->exprs.push_back(new ConstExpr(AtomicType::UniformInt32, divisor, arg1->pos)); + Expr *idivCall = new FunctionCallExpr(idivSymExpr, args, pos); + + idivCall = ::TypeCheck(idivCall); + if (idivCall == NULL) + return NULL; + + Assert(Type::EqualIgnoringConst(GetType(), idivCall->GetType())); + idivCall = new TypeCastExpr(GetType(), idivCall, pos); + return ::Optimize(idivCall); + } + // From here on out, we're just doing constant folding, so if both args // aren't constants then we're done... if (constArg0 == NULL || constArg1 == NULL) @@ -3021,6 +3123,14 @@ static llvm::Value * lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, llvm::Value *expr1, llvm::Value *expr2, const Type *type) { +#if 0 // !defined(LLVM_3_1) + // Though it should be equivalent, this seems to cause non-trivial + // performance regressions versus the below. This may be related to + // http://llvm.org/bugs/show_bug.cgi?id=16941. 
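The rewrite above turns a varying divide by a compile-time constant into a call to the ``__fast_idiv`` standard-library routine. The usual trick behind such helpers is to replace the division with a multiply by a precomputed "magic" constant followed by a shift; the following standalone sketch (generic strength reduction, not ispc's actual ``__fast_idiv``) verifies that idea for unsigned 8-bit dividends and divisors from 2 to 255:

    #include <cstdint>
    #include <cstdio>

    int main() {
        int mismatches = 0;
        for (uint32_t d = 2; d < 256; ++d) {
            // Round-up 16-bit reciprocal ("magic") multiplier for dividing by d.
            uint32_t magic = (65536u + d - 1) / d;
            for (uint32_t x = 0; x < 256; ++x)
                if (((x * magic) >> 16) != x / d)
                    ++mismatches;
        }
        printf("mismatches: %d\n", mismatches);   // prints 0: multiply+shift == x/d
        return 0;
    }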
+ if (test->getType() != LLVMTypes::Int1VectorType) + test = ctx->TruncInst(test, LLVMTypes::Int1VectorType); + return ctx->SelectInst(test, expr1, expr2, "select"); +#else llvm::Value *resultPtr = ctx->AllocaInst(expr1->getType(), "selectexpr_tmp"); // Don't need to worry about masking here ctx->StoreInst(expr2, resultPtr); @@ -3029,6 +3139,7 @@ lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, PointerType::GetUniform(type)->LLVMType(g->ctx)); ctx->StoreInst(expr1, resultPtr, test, type, PointerType::GetUniform(type)); return ctx->LoadInst(resultPtr, "selectexpr_final"); +#endif // !LLVM_3_1 } @@ -6059,9 +6170,9 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) - // If we have a bool vector of i32 elements, first truncate - // down to a single bit + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) + // If we have a bool vector of non-i1 elements, first + // truncate down to a single bit. exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); // And then do an unisgned int->float cast cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int @@ -6103,8 +6214,8 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) - // truncate i32 bool vector values to i1s + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) + // truncate bool vector values to i1s exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int to double exprVal, targetType, cOpName); @@ -6141,7 +6252,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6177,7 +6288,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6219,7 +6330,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6259,7 +6370,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6305,7 +6416,7 @@ 
lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6345,7 +6456,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6391,7 +6502,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6429,7 +6540,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, switch (fromType->basicType) { case AtomicType::TYPE_BOOL: if (fromType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, cOpName); cast = ctx->ZExtInst(exprVal, targetType, cOpName); break; @@ -6523,12 +6634,12 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, if (fromType->IsUniformType()) { if (toType->IsVaryingType() && - LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) { - // extend out to i32 bool values from i1 here. then we'll - // turn into a vector below, the way it does for everyone - // else... + LLVMTypes::BoolVectorType != LLVMTypes::Int1VectorType) { + // extend out to an bool as an i8/i16/i32 from the i1 here. + // Then we'll turn that into a vector below, the way it + // does for everyone else... cast = ctx->SExtInst(cast, LLVMTypes::BoolVectorType->getElementType(), - LLVMGetName(cast, "to_i32bool")); + LLVMGetName(cast, "to_i_bool")); } } else diff --git a/ispc.cpp b/ispc.cpp index 7743d6b2..8a0f16c6 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -106,7 +106,7 @@ static void __cpuidex(int info[4], int level, int count) { static const char * lGetSystemISA() { #ifdef __arm__ - return "neon"; + return "neon-i32x4"; #else int info[4]; __cpuid(info, 1); @@ -121,19 +121,19 @@ lGetSystemISA() { int info2[4]; __cpuidex(info2, 7, 0); if ((info2[1] & (1 << 5)) != 0) - return "avx2"; + return "avx2-i32x8"; else - return "avx1.1"; + return "avx1.1-i32x8"; } // Regular AVX - return "avx"; + return "avx-i32x8"; } else if ((info[2] & (1 << 19)) != 0) - return "sse4"; + return "sse4-i32x4"; else if ((info[3] & (1 << 26)) != 0) - return "sse2"; + return "sse2-i32x4"; else { - fprintf(stderr, "Unable to detect supported SSE/AVX ISA. Exiting.\n"); + Error(SourcePos(), "Unable to detect supported SSE/AVX ISA. Exiting."); exit(1); } #endif @@ -186,22 +186,22 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // If a CPU was specified explicitly, try to pick the best // possible ISA based on that. 
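``lGetSystemISA()`` above maps the host CPU's capabilities (queried via ``cpuid``) onto one of the new target names. The same mapping can be illustrated with the GCC/Clang ``__builtin_cpu_supports()`` builtin; this is only a sketch of that logic (it is not code from the patch, and it omits the AVX 1.1 distinction):

    #include <cstdio>

    // Pick a default ispc target string from the host CPU's features (x86 only).
    static const char *pick_default_target() {
        if (__builtin_cpu_supports("avx2"))   return "avx2-i32x8";
        if (__builtin_cpu_supports("avx"))    return "avx-i32x8";
        if (__builtin_cpu_supports("sse4.1")) return "sse4-i32x4";
        if (__builtin_cpu_supports("sse2"))   return "sse2-i32x4";
        return NULL;  // no usable SSE/AVX ISA detected
    }

    int main() {
        const char *t = pick_default_target();
        printf("default target: %s\n", t ? t : "(none)");
        return 0;
    }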
if (!strcmp(cpu, "core-avx2")) - isa = "avx2"; + isa = "avx2-i32x8"; #ifdef ISPC_ARM_ENABLED else if (!strcmp(cpu, "cortex-a9") || !strcmp(cpu, "cortex-a15")) - isa = "neon"; + isa = "neon-i32x4"; #endif else if (!strcmp(cpu, "core-avx-i")) - isa = "avx1.1"; + isa = "avx1.1-i32x8"; else if (!strcmp(cpu, "sandybridge") || !strcmp(cpu, "corei7-avx")) - isa = "avx"; + isa = "avx-i32x8"; else if (!strcmp(cpu, "corei7") || !strcmp(cpu, "penryn")) - isa = "sse4"; + isa = "sse4-i32x4"; else - isa = "sse2"; + isa = "sse2-i32x4"; Warning(SourcePos(), "No --target specified on command-line. " "Using ISA \"%s\" based on specified CPU \"%s\".", isa, cpu); @@ -211,12 +211,12 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // supports. isa = lGetSystemISA(); Warning(SourcePos(), "No --target specified on command-line. " - "Using system ISA \"%s\".", isa); + "Using default system target \"%s\".", isa); } } #if defined(ISPC_ARM_ENABLED) && !defined(__arm__) - if (cpu == NULL && !strcmp(isa, "neon")) + if (cpu == NULL && !strncmp(isa, "neon", 4)) // If we're compiling NEON on an x86 host and the CPU wasn't // supplied, don't go and set the CPU based on the host... cpu = "cortex-a9"; @@ -241,8 +241,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } } if (foundCPU == false) { - fprintf(stderr, "Error: CPU type \"%s\" unknown. Supported CPUs: " - "%s.\n", cpu, SupportedTargetCPUs().c_str()); + Error(SourcePos(), "Error: CPU type \"%s\" unknown. Supported CPUs: " + "%s.", cpu, SupportedCPUs().c_str()); return; } } @@ -251,7 +251,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : if (arch == NULL) { #ifdef ISPC_ARM_ENABLED - if (!strcmp(isa, "neon")) + if (!strncmp(isa, "neon", 4)) arch = "arm"; else #endif @@ -283,7 +283,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } // Check default LLVM generated targets - if (!strcasecmp(isa, "sse2")) { + if (!strcasecmp(isa, "sse2") || + !strcasecmp(isa, "sse2-i32x4")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; @@ -291,7 +292,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "sse2-x2")) { + else if (!strcasecmp(isa, "sse2-x2") || + !strcasecmp(isa, "sse2-i32x8")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 8; @@ -299,7 +301,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "sse4")) { + else if (!strcasecmp(isa, "sse4") || + !strcasecmp(isa, "sse4-i32x4")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; @@ -308,7 +311,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "sse4x2") || !strcasecmp(isa, "sse4-x2")) { + else if (!strcasecmp(isa, "sse4x2") || + !strcasecmp(isa, "sse4-x2") || + !strcasecmp(isa, "sse4-i32x8")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 8; @@ -316,7 +321,24 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "generic-4")) { + else if (!strcasecmp(isa, "sse4-i8x16")) { + this->m_isa 
= Target::SSE4; + this->m_nativeVectorWidth = 16; + this->m_vectorWidth = 16; + this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + this->m_maskingIsFree = false; + this->m_maskBitCount = 8; + } + else if (!strcasecmp(isa, "sse4-i16x8")) { + this->m_isa = Target::SSE4; + this->m_nativeVectorWidth = 8; + this->m_vectorWidth = 8; + this->m_attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + this->m_maskingIsFree = false; + this->m_maskBitCount = 16; + } + else if (!strcasecmp(isa, "generic-4") || + !strcasecmp(isa, "generic-x4")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; @@ -326,7 +348,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-8")) { + else if (!strcasecmp(isa, "generic-8") || + !strcasecmp(isa, "generic-x8")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -336,7 +359,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-16")) { + else if (!strcasecmp(isa, "generic-16") || + !strcasecmp(isa, "generic-x16")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; @@ -346,7 +370,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-32")) { + else if (!strcasecmp(isa, "generic-32") || + !strcasecmp(isa, "generic-x32")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 32; this->m_vectorWidth = 32; @@ -356,7 +381,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-64")) { + else if (!strcasecmp(isa, "generic-64") || + !strcasecmp(isa, "generic-x64")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 64; this->m_vectorWidth = 64; @@ -366,14 +392,17 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasTranscendentals = true; this->m_hasGather = this->m_hasScatter = true; } - else if (!strcasecmp(isa, "generic-1")) { + else if (!strcasecmp(isa, "generic-1") || + !strcasecmp(isa, "generic-x1")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 1; this->m_vectorWidth = 1; this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avx") || !strcasecmp(isa, "avx1")) { + else if (!strcasecmp(isa, "avx") || + !strcasecmp(isa, "avx1") || + !strcasecmp(isa, "avx1-i32x8")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -381,7 +410,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avx-x2") || !strcasecmp(isa, "avx1-x2")) { + else if (!strcasecmp(isa, "avx-x2") || + !strcasecmp(isa, "avx1-x2") || + !strcasecmp(isa, "avx1-i32x16")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 16; @@ -389,7 +420,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; 
this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avx1.1")) { + else if (!strcasecmp(isa, "avx1.1") || + !strcasecmp(isa, "avx1.1-i32x8")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -402,7 +434,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasRand = true; #endif } - else if (!strcasecmp(isa, "avx1.1-x2")) { + else if (!strcasecmp(isa, "avx1.1-x2") || + !strcasecmp(isa, "avx1.1-i32x16")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 16; @@ -415,7 +448,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasRand = true; #endif } - else if (!strcasecmp(isa, "avx2")) { + else if (!strcasecmp(isa, "avx2") || + !strcasecmp(isa, "avx2-i32x8")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; this->m_vectorWidth = 8; @@ -433,7 +467,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_hasGather = true; #endif } - else if (!strcasecmp(isa, "avx2-x2")) { + else if (!strcasecmp(isa, "avx2-x2") || + !strcasecmp(isa, "avx2-i32x16")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 16; this->m_vectorWidth = 16; @@ -452,8 +487,27 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : #endif } #ifdef ISPC_ARM_ENABLED - else if (!strcasecmp(isa, "neon")) { - this->m_isa = Target::NEON; + else if (!strcasecmp(isa, "neon-i8x16")) { + this->m_isa = Target::NEON8; + this->m_nativeVectorWidth = 16; + this->m_vectorWidth = 16; + this->m_attributes = "+neon,+fp16"; + this->m_hasHalf = true; // ?? + this->m_maskingIsFree = false; + this->m_maskBitCount = 8; + } + else if (!strcasecmp(isa, "neon-i16x8")) { + this->m_isa = Target::NEON16; + this->m_nativeVectorWidth = 8; + this->m_vectorWidth = 8; + this->m_attributes = "+neon,+fp16"; + this->m_hasHalf = true; // ?? + this->m_maskingIsFree = false; + this->m_maskBitCount = 16; + } + else if (!strcasecmp(isa, "neon") || + !strcasecmp(isa, "neon-i32x4")) { + this->m_isa = Target::NEON32; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; this->m_attributes = "+neon,+fp16"; @@ -463,8 +517,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } #endif else { - fprintf(stderr, "Target ISA \"%s\" is unknown. Choices are: %s\n", - isa, SupportedTargetISAs()); + Error(SourcePos(), "Target \"%s\" is unknown. 
Choices are: %s.", + isa, SupportedTargets()); error = true; } @@ -477,7 +531,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : std::string featuresString = m_attributes; llvm::TargetOptions options; #ifdef ISPC_ARM_ENABLED - if (m_isa == Target::NEON) + if (m_isa == Target::NEON8 || m_isa == Target::NEON16 || + m_isa == Target::NEON32) options.FloatABIType = llvm::FloatABI::Hard; #endif #if !defined(LLVM_3_1) @@ -557,7 +612,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : std::string -Target::SupportedTargetCPUs() { +Target::SupportedCPUs() { std::string ret; int count = sizeof(supportedCPUs) / sizeof(supportedCPUs[0]); for (int i = 0; i < count; ++i) { @@ -570,7 +625,7 @@ Target::SupportedTargetCPUs() { const char * -Target::SupportedTargetArchs() { +Target::SupportedArchs() { return #ifdef ISPC_ARM_ENABLED "arm, " @@ -580,14 +635,18 @@ Target::SupportedTargetArchs() { const char * -Target::SupportedTargetISAs() { +Target::SupportedTargets() { return #ifdef ISPC_ARM_ENABLED - "neon, " + "neon-i8x16, neon-16x8, neon-32x4, " #endif - "sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2" - ", avx1.1, avx1.1-x2, avx2, avx2-x2" - ", generic-1, generic-4, generic-8, generic-16, generic-32"; + "sse2-i32x4, sse2-i32x8, " + "sse4-i32x4, sse4-i32x8, sse4-i16x8, sse4-i8x16, " + "avx1-i32x8, avx1-i32x16, " + "avx1.1-i32x8, avx1.1-i32x16, " + "avx2-i32x8, avx2-i32x16, " + "generic-x1, generic-x4, generic-x8, generic-x16, " + "generic-x32, generic-x64"; } @@ -624,9 +683,13 @@ const char * Target::ISAToString(ISA isa) { switch (isa) { #ifdef ISPC_ARM_ENABLED - case Target::NEON: + case Target::NEON8: + return "neon-8"; + case Target::NEON16: + return "neon-16"; + case Target::NEON32: + return "neon-32"; #endif - return "neon"; case Target::SSE2: return "sse2"; case Target::SSE4: diff --git a/ispc.h b/ispc.h index bb9e2b31..fc78e415 100644 --- a/ispc.h +++ b/ispc.h @@ -181,9 +181,10 @@ public: added or the enumerant values are reordered. */ enum ISA { #ifdef ISPC_ARM_ENABLED - NEON, + NEON32, NEON16, NEON8, #endif - SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, NUM_ISAS }; + SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, + NUM_ISAS }; /** Initializes the given Target pointer for a target of the given name, if the name is a known target. Returns true if the @@ -191,16 +192,16 @@ public: Target(const char *arch, const char *cpu, const char *isa, bool pic); /** Returns a comma-delimited string giving the names of the currently - supported target ISAs. */ - static const char *SupportedTargetISAs(); + supported compilation targets. */ + static const char *SupportedTargets(); /** Returns a comma-delimited string giving the names of the currently - supported target CPUs. */ - static std::string SupportedTargetCPUs(); + supported CPUs. */ + static std::string SupportedCPUs(); /** Returns a comma-delimited string giving the names of the currently - supported target architectures. */ - static const char *SupportedTargetArchs(); + supported architectures. */ + static const char *SupportedArchs(); /** Returns a triple string specifying the target architecture, vendor, and environment. */ @@ -487,7 +488,7 @@ struct Globals { /** There are a number of math libraries that can be used for transcendentals and the like during program compilation. 
*/ - enum MathLib { Math_ISPC, Math_ISPCFast, Math_SVML, Math_System }; + enum MathLib { Math_ISPC, Math_ISPCFast, Math_System }; MathLib mathLib; /** Records whether the ispc standard library should be made available diff --git a/ispc.vcxproj b/ispc.vcxproj index 36fbad5d..b4a8b764 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -51,10 +51,16 @@ + + + + - - + + + + 4146;4800;4996;4355;4624;4005;4003;4018 @@ -97,11 +103,13 @@ Document - %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py x86 > $(Configuration)/gen-stdlib-x86.cpp; -%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DISPC_TARGET_GENERIC=1 -DPI=3.1415926535 | python stdlib2cpp.py generic > $(Configuration)/gen-stdlib-generic.cpp; + %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask1 > $(Configuration)/gen-stdlib-mask1.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask8 > $(Configuration)/gen-stdlib-mask8.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask16 > $(Configuration)/gen-stdlib-mask16.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py mask32 > $(Configuration)/gen-stdlib-mask32.cpp; - $(Configuration)/gen-stdlib-generic.cpp;$(Configuration)/gen-stdlib-x86.cpp - Building gen-stdlib-{generic,x86}.cpp + $(Configuration)/gen-stdlib-mask1.cpp;$(Configuration)/gen-stdlib-mask8.cpp;$(Configuration)/gen-stdlib-mask16.cpp;$(Configuration)/gen-stdlib-mask32.cpp + Building gen-stdlib-{mask1,8,16,32}.cpp @@ -131,6 +139,42 @@ Building gen-bitcode-sse4-64bit.cpp + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 32bit > $(Configuration)/gen-bitcode-sse4-8-32bit.cpp + $(Configuration)/gen-bitcode-sse4-8-32bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-8-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-8.ll | python bitcode2cpp.py builtins\target-sse4-8.ll 64bit > $(Configuration)/gen-bitcode-sse4-8-64bit.cpp + $(Configuration)/gen-bitcode-sse4-8-64bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-8-64bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 32bit > $(Configuration)/gen-bitcode-sse4-16-32bit.cpp + $(Configuration)/gen-bitcode-sse4-16-32bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-16-32bit.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-sse4-16.ll | python bitcode2cpp.py builtins\target-sse4-16.ll 64bit > $(Configuration)/gen-bitcode-sse4-16-64bit.cpp + $(Configuration)/gen-bitcode-sse4-16-64bit.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4-16-64bit.cpp + + Document diff --git a/lex.ll b/lex.ll index f6633fce..8baa627a 100644 --- a/lex.ll +++ b/lex.ll @@ -77,6 +77,8 @@ static int allTokens[] = { TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE, TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT, 
TOKEN_FLOAT_CONSTANT, + TOKEN_INT8_CONSTANT, TOKEN_UINT8_CONSTANT, + TOKEN_INT16_CONSTANT, TOKEN_UINT16_CONSTANT, TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT, TOKEN_INT64_CONSTANT, TOKEN_UINT64_CONSTANT, TOKEN_INC_OP, TOKEN_DEC_OP, TOKEN_LEFT_OP, TOKEN_RIGHT_OP, TOKEN_LE_OP, @@ -150,6 +152,10 @@ void ParserInit() { tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\""; tokenToName[TOKEN_DOTDOTDOT] = "..."; tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT"; + tokenToName[TOKEN_INT8_CONSTANT] = "TOKEN_INT8_CONSTANT"; + tokenToName[TOKEN_UINT8_CONSTANT] = "TOKEN_UINT8_CONSTANT"; + tokenToName[TOKEN_INT16_CONSTANT] = "TOKEN_INT16_CONSTANT"; + tokenToName[TOKEN_UINT16_CONSTANT] = "TOKEN_UINT16_CONSTANT"; tokenToName[TOKEN_INT32_CONSTANT] = "TOKEN_INT32_CONSTANT"; tokenToName[TOKEN_UINT32_CONSTANT] = "TOKEN_UINT32_CONSTANT"; tokenToName[TOKEN_INT64_CONSTANT] = "TOKEN_INT64_CONSTANT"; @@ -260,6 +266,10 @@ void ParserInit() { tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\""; tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'"; tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant"; + tokenNameRemap["TOKEN_INT8_CONSTANT"] = "int8 constant"; + tokenNameRemap["TOKEN_UINT8_CONSTANT"] = "unsigned int8 constant"; + tokenNameRemap["TOKEN_INT16_CONSTANT"] = "int16 constant"; + tokenNameRemap["TOKEN_UINT16_CONSTANT"] = "unsigned int16 constant"; tokenNameRemap["TOKEN_INT32_CONSTANT"] = "int32 constant"; tokenNameRemap["TOKEN_UINT32_CONSTANT"] = "unsigned int32 constant"; tokenNameRemap["TOKEN_INT64_CONSTANT"] = "int64 constant"; @@ -599,7 +609,22 @@ lParseInteger(bool dotdotdot) { } else { // No u or l suffix - // First, see if we can fit this into a 32-bit integer... + // If we're compiling to an 8-bit mask target and the constant + // fits into 8 bits, return an 8-bit int. + if (g->target->getMaskBitCount() == 8) { + if (yylval.intVal <= 0x7fULL) + return TOKEN_INT8_CONSTANT; + else if (yylval.intVal <= 0xffULL) + return TOKEN_UINT8_CONSTANT; + } + // And similarly for 16-bit masks and constants + if (g->target->getMaskBitCount() == 16) { + if (yylval.intVal <= 0x7fffULL) + return TOKEN_INT16_CONSTANT; + else if (yylval.intVal <= 0xffffULL) + return TOKEN_UINT16_CONSTANT; + } + // Otherwise, see if we can fit this into a 32-bit integer... 
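With the lexer changes above, the default type of an unsuffixed integer literal now depends on the target's mask width, so small constants stay at the narrower width on 8- and 16-bit-mask targets. A standalone sketch of the classification rule (a hypothetical helper mirroring the ranges used here, not ispc source):

    #include <cstdint>
    #include <cstdio>

    // Default type an unsuffixed literal gets for a target with the given
    // mask width (8, 16, or anything else -> the 32-bit rule).
    static const char *literal_type(uint64_t v, int maskBits) {
        if (maskBits == 8) {
            if (v <= 0x7fu)   return "int8";
            if (v <= 0xffu)   return "unsigned int8";
        }
        if (maskBits == 16) {
            if (v <= 0x7fffu) return "int16";
            if (v <= 0xffffu) return "unsigned int16";
        }
        if (v <= 0x7fffffffu) return "int32";
        if (v <= 0xffffffffu) return "unsigned int32";
        return "int64 or unsigned int64";
    }

    int main() {
        printf("200 on sse4-i8x16: %s\n", literal_type(200, 8));   // unsigned int8
        printf("200 on sse4-i16x8: %s\n", literal_type(200, 16));  // int16
        printf("200 on sse4-i32x4: %s\n", literal_type(200, 32));  // int32
        return 0;
    }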
if (yylval.intVal <= 0x7fffffffULL) return TOKEN_INT32_CONSTANT; else if (yylval.intVal <= 0xffffffffULL) diff --git a/llvmutil.cpp b/llvmutil.cpp index 26c18bf5..180c8676 100644 --- a/llvmutil.cpp +++ b/llvmutil.cpp @@ -115,13 +115,25 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0); LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0); - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: LLVMTypes::MaskType = LLVMTypes::BoolVectorType = llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.getVectorWidth()); - else { - Assert(target.getMaskBitCount() == 32); + break; + case 8: + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt8Ty(*ctx), target.getVectorWidth()); + break; + case 16: + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt16Ty(*ctx), target.getVectorWidth()); + break; + case 32: LLVMTypes::MaskType = LLVMTypes::BoolVectorType = llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.getVectorWidth()); + break; + default: + FATAL("Unhandled mask width for initializing MaskType"); } LLVMTypes::Int1VectorType = @@ -154,12 +166,26 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { std::vector maskOnes; llvm::Constant *onMask = NULL; - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: onMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 1, false /*unsigned*/); // 0x1 - else + break; + case 8: + onMask = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*ctx), -1, + true /*signed*/); // 0xff + break; + case 16: + onMask = llvm::ConstantInt::get(llvm::Type::getInt16Ty(*ctx), -1, + true /*signed*/); // 0xffff + break; + case 32: onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1, true /*signed*/); // 0xffffffff + break; + default: + FATAL("Unhandled mask width for onMask"); + } for (int i = 0; i < target.getVectorWidth(); ++i) maskOnes.push_back(onMask); @@ -167,13 +193,26 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { std::vector maskZeros; llvm::Constant *offMask = NULL; - if (target.getMaskBitCount() == 1) + switch (target.getMaskBitCount()) { + case 1: offMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 0, true /*signed*/); - else + break; + case 8: + offMask = llvm::ConstantInt::get(llvm::Type::getInt8Ty(*ctx), 0, + true /*signed*/); + break; + case 16: + offMask = llvm::ConstantInt::get(llvm::Type::getInt16Ty(*ctx), 0, + true /*signed*/); + break; + case 32: offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0, true /*signed*/); - + break; + default: + FATAL("Unhandled mask width for offMask"); + } for (int i = 0; i < target.getVectorWidth(); ++i) maskZeros.push_back(offMask); LLVMMaskAllOff = llvm::ConstantVector::get(maskZeros); @@ -444,9 +483,14 @@ LLVMBoolVector(bool b) { if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, b ? 0xffffffff : 0, false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int16Type, b ? 0xffff : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int8VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int8Type, b ? 
0xff : 0, + false /*unsigned*/); else { - Assert(LLVMTypes::BoolVectorType->getElementType() == - llvm::Type::getInt1Ty(*g->ctx)); + Assert(LLVMTypes::BoolVectorType == LLVMTypes::Int1VectorType); v = b ? LLVMTrue : LLVMFalse; } @@ -465,9 +509,14 @@ LLVMBoolVector(const bool *bvec) { if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, bvec[i] ? 0xffffffff : 0, false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int16Type, bvec[i] ? 0xffff : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int8VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int8Type, bvec[i] ? 0xff : 0, + false /*unsigned*/); else { - Assert(LLVMTypes::BoolVectorType->getElementType() == - llvm::Type::getInt1Ty(*g->ctx)); + Assert(LLVMTypes::BoolVectorType == LLVMTypes::Int1VectorType); v = bvec[i] ? LLVMTrue : LLVMFalse; } diff --git a/main.cpp b/main.cpp index b4a02b9f..58daa2d3 100644 --- a/main.cpp +++ b/main.cpp @@ -85,13 +85,16 @@ usage(int ret) { printf(" \t\taddressing calculations are done by default, even\n"); printf(" \t\ton 64-bit target architectures.)\n"); printf(" [--arch={%s}]\t\tSelect target architecture\n", - Target::SupportedTargetArchs()); + Target::SupportedArchs()); printf(" [--c++-include-file=]\t\tSpecify name of file to emit in #include statement in generated C++ code.\n"); #ifndef ISPC_IS_WINDOWS printf(" [--colored-output]\t\tAlways use terminal colors in error/warning messages.\n"); #endif - printf(" [--cpu=]\t\t\tSelect target CPU type\n"); - printf(" ={%s}\n", Target::SupportedTargetCPUs().c_str()); + printf(" "); + char cpuHelp[2048]; + sprintf(cpuHelp, "[--cpu=]\t\t\tSelect target CPU type\n={%s}\n", + Target::SupportedCPUs().c_str()); + PrintWithWordBreaks(cpuHelp, 16, TerminalWidth(), stdout); printf(" [-D]\t\t\t\t#define given value when running preprocessor\n"); printf(" [--dev-stub ]\t\tEmit device-side offload stub functions to file\n"); printf(" [--emit-asm]\t\t\tGenerate assembly language file as output\n"); @@ -109,7 +112,6 @@ usage(int ret) { printf(" [--math-lib=