From 1d9201fe3d3172778f2f53caaf2db58ac4ca1b99 Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Mon, 19 Dec 2011 13:46:50 -0800
Subject: [PATCH] Add "generic" 4, 8, and 16-wide targets.

When used, these targets end up with calls to undefined functions for all
of the various special vector stuff ispc needs to compile ispc programs
(masked store, gather, min/max, sqrt, etc.).  These targets are not yet
useful for anything, but are a step toward having an option to compile to
C++ code with calls out to intrinsics.

Reorganized the directory structure a bit and put the LLVM bitcode used to
define target-specific stuff (as well as some generic built-ins stuff)
into a builtins/ directory.

Note that for building on Windows, it's now necessary to set an
LLVM_VERSION environment variable (with values like LLVM_2_9, LLVM_3_0,
LLVM_3_1svn, etc.)
---
 Makefile                                     |  64 +-
 bitcode2cpp.py                               |   3 +-
 builtins.cpp                                 |  51 +-
 builtins-c.c => builtins/builtins.c          |   0
 builtins-dispatch.ll => builtins/dispatch.ll |   0
 .../target-avx-common.ll                     |   3 +
 .../target-avx-x2.ll                         |  14 +-
 builtins-avx.ll => builtins/target-avx.ll    |  14 +-
 builtins/target-generic-16.ll                |  34 +
 builtins/target-generic-4.ll                 |  34 +
 builtins/target-generic-8.ll                 |  34 +
 builtins/target-generic-common.ll            | 277 ++++++
 .../target-sse2-common.ll                    |   3 +
 .../target-sse2-x2.ll                        |  14 +-
 builtins-sse2.ll => builtins/target-sse2.ll  |  14 +-
 .../target-sse4-common.ll                    |   3 +
 .../target-sse4-x2.ll                        |  14 +-
 builtins-sse4.ll => builtins/target-sse4.ll  |  14 +-
 builtins.m4 => builtins/util.m4              | 847 +++++++++---------
 ctx.cpp                                      |  10 +-
 func.cpp                                     |   5 +-
 ispc.cpp                                     |  52 +-
 ispc.h                                       |  19 +-
 ispc.vcxproj                                 | 165 ++--
 llvmutil.cpp                                 |  27 +-
 module.cpp                                   |  22 +-
 opt.cpp                                      |   2 +-
 parse.yy                                     |   3 +-
 stdlib.ispc                                  | 144 +--
 stdlib2cpp.py                                |   4 +-
 stmt.cpp                                     |   8 +-
 31 files changed, 1249 insertions(+), 649 deletions(-)
 rename builtins-c.c => builtins/builtins.c (100%)
 rename builtins-dispatch.ll => builtins/dispatch.ll (100%)
 rename builtins-avx-common.ll => builtins/target-avx-common.ll (99%)
 rename builtins-avx-x2.ll => builtins/target-avx-x2.ll (99%)
 rename builtins-avx.ll => builtins/target-avx.ll (99%)
 create mode 100644 builtins/target-generic-16.ll
 create mode 100644 builtins/target-generic-4.ll
 create mode 100644 builtins/target-generic-8.ll
 create mode 100644 builtins/target-generic-common.ll
 rename builtins-sse2-common.ll => builtins/target-sse2-common.ll (99%)
 rename builtins-sse2-x2.ll => builtins/target-sse2-x2.ll (99%)
 rename builtins-sse2.ll => builtins/target-sse2.ll (99%)
 rename builtins-sse4-common.ll => builtins/target-sse4-common.ll (99%)
 rename builtins-sse4-x2.ll => builtins/target-sse4-x2.ll (99%)
 rename builtins-sse4.ll => builtins/target-sse4.ll (99%)

diff --git a/Makefile b/Makefile
index 54734f39..f2e18543 100644
--- a/Makefile
+++ b/Makefile
@@ -62,14 +62,17 @@ CXX_SRC=ast.cpp builtins.cpp ctx.cpp decl.cpp expr.cpp func.cpp ispc.cpp \
 	util.cpp
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
-BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll builtins-sse2-x2.ll \
-	builtins-sse4.ll builtins-sse4-x2.ll builtins-dispatch.ll
+TARGETS=avx avx-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 generic-16
+BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \
+	builtins/dispatch.ll
+BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) \
+	builtins-c-32.cpp builtins-c-64.cpp
 BISON_SRC=parse.yy
 FLEX_SRC=lex.ll

-OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o)
$(BUILTINS_SRC:.ll=.o) \ - builtins-c-32.o builtins-c-64.o stdlib_ispc.o $(BISON_SRC:.yy=.o) \ - $(FLEX_SRC:.ll=.o)) +OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \ + stdlib_generic_ispc.o stdlib_x86_ispc.o \ + $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o)) default: ispc @@ -104,6 +107,10 @@ objs/%.o: %.cpp @echo Compiling $< @$(CXX) $(CXXFLAGS) -o $@ -c $< +objs/%.o: objs/%.cpp + @echo Compiling $< + @$(CXX) $(CXXFLAGS) -o $@ -c $< + objs/parse.cc: parse.yy @echo Running bison on $< @$(YACC) -o $@ $< @@ -120,41 +127,24 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc @echo Compiling $< @$(CXX) $(CXXFLAGS) -o $@ -c $< -objs/builtins-%.cpp: builtins-%.ll - @echo Creating C++ source from builtin definitions file $< - @m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@ - -objs/builtins-%.o: objs/builtins-%.cpp - @echo Compiling $< - @$(CXX) $(CXXFLAGS) -o $@ -c $< - -objs/builtins-c-32.cpp: builtins-c.c +objs/builtins-%.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< - @$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-32.c > $@ + @m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) $< | ./bitcode2cpp.py $< > $@ -objs/builtins-c-32.o: objs/builtins-c-32.cpp - @echo Compiling $< - @$(CXX) $(CXXFLAGS) -o $@ -c $< - -objs/builtins-c-64.cpp: builtins-c.c +objs/builtins-c-32.cpp: builtins/builtins.c @echo Creating C++ source from builtins definition file $< - @$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-64.c > $@ + @$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py c-32 > $@ -objs/builtins-c-64.o: objs/builtins-c-64.cpp - @echo Compiling $< - @$(CXX) $(CXXFLAGS) -o $@ -c $< +objs/builtins-c-64.cpp: builtins/builtins.c + @echo Creating C++ source from builtins definition file $< + @$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py c-64 > $@ -objs/stdlib_ispc.cpp: stdlib.ispc - @echo Creating C++ source from $< - @$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | ./stdlib2cpp.py > $@ +objs/stdlib_generic_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for generic + @$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + ./stdlib2cpp.py generic > $@ -objs/stdlib_ispc.o: objs/stdlib_ispc.cpp - @echo Compiling $< - @$(CXX) $(CXXFLAGS) -o $@ -c $< - -objs/builtins-sse2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2.ll -objs/builtins-sse2-x2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2-x2.ll -objs/builtins-sse4.cpp: builtins.m4 builtins-sse4-common.ll builtins-sse4.ll -objs/builtins-sse4-x2.cpp: builtins.m4 builtins-sse4-common.ll builtins-sse4-x2.ll -objs/builtins-avx.cpp: builtins.m4 builtins-avx-common.ll builtins-avx.ll -objs/builtins-avx-x2.cpp: builtins.m4 builtins-avx-common.ll builtins-avx-x2.ll +objs/stdlib_x86_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for x86 + @$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \ + ./stdlib2cpp.py x86 > $@ diff --git a/bitcode2cpp.py b/bitcode2cpp.py index fa7d4782..a1a5d2bf 100755 --- a/bitcode2cpp.py +++ b/bitcode2cpp.py @@ -11,7 +11,8 @@ length=0 src=str(sys.argv[1]) -target = re.sub(".*builtins-", "", src) +target = re.sub("builtins/target-", "", src) +target = re.sub("builtins/", "", target) target = re.sub("\.ll$", "", target) target = re.sub("\.c$", "", target) target = re.sub("-", "_", target) diff --git a/builtins.cpp b/builtins.cpp index 5358e789..9bd41e8f 100644 --- 
a/builtins.cpp +++ b/builtins.cpp @@ -99,6 +99,9 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) { return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64; // varying + if (LLVMTypes::MaskType != LLVMTypes::Int32VectorType && + t == LLVMTypes::MaskType) + return AtomicType::VaryingBool; else if (t == LLVMTypes::Int8VectorType) return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8; else if (t == LLVMTypes::Int16VectorType) @@ -194,7 +197,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) { // symbol creation code below assumes that any LLVM vector of i32s is a // varying int32. Here, we need that to be interpreted as a varying // bool, so just have a one-off override for that one... - if (name == "__sext_varying_bool") { + if (g->target.maskBitCount != 1 && name == "__sext_varying_bool") { const Type *returnType = AtomicType::VaryingInt32; std::vector argTypes; argTypes.push_back(AtomicType::VaryingBool); @@ -556,7 +559,7 @@ lSetInternalFunctions(llvm::Module *module) { int count = sizeof(names) / sizeof(names[0]); for (int i = 0; i < count; ++i) { llvm::Function *f = module->getFunction(names[i]); - if (f != NULL) + if (f != NULL && f->empty() == false) f->setLinkage(llvm::GlobalValue::InternalLinkage); } } @@ -744,6 +747,33 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod FATAL("logic error in DefineStdlib"); } break; + case Target::GENERIC: + switch (g->target.vectorWidth) { + case 4: + extern unsigned char builtins_bitcode_generic_4[]; + extern int builtins_bitcode_generic_4_length; + AddBitcodeToModule(builtins_bitcode_generic_4, + builtins_bitcode_generic_4_length, + module, symbolTable); + break; + case 8: + extern unsigned char builtins_bitcode_generic_8[]; + extern int builtins_bitcode_generic_8_length; + AddBitcodeToModule(builtins_bitcode_generic_8, + builtins_bitcode_generic_8_length, + module, symbolTable); + break; + case 16: + extern unsigned char builtins_bitcode_generic_16[]; + extern int builtins_bitcode_generic_16_length; + AddBitcodeToModule(builtins_bitcode_generic_16, + builtins_bitcode_generic_16_length, + module, symbolTable); + break; + default: + FATAL("logic error in DefineStdlib"); + } + break; default: FATAL("logic error"); } @@ -771,11 +801,16 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod if (includeStdlibISPC) { // If the user wants the standard library to be included, parse the // serialized version of the stdlib.ispc file to get its - // definitions added. Disable emission of performance warnings for - // now, since the user doesn't care about any of that in the stdlib - // implementation... - extern char stdlib_code[]; - yy_scan_string(stdlib_code); - yyparse(); + // definitions added. 
+ if (g->target.isa == Target::GENERIC) { + extern char stdlib_generic_code[]; + yy_scan_string(stdlib_generic_code); + yyparse(); + } + else { + extern char stdlib_x86_code[]; + yy_scan_string(stdlib_x86_code); + yyparse(); + } } } diff --git a/builtins-c.c b/builtins/builtins.c similarity index 100% rename from builtins-c.c rename to builtins/builtins.c diff --git a/builtins-dispatch.ll b/builtins/dispatch.ll similarity index 100% rename from builtins-dispatch.ll rename to builtins/dispatch.ll diff --git a/builtins-avx-common.ll b/builtins/target-avx-common.ll similarity index 99% rename from builtins-avx-common.ll rename to builtins/target-avx-common.ll index 6b08466d..07fb12b4 100644 --- a/builtins-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -32,6 +32,9 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; AVX target implementation. +ctlztz() +define_prefetches() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins-avx-x2.ll b/builtins/target-avx-x2.ll similarity index 99% rename from builtins-avx-x2.ll rename to builtins/target-avx-x2.ll index 6254c405..90e2680c 100644 --- a/builtins-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -32,12 +32,16 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Basic 16-wide definitions -stdlib_core(16) -packed_load_and_store(16) -scans(16) -int64minmax(16) +define(`WIDTH',`16') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-avx-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-avx-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins-avx.ll b/builtins/target-avx.ll similarity index 99% rename from builtins-avx.ll rename to builtins/target-avx.ll index a00a527e..dc7339bd 100644 --- a/builtins-avx.ll +++ b/builtins/target-avx.ll @@ -32,12 +32,16 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Basic 8-wide definitions -stdlib_core(8) -packed_load_and_store(8) -scans(8) -int64minmax(8) +define(`WIDTH',`8') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-avx-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-avx-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-generic-16.ll b/builtins/target-generic-16.ll new file mode 100644 index 00000000..807fd242 --- /dev/null +++ b/builtins/target-generic-16.ll @@ -0,0 +1,34 @@ +;; Copyright (c) 2010-2011, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. 
+;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`16') +include(`target-generic-common.ll') + diff --git a/builtins/target-generic-4.ll b/builtins/target-generic-4.ll new file mode 100644 index 00000000..7eb1f300 --- /dev/null +++ b/builtins/target-generic-4.ll @@ -0,0 +1,34 @@ +;; Copyright (c) 2010-2011, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`4') +include(`target-generic-common.ll') + diff --git a/builtins/target-generic-8.ll b/builtins/target-generic-8.ll new file mode 100644 index 00000000..bd9261ff --- /dev/null +++ b/builtins/target-generic-8.ll @@ -0,0 +1,34 @@ +;; Copyright (c) 2010-2011, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
+;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`8') +include(`target-generic-common.ll') + diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll new file mode 100644 index 00000000..b59e8d53 --- /dev/null +++ b/builtins/target-generic-common.ll @@ -0,0 +1,277 @@ +;; Copyright (c) 2010-2011, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+define(`MASK',`i1')
+include(`util.m4')
+
+stdlib_core()
+
+scans()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; math
+
+declare void @__fastmath() nounwind
+
+;; round/floor/ceil
+
+declare float @__round_uniform_float(float) nounwind readnone
+declare float @__floor_uniform_float(float) nounwind readnone
+declare float @__ceil_uniform_float(float) nounwind readnone
+
+declare double @__round_uniform_double(double) nounwind readnone
+declare double @__floor_uniform_double(double) nounwind readnone
+declare double @__ceil_uniform_double(double) nounwind readnone
+
+declare <WIDTH x float> @__round_varying_float(<WIDTH x float>) nounwind readnone
+declare <WIDTH x float> @__floor_varying_float(<WIDTH x float>) nounwind readnone
+declare <WIDTH x float> @__ceil_varying_float(<WIDTH x float>) nounwind readnone
+declare <WIDTH x double> @__round_varying_double(<WIDTH x double>) nounwind readnone
+declare <WIDTH x double> @__floor_varying_double(<WIDTH x double>) nounwind readnone
+declare <WIDTH x double> @__ceil_varying_double(<WIDTH x double>) nounwind readnone
+
+;; min/max
+
+declare float @__max_uniform_float(float, float) nounwind readnone
+declare float @__min_uniform_float(float, float) nounwind readnone
+declare i32 @__min_uniform_int32(i32, i32) nounwind readnone
+declare i32 @__max_uniform_int32(i32, i32) nounwind readnone
+declare i32 @__min_uniform_uint32(i32, i32) nounwind readnone
+declare i32 @__max_uniform_uint32(i32, i32) nounwind readnone
+declare i64 @__min_uniform_int64(i64, i64) nounwind readnone
+declare i64 @__max_uniform_int64(i64, i64) nounwind readnone
+declare i64 @__min_uniform_uint64(i64, i64) nounwind readnone
+declare i64 @__max_uniform_uint64(i64, i64) nounwind readnone
+declare double @__min_uniform_double(double, double) nounwind readnone
+declare double @__max_uniform_double(double, double) nounwind readnone
+
+declare <WIDTH x float> @__max_varying_float(<WIDTH x float>,
+                                             <WIDTH x float>) nounwind readnone
+declare <WIDTH x float> @__min_varying_float(<WIDTH x float>,
+                                             <WIDTH x float>) nounwind readnone
+declare <WIDTH x i32> @__min_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
+declare <WIDTH x i32> @__max_varying_int32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
+declare <WIDTH x i32> @__min_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
+declare <WIDTH x i32> @__max_varying_uint32(<WIDTH x i32>, <WIDTH x i32>) nounwind readnone
+declare <WIDTH x i64> @__min_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
+declare <WIDTH x i64> @__max_varying_int64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
+declare <WIDTH x i64> @__min_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
+declare <WIDTH x i64> @__max_varying_uint64(<WIDTH x i64>, <WIDTH x i64>) nounwind readnone
+declare <WIDTH x double> @__min_varying_double(<WIDTH x double>,
+                                               <WIDTH x double>) nounwind readnone
+declare <WIDTH x double> @__max_varying_double(<WIDTH x double>,
+                                               <WIDTH x double>) nounwind readnone
+
+;; sqrt/rsqrt/rcp
+
+declare float @__rsqrt_uniform_float(float) nounwind readnone
+declare float @__rcp_uniform_float(float) nounwind readnone
+declare float @__sqrt_uniform_float(float) nounwind readnone
+declare <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readnone
+declare <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float> %v) nounwind readnone
+declare <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone
+
+declare double @__sqrt_uniform_double(double) nounwind readnone
+declare <WIDTH x double> @__sqrt_varying_double(<WIDTH x double>) nounwind readnone
+
+;; bit ops
+
+declare i32 @__popcnt_int32(i32) nounwind readnone
+declare i64 @__popcnt_int64(i64) nounwind readnone
+
+declare i32 @__count_trailing_zeros_i32(i32) nounwind readnone
+declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone
+declare i32 @__count_leading_zeros_i32(i32) nounwind readnone
+declare i64 @__count_leading_zeros_i64(i64) nounwind readnone
+
+;; svml
+
+; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
+; or, use the macro to call the 4-wide ones twice with our 8-wide
+; vectors...
+
+declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
+declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
+declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
+declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
+declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
+declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
+declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
+declare <WIDTH x float> @__svml_log(<WIDTH x float>)
+declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; reductions
+
+declare i32 @__movmsk(<WIDTH x MASK>) nounwind readnone
+
+declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
+declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone
+declare float @__reduce_max_float(<WIDTH x float>) nounwind readnone
+
+declare i32 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone
+declare i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone
+declare i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone
+
+declare i32 @__reduce_add_uint32(<WIDTH x i32> %v) nounwind readnone
+declare i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone
+declare i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone
+
+declare double @__reduce_add_double(<WIDTH x double>) nounwind readnone
+declare double @__reduce_min_double(<WIDTH x double>) nounwind readnone
+declare double @__reduce_max_double(<WIDTH x double>) nounwind readnone
+
+declare i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone
+declare i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone
+declare i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone
+
+declare i64 @__reduce_add_uint64(<WIDTH x i64> %v) nounwind readnone
+declare i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone
+declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone
+
+declare i1 @__reduce_equal_int32(<WIDTH x i32> %v, i32 * nocapture %samevalue,
+                                 <WIDTH x MASK> %mask) nounwind
+declare i1 @__reduce_equal_float(<WIDTH x float> %v, float * nocapture %samevalue,
+                                 <WIDTH x MASK> %mask) nounwind
+declare i1 @__reduce_equal_int64(<WIDTH x i64> %v, i64 * nocapture %samevalue,
+                                 <WIDTH x MASK> %mask) nounwind
+declare i1 @__reduce_equal_double(<WIDTH x double> %v, double * nocapture %samevalue,
+                                  <WIDTH x MASK> %mask) nounwind
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+load_and_broadcast(WIDTH, i8, 8)
+load_and_broadcast(WIDTH, i16, 16)
+load_and_broadcast(WIDTH, i32, 32)
+load_and_broadcast(WIDTH, i64, 64)
+
+declare <WIDTH x i8> @__load_masked_8(i8 * nocapture, <WIDTH x MASK> %mask) nounwind readonly
+declare <WIDTH x i16> @__load_masked_16(i8 * nocapture, <WIDTH x MASK> %mask) nounwind readonly
+declare <WIDTH x i32> @__load_masked_32(i8 * nocapture, <WIDTH x MASK> %mask) nounwind readonly
+declare <WIDTH x i64> @__load_masked_64(i8 * nocapture, <WIDTH x MASK> %mask) nounwind readonly
+
+declare void @__masked_store_8(<WIDTH x i8> * nocapture, <WIDTH x i8>,
+                               <WIDTH x MASK>) nounwind
+declare void @__masked_store_16(<WIDTH x i16> * nocapture, <WIDTH x i16>,
+                                <WIDTH x MASK>) nounwind
+declare void @__masked_store_32(<WIDTH x i32> * nocapture, <WIDTH x i32>,
+                                <WIDTH x MASK>) nounwind
+declare void @__masked_store_64(<WIDTH x i64> * nocapture, <WIDTH x i64>,
+                                <WIDTH x MASK> %mask) nounwind
+
+ifelse(LLVM_VERSION,LLVM_3_1svn,`
+define void @__masked_store_blend_8(<WIDTH x i8> * nocapture, <WIDTH x i8>,
+                                    <WIDTH x MASK>) nounwind {
+  %v = load <WIDTH x i8> * %0
+  %v1 = select <WIDTH x MASK> %2, <WIDTH x i8> %1, <WIDTH x i8> %v
+  store <WIDTH x i8> %v1, <WIDTH x i8> * %0
+  ret void
+}
+
+define void @__masked_store_blend_16(<WIDTH x i16> * nocapture, <WIDTH x i16>,
+                                     <WIDTH x MASK>) nounwind {
+  %v = load <WIDTH x i16> * %0
+  %v1 = select <WIDTH x MASK> %2, <WIDTH x i16> %1, <WIDTH x i16> %v
+  store <WIDTH x i16> %v1, <WIDTH x i16> * %0
+  ret void
+}
+
+define void @__masked_store_blend_32(<WIDTH x i32> * nocapture, <WIDTH x i32>,
+                                     <WIDTH x MASK>) nounwind {
+  %v = load <WIDTH x i32> * %0
+  %v1 = select <WIDTH x MASK> %2, <WIDTH x i32> %1, <WIDTH x i32> %v
+  store <WIDTH x i32> %v1, <WIDTH x i32> * %0
+  ret void
+}
+
+define void @__masked_store_blend_64(<WIDTH x i64> * nocapture,
+                                     <WIDTH x i64>, <WIDTH x MASK>) nounwind {
+  %v = load <WIDTH x i64> * %0
+  %v1 = select <WIDTH x MASK> %2, <WIDTH x i64> %1, <WIDTH x i64> %v
+  store <WIDTH x i64> %v1, <WIDTH x i64> * %0
+  ret void
+}
+',`
+declare void @__masked_store_blend_8(<WIDTH x i8> * nocapture, <WIDTH x i8>,
+                                     <WIDTH x MASK>) nounwind
+declare void @__masked_store_blend_16(<WIDTH x i16> * nocapture, <WIDTH x i16>,
+                                      <WIDTH x MASK>) nounwind
+declare void @__masked_store_blend_32(<WIDTH x i32> * nocapture, <WIDTH x i32>,
+                                      <WIDTH x MASK>) nounwind
+declare void @__masked_store_blend_64(<WIDTH x i64> * nocapture %ptr,
+                                      <WIDTH x i64> %new,
+                                      <WIDTH x MASK> %mask) nounwind
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather/scatter
+
+define(`gather_scatter', `
+declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture %ptr, <WIDTH x i32> %offsets,
+                             i32 %offset_scale, <WIDTH x MASK> %vecmask) nounwind readonly
+declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture %ptr, <WIDTH x i64> %offsets,
+                             i32 %offset_scale, <WIDTH x MASK> %vecmask) nounwind readonly
+declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32> %ptrs,
+                                    <WIDTH x MASK> %vecmask) nounwind readonly
+declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64> %ptrs,
+                                    <WIDTH x MASK> %vecmask) nounwind readonly
+
+declare void @__scatter_base_offsets32_$1(i8* nocapture %base, <WIDTH x i32> %offsets,
+                   i32 %offset_scale, <WIDTH x $1> %values, <WIDTH x MASK> %mask) nounwind
+declare void @__scatter_base_offsets64_$1(i8* nocapture %base, <WIDTH x i64> %offsets,
+                   i32 %offset_scale, <WIDTH x $1> %values, <WIDTH x MASK> %mask) nounwind
+declare void @__scatter32_$1(<WIDTH x i32> %ptrs, <WIDTH x $1> %values,
+                             <WIDTH x MASK> %mask) nounwind
+declare void @__scatter64_$1(<WIDTH x i64> %ptrs, <WIDTH x $1> %values,
+                             <WIDTH x MASK> %mask) nounwind
+')
+
+gather_scatter(i8)
+gather_scatter(i16)
+gather_scatter(i32)
+gather_scatter(i64)
+
+declare i32 @__packed_load_active(i32 * nocapture %startptr, <WIDTH x i32> * nocapture %val_ptr,
+                                  <WIDTH x MASK> %full_mask) nounwind
+declare i32 @__packed_store_active(i32 * %startptr, <WIDTH x i32> %vals,
+                                   <WIDTH x MASK> %full_mask) nounwind
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; prefetch
+
+declare void @__prefetch_read_uniform_1(i8 *) nounwind readnone
+declare void @__prefetch_read_uniform_2(i8 *) nounwind readnone
+declare void @__prefetch_read_uniform_3(i8 *) nounwind readnone
+declare void @__prefetch_read_uniform_nt(i8 *) nounwind readnone
+
diff --git a/builtins-sse2-common.ll b/builtins/target-sse2-common.ll
similarity index 99%
rename from builtins-sse2-common.ll
rename to builtins/target-sse2-common.ll
index 659bdda7..80c34afb 100644
--- a/builtins-sse2-common.ll
+++ b/builtins/target-sse2-common.ll
@@ -29,6 +29,9 @@
 ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ctlztz() +define_prefetches() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins-sse2-x2.ll b/builtins/target-sse2-x2.ll similarity index 99% rename from builtins-sse2-x2.ll rename to builtins/target-sse2-x2.ll index b5eaa889..a9d71ea9 100644 --- a/builtins-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -36,12 +36,16 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; standard 8-wide definitions from m4 macros -stdlib_core(8) -packed_load_and_store(8) -scans(8) -int64minmax(8) +define(`WIDTH',`8') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-sse2-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse2-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins-sse2.ll b/builtins/target-sse2.ll similarity index 99% rename from builtins-sse2.ll rename to builtins/target-sse2.ll index c49d6b2c..1a297199 100644 --- a/builtins-sse2.ll +++ b/builtins/target-sse2.ll @@ -33,12 +33,16 @@ ;; Define the standard library builtins for the SSE2 target ; Define some basics for a 4-wide target -stdlib_core(4) -packed_load_and_store(4) -scans(4) -int64minmax(4) +define(`WIDTH',`4') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-sse2-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse2-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding diff --git a/builtins-sse4-common.ll b/builtins/target-sse4-common.ll similarity index 99% rename from builtins-sse4-common.ll rename to builtins/target-sse4-common.ll index f1ee95dc..19d31ce4 100644 --- a/builtins-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -29,6 +29,9 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ctlztz() +define_prefetches() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins-sse4-x2.ll b/builtins/target-sse4-x2.ll similarity index 99% rename from builtins-sse4-x2.ll rename to builtins/target-sse4-x2.ll index fd399884..764f8613 100644 --- a/builtins-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -36,12 +36,16 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; standard 8-wide definitions from m4 macros -stdlib_core(8) -packed_load_and_store(8) -scans(8) -int64minmax(8) +define(`WIDTH',`8') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-sse4-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins-sse4.ll b/builtins/target-sse4.ll similarity index 99% rename from builtins-sse4.ll rename to builtins/target-sse4.ll index 68c44a0e..7eadde4b 100644 --- a/builtins-sse4.ll +++ b/builtins/target-sse4.ll @@ -33,12 +33,16 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Define common 4-wide stuff -stdlib_core(4) -packed_load_and_store(4) -scans(4) -int64minmax(4) +define(`WIDTH',`4') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-sse4-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins.m4 b/builtins/util.m4 similarity index 82% rename from builtins.m4 rename to builtins/util.m4 index f83bdbff..8853e81c 100644 --- a/builtins.m4 +++ b/builtins/util.m4 @@ -550,103 +550,103 @@ divert`'dnl ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; stdlib_core ;; -;; This macro defines a bunch of helper routines that only depend on the -;; target's vector width, which it takes as its first parameter. 
+;; This macro defines a bunch of helper routines that depend on the +;; target's vector width ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; define(`shuffles', ` -define <$1 x $2> @__broadcast_$3(<$1 x $2>, i32) nounwind readnone alwaysinline { - %v = extractelement <$1 x $2> %0, i32 %1 - %r_0 = insertelement <$1 x $2> undef, $2 %v, i32 0 -forloop(i, 1, eval($1-1), ` %r_`'i = insertelement <$1 x $2> %r_`'eval(i-1), $2 %v, i32 i +define @__broadcast_$2(, i32) nounwind readnone alwaysinline { + %v = extractelement %0, i32 %1 + %r_0 = insertelement undef, $1 %v, i32 0 +forloop(i, 1, eval(WIDTH-1), ` %r_`'i = insertelement %r_`'eval(i-1), $1 %v, i32 i ') - ret <$1 x $2> %r_`'eval($1-1) + ret %r_`'eval(WIDTH-1) } -define <$1 x $2> @__rotate_$3(<$1 x $2>, i32) nounwind readnone alwaysinline { +define @__rotate_$2(, i32) nounwind readnone alwaysinline { %isc = call i1 @__is_compile_time_constant_uniform_int32(i32 %1) br i1 %isc, label %is_const, label %not_const is_const: ; though verbose, this turms into tight code if %1 is a constant -forloop(i, 0, eval($1-1), ` +forloop(i, 0, eval(WIDTH-1), ` %delta_`'i = add i32 %1, i - %delta_clamped_`'i = and i32 %delta_`'i, eval($1-1) - %v_`'i = extractelement <$1 x $2> %0, i32 %delta_clamped_`'i') + %delta_clamped_`'i = and i32 %delta_`'i, eval(WIDTH-1) + %v_`'i = extractelement %0, i32 %delta_clamped_`'i') - %ret_0 = insertelement <$1 x $2> undef, $2 %v_0, i32 0 -forloop(i, 1, eval($1-1), ` %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1), $2 %v_`'i, i32 i + %ret_0 = insertelement undef, $1 %v_0, i32 0 +forloop(i, 1, eval(WIDTH-1), ` %ret_`'i = insertelement %ret_`'eval(i-1), $1 %v_`'i, i32 i ') - ret <$1 x $2> %ret_`'eval($1-1) + ret %ret_`'eval(WIDTH-1) not_const: ; store two instances of the vector into memory - %ptr = alloca <$1 x $2>, i32 2 - %ptr0 = getelementptr <$1 x $2> * %ptr, i32 0 - store <$1 x $2> %0, <$1 x $2> * %ptr0 - %ptr1 = getelementptr <$1 x $2> * %ptr, i32 1 - store <$1 x $2> %0, <$1 x $2> * %ptr1 + %ptr = alloca , i32 2 + %ptr0 = getelementptr * %ptr, i32 0 + store %0, * %ptr0 + %ptr1 = getelementptr * %ptr, i32 1 + store %0, * %ptr1 ; compute offset in [0,vectorwidth-1], then index into the doubled-up vector - %offset = and i32 %1, eval($1-1) - %ptr_as_elt_array = bitcast <$1 x $2> * %ptr to [eval(2*$1) x $2] * - %load_ptr = getelementptr [eval(2*$1) x $2] * %ptr_as_elt_array, i32 0, i32 %offset - %load_ptr_vec = bitcast $2 * %load_ptr to <$1 x $2> * - %result = load <$1 x $2> * %load_ptr_vec, align $4 - ret <$1 x $2> %result + %offset = and i32 %1, eval(WIDTH-1) + %ptr_as_elt_array = bitcast * %ptr to [eval(2*WIDTH) x $1] * + %load_ptr = getelementptr [eval(2*WIDTH) x $1] * %ptr_as_elt_array, i32 0, i32 %offset + %load_ptr_vec = bitcast $1 * %load_ptr to * + %result = load * %load_ptr_vec, align $3 + ret %result } -define <$1 x $2> @__shuffle_$3(<$1 x $2>, <$1 x i32>) nounwind readnone alwaysinline { -forloop(i, 0, eval($1-1), ` - %index_`'i = extractelement <$1 x i32> %1, i32 i') -forloop(i, 0, eval($1-1), ` - %v_`'i = extractelement <$1 x $2> %0, i32 %index_`'i') +define @__shuffle_$2(, ) nounwind readnone alwaysinline { +forloop(i, 0, eval(WIDTH-1), ` + %index_`'i = extractelement %1, i32 i') +forloop(i, 0, eval(WIDTH-1), ` + %v_`'i = extractelement %0, i32 %index_`'i') - %ret_0 = insertelement <$1 x $2> undef, $2 %v_0, i32 0 -forloop(i, 1, eval($1-1), ` %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1), $2 %v_`'i, i32 i + %ret_0 = insertelement undef, $1 %v_0, i32 0 +forloop(i, 1, 
eval(WIDTH-1), ` %ret_`'i = insertelement %ret_`'eval(i-1), $1 %v_`'i, i32 i ') - ret <$1 x $2> %ret_`'eval($1-1) + ret %ret_`'eval(WIDTH-1) } -define <$1 x $2> @__shuffle2_$3(<$1 x $2>, <$1 x $2>, <$1 x i32>) nounwind readnone alwaysinline { - %v2 = shufflevector <$1 x $2> %0, <$1 x $2> %1, < - forloop(i, 0, eval(2*$1-2), `i32 i, ') i32 eval(2*$1-1) +define @__shuffle2_$2(, , ) nounwind readnone alwaysinline { + %v2 = shufflevector %0, %1, < + forloop(i, 0, eval(2*WIDTH-2), `i32 i, ') i32 eval(2*WIDTH-1) > -forloop(i, 0, eval($1-1), ` - %index_`'i = extractelement <$1 x i32> %2, i32 i') +forloop(i, 0, eval(WIDTH-1), ` + %index_`'i = extractelement %2, i32 i') - %isc = call i1 @__is_compile_time_constant_varying_int32(<$1 x i32> %2) + %isc = call i1 @__is_compile_time_constant_varying_int32( %2) br i1 %isc, label %is_const, label %not_const is_const: ; extract from the requested lanes and insert into the result; LLVM turns ; this into good code in the end -forloop(i, 0, eval($1-1), ` - %v_`'i = extractelement %v2, i32 %index_`'i') +forloop(i, 0, eval(WIDTH-1), ` + %v_`'i = extractelement %v2, i32 %index_`'i') - %ret_0 = insertelement <$1 x $2> undef, $2 %v_0, i32 0 -forloop(i, 1, eval($1-1), ` %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1), $2 %v_`'i, i32 i + %ret_0 = insertelement undef, $1 %v_0, i32 0 +forloop(i, 1, eval(WIDTH-1), ` %ret_`'i = insertelement %ret_`'eval(i-1), $1 %v_`'i, i32 i ') - ret <$1 x $2> %ret_`'eval($1-1) + ret %ret_`'eval(WIDTH-1) not_const: ; otherwise store the two vectors onto the stack and then use the given ; permutation vector to get indices into that array... - %ptr = alloca - store %v2, * %ptr - %baseptr = bitcast * %ptr to $2 * + %ptr = alloca + store %v2, * %ptr + %baseptr = bitcast * %ptr to $1 * - %ptr_0 = getelementptr $2 * %baseptr, i32 %index_0 - %val_0 = load $2 * %ptr_0 - %result_0 = insertelement <$1 x $2> undef, $2 %val_0, i32 0 + %ptr_0 = getelementptr $1 * %baseptr, i32 %index_0 + %val_0 = load $1 * %ptr_0 + %result_0 = insertelement undef, $1 %val_0, i32 0 -forloop(i, 1, eval($1-1), ` - %ptr_`'i = getelementptr $2 * %baseptr, i32 %index_`'i - %val_`'i = load $2 * %ptr_`'i - %result_`'i = insertelement <$1 x $2> %result_`'eval(i-1), $2 %val_`'i, i32 i +forloop(i, 1, eval(WIDTH-1), ` + %ptr_`'i = getelementptr $1 * %baseptr, i32 %index_`'i + %val_`'i = load $1 * %ptr_`'i + %result_`'i = insertelement %result_`'eval(i-1), $1 %val_`'i, i32 i ') - ret <$1 x $2> %result_`'eval($1-1) + ret %result_`'eval(WIDTH-1) } ') @@ -676,18 +676,20 @@ forloop(i, 1, eval($1-1), ` define(`global_atomic_associative', ` define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, - <$1 x i32> %m) nounwind alwaysinline { + <$1 x MASK> %m) nounwind alwaysinline { ; first, for any lanes where the mask is off, compute a vector where those lanes ; hold the identity value.. ; for the bit tricks below, we need the mask to be sign extended to be ; the size of the element type. - ifelse($3, `i64', `%mask = sext <$1 x i32> %m to <$1 x i64>') - ifelse($3, `i32', ` - ; silly workaround to do %mask = %m, which is not possible directly.. - %maskmem = alloca <$1 x i32> - store <$1 x i32> %m, <$1 x i32> * %maskmem - %mask = load <$1 x i32> * %maskmem' + ifelse( + MASK,i1,`%mask = sext <$1 x MASK> %m to <$1 x $3>', + $3,i64, `%mask = sext <$1 x MASK> %m to <$1 x i64>', + $3,i32, ` + ; silly workaround to do %mask = %m, which is not possible directly.. 
+ %maskmem = alloca <$1 x i32> + store <$1 x i32> %m, <$1 x i32> * %maskmem + %mask = load <$1 x i32> * %maskmem' ) ; zero out any lanes that are off %valoff = and <$1 x $3> %val, %mask @@ -751,13 +753,13 @@ ifelse(LLVM_VERSION, `LLVM_2_9',` declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta) define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val, - <$1 x i32> %mask) nounwind alwaysinline { + <$1 x MASK> %mask) nounwind alwaysinline { %r = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %val) ret $3 %r } ', ` define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val, - <$1 x i32> %mask) nounwind alwaysinline { + <$1 x MASK> %mask) nounwind alwaysinline { %r = atomicrmw $2 $3 * %ptr, $3 %val seq_cst ret $3 %r } @@ -778,11 +780,11 @@ declare i64 @llvm.atomic.swap.i64.p0i64(i64 * %ptr, i64 %val)') define(`global_swap', ` define <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val, - <$1 x i32> %mask) nounwind alwaysinline { + <$1 x MASK> %mask) nounwind alwaysinline { %rptr = alloca <$1 x $2> %rptr32 = bitcast <$1 x $2> * %rptr to $2 * - per_lane($1, <$1 x i32> %mask, ` + per_lane($1, <$1 x MASK> %mask, ` %val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE ifelse(LLVM_VERSION, `LLVM_2_9',` %r_LANE_ID = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val_LANE_ID)', ` @@ -795,7 +797,7 @@ ifelse(LLVM_VERSION, `LLVM_2_9',` } define $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val, - <$1 x i32> %mask) nounwind alwaysinline { + <$1 x MASK> %mask) nounwind alwaysinline { ifelse(LLVM_VERSION, `LLVM_2_9',` %r = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)', ` %r = atomicrmw xchg $2 * %ptr, $2 %val seq_cst') @@ -816,11 +818,11 @@ ifelse(LLVM_VERSION, `LLVM_2_9',` declare $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)') define <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp, - <$1 x $2> %val, <$1 x i32> %mask) nounwind alwaysinline { + <$1 x $2> %val, <$1 x MASK> %mask) nounwind alwaysinline { %rptr = alloca <$1 x $2> %rptr32 = bitcast <$1 x $2> * %rptr to $2 * - per_lane($1, <$1 x i32> %mask, ` + per_lane($1, <$1 x MASK> %mask, ` %cmp_LANE_ID = extractelement <$1 x $2> %cmp, i32 LANE %val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE ifelse(LLVM_VERSION, `LLVM_2_9',` @@ -835,7 +837,7 @@ ifelse(LLVM_VERSION, `LLVM_2_9',` } define $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp, - $2 %val, <$1 x i32> %mask) nounwind alwaysinline { + $2 %val, <$1 x MASK> %mask) nounwind alwaysinline { ifelse(LLVM_VERSION, `LLVM_2_9',` %r = call $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)', ` %r = cmpxchg $2 * %ptr, $2 %cmp, $2 %val seq_cst') @@ -844,6 +846,85 @@ ifelse(LLVM_VERSION, `LLVM_2_9',` ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; count trailing zeros + +define(`ctlztz', ` +define i32 @__count_trailing_zeros_i32(i32) nounwind readnone alwaysinline { + %c = call i32 @llvm.cttz.i32(i32 %0) + ret i32 %c +} + +define i64 @__count_trailing_zeros_i64(i64) nounwind readnone alwaysinline { + %c = call i64 @llvm.cttz.i64(i64 %0) + ret i64 %c +} + +define i32 @__count_leading_zeros_i32(i32) nounwind readnone alwaysinline { + %c = call i32 @llvm.ctlz.i32(i32 %0) + ret i32 %c +} + +define i64 @__count_leading_zeros_i64(i64) nounwind readnone alwaysinline { + %c = call i64 @llvm.ctlz.i64(i64 %0) + ret i64 %c +} +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefetching + +define(`define_prefetches', ` +ifelse(LLVM_VERSION, 
`LLVM_2_9', +` +declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality) + +define void @__prefetch_read_uniform_1(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 3) + ret void +} + +define void @__prefetch_read_uniform_2(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 2) + ret void +} + +define void @__prefetch_read_uniform_3(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 1) + ret void +} + +define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 0) + ret void +} +', ` +declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality, + i32 %cachetype) ; cachetype == 1 is dcache + +define void @__prefetch_read_uniform_1(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 3, i32 1) + ret void +} + +define void @__prefetch_read_uniform_2(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 2, i32 1) + ret void +} + +define void @__prefetch_read_uniform_3(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 1, i32 1) + ret void +} + +define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 0, i32 1) + ret void +} +') +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + define(`stdlib_core', ` @@ -854,8 +935,8 @@ declare void @ISPCLaunch(i8**, i8*, i8*, i32) nounwind declare void @ISPCSync(i8*) nounwind declare void @ISPCInstrument(i8*, i8*, i32, i32) nounwind -declare i1 @__is_compile_time_constant_mask(<$1 x i32> %mask) -declare i1 @__is_compile_time_constant_varying_int32(<$1 x i32>) +declare i1 @__is_compile_time_constant_mask( %mask) +declare i1 @__is_compile_time_constant_varying_int32() ; This function declares placeholder masked store functions for the ; front-end to use. @@ -869,10 +950,10 @@ declare i1 @__is_compile_time_constant_varying_int32(<$1 x i32>) ; stores (if the mask is all on) by the MaskedStoreOptPass optimization ; pass. -declare void @__pseudo_masked_store_8(<$1 x i8> * nocapture, <$1 x i8>, <$1 x i32>) -declare void @__pseudo_masked_store_16(<$1 x i16> * nocapture, <$1 x i16>, <$1 x i32>) -declare void @__pseudo_masked_store_32(<$1 x i32> * nocapture, <$1 x i32>, <$1 x i32>) -declare void @__pseudo_masked_store_64(<$1 x i64> * nocapture, <$1 x i64>, <$1 x i32>) +declare void @__pseudo_masked_store_8( * nocapture, , ) +declare void @__pseudo_masked_store_16( * nocapture, , ) +declare void @__pseudo_masked_store_32( * nocapture, , ) +declare void @__pseudo_masked_store_64( * nocapture, , ) ; Declare the pseudo-gather functions. When the ispc front-end needs ; to perform a gather, it generates a call to one of these functions, @@ -904,33 +985,33 @@ declare void @__pseudo_masked_store_64(<$1 x i64> * nocapture, <$1 x i64>, <$1 x ; converts them to native gather functions or converts them to vector ; loads, if equivalent. 
-declare <$1 x i8> @__pseudo_gather32_8(<$1 x i32>, <$1 x i32>) nounwind readonly -declare <$1 x i16> @__pseudo_gather32_16(<$1 x i32>, <$1 x i32>) nounwind readonly -declare <$1 x i32> @__pseudo_gather32_32(<$1 x i32>, <$1 x i32>) nounwind readonly -declare <$1 x i64> @__pseudo_gather32_64(<$1 x i32>, <$1 x i32>) nounwind readonly +declare @__pseudo_gather32_8(, ) nounwind readonly +declare @__pseudo_gather32_16(, ) nounwind readonly +declare @__pseudo_gather32_32(, ) nounwind readonly +declare @__pseudo_gather32_64(, ) nounwind readonly -declare <$1 x i8> @__pseudo_gather64_8(<$1 x i64>, <$1 x i32>) nounwind readonly -declare <$1 x i16> @__pseudo_gather64_16(<$1 x i64>, <$1 x i32>) nounwind readonly -declare <$1 x i32> @__pseudo_gather64_32(<$1 x i64>, <$1 x i32>) nounwind readonly -declare <$1 x i64> @__pseudo_gather64_64(<$1 x i64>, <$1 x i32>) nounwind readonly +declare @__pseudo_gather64_8(, ) nounwind readonly +declare @__pseudo_gather64_16(, ) nounwind readonly +declare @__pseudo_gather64_32(, ) nounwind readonly +declare @__pseudo_gather64_64(, ) nounwind readonly -declare <$1 x i8> @__pseudo_gather_base_offsets32_8(i8 *, <$1 x i32>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i16> @__pseudo_gather_base_offsets32_16(i8 *, <$1 x i32>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i32> @__pseudo_gather_base_offsets32_32(i8 *, <$1 x i32>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i64> @__pseudo_gather_base_offsets32_64(i8 *, <$1 x i32>, i32, - <$1 x i32>) nounwind readonly +declare @__pseudo_gather_base_offsets32_8(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets32_16(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets32_32(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets32_64(i8 *, , i32, + ) nounwind readonly -declare <$1 x i8> @__pseudo_gather_base_offsets64_8(i8 *, <$1 x i64>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i16> @__pseudo_gather_base_offsets64_16(i8 *, <$1 x i64>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i32> @__pseudo_gather_base_offsets64_32(i8 *, <$1 x i64>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i64> @__pseudo_gather_base_offsets64_64(i8 *, <$1 x i64>, i32, - <$1 x i32>) nounwind readonly +declare @__pseudo_gather_base_offsets64_8(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets64_16(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets64_32(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets64_64(i8 *, , i32, + ) nounwind readonly ; Similarly to the pseudo-gathers defined above, we also declare undefined ; pseudo-scatter instructions with signatures: @@ -955,94 +1036,94 @@ declare <$1 x i64> @__pseudo_gather_base_offsets64_64(i8 *, <$1 x i64>, i32, ; And the GSImprovementsPass in turn converts these to actual native ; scatters or masked stores. 
-declare void @__pseudo_scatter32_8(<$1 x i32>, <$1 x i8>, <$1 x i32>) nounwind -declare void @__pseudo_scatter32_16(<$1 x i32>, <$1 x i16>, <$1 x i32>) nounwind -declare void @__pseudo_scatter32_32(<$1 x i32>, <$1 x i32>, <$1 x i32>) nounwind -declare void @__pseudo_scatter32_64(<$1 x i32>, <$1 x i64>, <$1 x i32>) nounwind +declare void @__pseudo_scatter32_8(, , ) nounwind +declare void @__pseudo_scatter32_16(, , ) nounwind +declare void @__pseudo_scatter32_32(, , ) nounwind +declare void @__pseudo_scatter32_64(, , ) nounwind -declare void @__pseudo_scatter64_8(<$1 x i64>, <$1 x i8>, <$1 x i32>) nounwind -declare void @__pseudo_scatter64_16(<$1 x i64>, <$1 x i16>, <$1 x i32>) nounwind -declare void @__pseudo_scatter64_32(<$1 x i64>, <$1 x i32>, <$1 x i32>) nounwind -declare void @__pseudo_scatter64_64(<$1 x i64>, <$1 x i64>, <$1 x i32>) nounwind +declare void @__pseudo_scatter64_8(, , ) nounwind +declare void @__pseudo_scatter64_16(, , ) nounwind +declare void @__pseudo_scatter64_32(, , ) nounwind +declare void @__pseudo_scatter64_64(, , ) nounwind -declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <$1 x i32>, i32, - <$1 x i8>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <$1 x i32>, i32, - <$1 x i16>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <$1 x i32>, i32, - <$1 x i32>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <$1 x i32>, i32, - <$1 x i64>, <$1 x i32>) nounwind +declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, , i32, + , ) nounwind -declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <$1 x i64>, i32, - <$1 x i8>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <$1 x i64>, i32, - <$1 x i16>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <$1 x i64>, i32, - <$1 x i32>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <$1 x i64>, i32, - <$1 x i64>, <$1 x i32>) nounwind +declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, , i32, + , ) nounwind ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; vector ops -define i8 @__extract_int8(<$1 x i8>, i32) nounwind readnone alwaysinline { - %extract = extractelement <$1 x i8> %0, i32 %1 +define i8 @__extract_int8(, i32) nounwind readnone alwaysinline { + %extract = extractelement %0, i32 %1 ret i8 %extract } -define <$1 x i8> @__insert_int8(<$1 x i8>, i32, +define @__insert_int8(, i32, i8) nounwind readnone alwaysinline { - %insert = insertelement <$1 x i8> %0, i8 %2, i32 %1 - ret <$1 x i8> %insert + %insert = insertelement %0, i8 %2, i32 %1 + ret %insert } -define i16 @__extract_int16(<$1 x i16>, i32) nounwind readnone alwaysinline { - %extract = extractelement <$1 x i16> %0, i32 %1 +define i16 @__extract_int16(, i32) nounwind readnone alwaysinline { + %extract = extractelement %0, i32 %1 ret i16 %extract 
} -define <$1 x i16> @__insert_int16(<$1 x i16>, i32, +define @__insert_int16(, i32, i16) nounwind readnone alwaysinline { - %insert = insertelement <$1 x i16> %0, i16 %2, i32 %1 - ret <$1 x i16> %insert + %insert = insertelement %0, i16 %2, i32 %1 + ret %insert } -define i32 @__extract_int32(<$1 x i32>, i32) nounwind readnone alwaysinline { - %extract = extractelement <$1 x i32> %0, i32 %1 +define i32 @__extract_int32(, i32) nounwind readnone alwaysinline { + %extract = extractelement %0, i32 %1 ret i32 %extract } -define <$1 x i32> @__insert_int32(<$1 x i32>, i32, +define @__insert_int32(, i32, i32) nounwind readnone alwaysinline { - %insert = insertelement <$1 x i32> %0, i32 %2, i32 %1 - ret <$1 x i32> %insert + %insert = insertelement %0, i32 %2, i32 %1 + ret %insert } -define i64 @__extract_int64(<$1 x i64>, i32) nounwind readnone alwaysinline { - %extract = extractelement <$1 x i64> %0, i32 %1 +define i64 @__extract_int64(, i32) nounwind readnone alwaysinline { + %extract = extractelement %0, i32 %1 ret i64 %extract } -define <$1 x i64> @__insert_int64(<$1 x i64>, i32, +define @__insert_int64(, i32, i64) nounwind readnone alwaysinline { - %insert = insertelement <$1 x i64> %0, i64 %2, i32 %1 - ret <$1 x i64> %insert + %insert = insertelement %0, i64 %2, i32 %1 + ret %insert } -shuffles($1, i8, int8, 1) -shuffles($1, i16, int16, 2) -shuffles($1, float, float, 4) -shuffles($1, i32, int32, 4) -shuffles($1, double, double, 8) -shuffles($1, i64, int64, 8) +shuffles(i8, int8, 1) +shuffles(i16, int16, 2) +shuffles(float, float, 4) +shuffles(i32, int32, 4) +shuffles(double, double, 8) +shuffles(i64, int64, 8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; various bitcasts from one type to another -define <$1 x i32> @__intbits_varying_float(<$1 x float>) nounwind readnone alwaysinline { - %float_to_int_bitcast = bitcast <$1 x float> %0 to <$1 x i32> - ret <$1 x i32> %float_to_int_bitcast +define @__intbits_varying_float() nounwind readnone alwaysinline { + %float_to_int_bitcast = bitcast %0 to + ret %float_to_int_bitcast } define i32 @__intbits_uniform_float(float) nounwind readnone alwaysinline { @@ -1050,9 +1131,9 @@ define i32 @__intbits_uniform_float(float) nounwind readnone alwaysinline { ret i32 %float_to_int_bitcast } -define <$1 x i64> @__intbits_varying_double(<$1 x double>) nounwind readnone alwaysinline { - %double_to_int_bitcast = bitcast <$1 x double> %0 to <$1 x i64> - ret <$1 x i64> %double_to_int_bitcast +define @__intbits_varying_double() nounwind readnone alwaysinline { + %double_to_int_bitcast = bitcast %0 to + ret %double_to_int_bitcast } define i64 @__intbits_uniform_double(double) nounwind readnone alwaysinline { @@ -1060,9 +1141,9 @@ define i64 @__intbits_uniform_double(double) nounwind readnone alwaysinline { ret i64 %double_to_int_bitcast } -define <$1 x float> @__floatbits_varying_int32(<$1 x i32>) nounwind readnone alwaysinline { - %int_to_float_bitcast = bitcast <$1 x i32> %0 to <$1 x float> - ret <$1 x float> %int_to_float_bitcast +define @__floatbits_varying_int32() nounwind readnone alwaysinline { + %int_to_float_bitcast = bitcast %0 to + ret %int_to_float_bitcast } define float @__floatbits_uniform_int32(i32) nounwind readnone alwaysinline { @@ -1070,9 +1151,9 @@ define float @__floatbits_uniform_int32(i32) nounwind readnone alwaysinline { ret float %int_to_float_bitcast } -define <$1 x double> @__doublebits_varying_int64(<$1 x i64>) nounwind readnone alwaysinline { - %int_to_double_bitcast = bitcast <$1 x i64> %0 to <$1 x 
double> - ret <$1 x double> %int_to_double_bitcast +define @__doublebits_varying_int64() nounwind readnone alwaysinline { + %int_to_double_bitcast = bitcast %0 to + ret %int_to_double_bitcast } define double @__doublebits_uniform_int64(i64) nounwind readnone alwaysinline { @@ -1080,8 +1161,8 @@ define double @__doublebits_uniform_int64(i64) nounwind readnone alwaysinline { ret double %int_to_double_bitcast } -define <$1 x float> @__undef_varying() nounwind readnone alwaysinline { - ret <$1 x float> undef +define @__undef_varying() nounwind readnone alwaysinline { + ret undef } define float @__undef_uniform() nounwind readnone alwaysinline { @@ -1096,31 +1177,12 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline { ret i32 %r } -define <$1 x i32> @__sext_varying_bool(<$1 x i32>) nounwind readnone alwaysinline { - ret <$1 x i32> %0 -} - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; count trailing zeros - -define i32 @__count_trailing_zeros_i32(i32) nounwind readnone alwaysinline { - %c = call i32 @llvm.cttz.i32(i32 %0) - ret i32 %c -} - -define i64 @__count_trailing_zeros_i64(i64) nounwind readnone alwaysinline { - %c = call i64 @llvm.cttz.i64(i64 %0) - ret i64 %c -} - -define i32 @__count_leading_zeros_i32(i32) nounwind readnone alwaysinline { - %c = call i32 @llvm.ctlz.i32(i32 %0) - ret i32 %c -} - -define i64 @__count_leading_zeros_i64(i64) nounwind readnone alwaysinline { - %c = call i64 @llvm.ctlz.i64(i64 %0) - ret i64 %c +define @__sext_varying_bool() nounwind readnone alwaysinline { + ifelse(MASK,i1, ` + %se = sext %0 to + ret %se + ', ` + ret %0') } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1670,184 +1732,133 @@ define void define void @__aos_to_soa4_float(float * noalias %p, - <$1 x float> * noalias %out0, <$1 x float> * noalias %out1, - <$1 x float> * noalias %out2, <$1 x float> * noalias %out3) + * noalias %out0, * noalias %out1, + * noalias %out2, * noalias %out3) nounwind alwaysinline { - %p0 = bitcast float * %p to <$1 x float> * - %v0 = load <$1 x float> * %p0, align 4 - %p1 = getelementptr <$1 x float> * %p0, i32 1 - %v1 = load <$1 x float> * %p1, align 4 - %p2 = getelementptr <$1 x float> * %p0, i32 2 - %v2 = load <$1 x float> * %p2, align 4 - %p3 = getelementptr <$1 x float> * %p0, i32 3 - %v3 = load <$1 x float> * %p3, align 4 - call void @__aos_to_soa4_float$1(<$1 x float> %v0, <$1 x float> %v1, - <$1 x float> %v2, <$1 x float> %v3, <$1 x float> * %out0, - <$1 x float> * %out1, <$1 x float> * %out2, <$1 x float> * %out3) + %p0 = bitcast float * %p to * + %v0 = load * %p0, align 4 + %p1 = getelementptr * %p0, i32 1 + %v1 = load * %p1, align 4 + %p2 = getelementptr * %p0, i32 2 + %v2 = load * %p2, align 4 + %p3 = getelementptr * %p0, i32 3 + %v3 = load * %p3, align 4 + call void @__aos_to_soa4_float`'WIDTH ( %v0, %v1, + %v2, %v3, * %out0, + * %out1, * %out2, * %out3) ret void } define void @__aos_to_soa4_int32(i32 * noalias %ptr, - <$1 x i32> * noalias %out0, <$1 x i32> * noalias %out1, - <$1 x i32> * noalias %out2, <$1 x i32> * noalias %out3) + * noalias %out0, * noalias %out1, + * noalias %out2, * noalias %out3) nounwind alwaysinline { %fptr = bitcast i32 * %ptr to float * - %fout0 = bitcast <$1 x i32> * %out0 to <$1 x float> * - %fout1 = bitcast <$1 x i32> * %out1 to <$1 x float> * - %fout2 = bitcast <$1 x i32> * %out2 to <$1 x float> * - %fout3 = bitcast <$1 x i32> * %out3 to <$1 x float> * + %fout0 = bitcast * %out0 to * + %fout1 = bitcast * %out1 to * + %fout2 = bitcast * %out2 
to * + %fout3 = bitcast * %out3 to * call void @__aos_to_soa4_float(float * %fptr, - <$1 x float> * %fout0, <$1 x float> * %fout1, <$1 x float> * %fout2, - <$1 x float> * %fout3) + * %fout0, * %fout1, * %fout2, + * %fout3) ret void } define void -@__soa_to_aos4_float(<$1 x float> %v0, <$1 x float> %v1, <$1 x float> %v2, - <$1 x float> %v3, float * noalias %p) nounwind alwaysinline { - %out0 = bitcast float * %p to <$1 x float> * - %out1 = getelementptr <$1 x float> * %out0, i32 1 - %out2 = getelementptr <$1 x float> * %out0, i32 2 - %out3 = getelementptr <$1 x float> * %out0, i32 3 - call void @__soa_to_aos4_float$1(<$1 x float> %v0, <$1 x float> %v1, - <$1 x float> %v2, <$1 x float> %v3, <$1 x float> * %out0, - <$1 x float> * %out1, <$1 x float> * %out2, <$1 x float> * %out3) +@__soa_to_aos4_float( %v0, %v1, %v2, + %v3, float * noalias %p) nounwind alwaysinline { + %out0 = bitcast float * %p to * + %out1 = getelementptr * %out0, i32 1 + %out2 = getelementptr * %out0, i32 2 + %out3 = getelementptr * %out0, i32 3 + call void @__soa_to_aos4_float`'WIDTH ( %v0, %v1, + %v2, %v3, * %out0, + * %out1, * %out2, * %out3) ret void } define void -@__soa_to_aos4_int32(<$1 x i32> %v0, <$1 x i32> %v1, <$1 x i32> %v2, - <$1 x i32> %v3, i32 * noalias %base) nounwind alwaysinline { - %fv0 = bitcast <$1 x i32> %v0 to <$1 x float> - %fv1 = bitcast <$1 x i32> %v1 to <$1 x float> - %fv2 = bitcast <$1 x i32> %v2 to <$1 x float> - %fv3 = bitcast <$1 x i32> %v3 to <$1 x float> +@__soa_to_aos4_int32( %v0, %v1, %v2, + %v3, i32 * noalias %base) nounwind alwaysinline { + %fv0 = bitcast %v0 to + %fv1 = bitcast %v1 to + %fv2 = bitcast %v2 to + %fv3 = bitcast %v3 to %fbase = bitcast i32 * %base to float * - call void @__soa_to_aos4_float(<$1 x float> %fv0, <$1 x float> %fv1, - <$1 x float> %fv2, <$1 x float> %fv3, float * %fbase) + call void @__soa_to_aos4_float( %fv0, %fv1, + %fv2, %fv3, float * %fbase) ret void } define void @__aos_to_soa3_float(float * noalias %p, - <$1 x float> * %out0, <$1 x float> * %out1, - <$1 x float> * %out2) nounwind alwaysinline { - %p0 = bitcast float * %p to <$1 x float> * - %v0 = load <$1 x float> * %p0, align 4 - %p1 = getelementptr <$1 x float> * %p0, i32 1 - %v1 = load <$1 x float> * %p1, align 4 - %p2 = getelementptr <$1 x float> * %p0, i32 2 - %v2 = load <$1 x float> * %p2, align 4 - call void @__aos_to_soa3_float$1(<$1 x float> %v0, <$1 x float> %v1, - <$1 x float> %v2, <$1 x float> * %out0, <$1 x float> * %out1, - <$1 x float> * %out2) + * %out0, * %out1, + * %out2) nounwind alwaysinline { + %p0 = bitcast float * %p to * + %v0 = load * %p0, align 4 + %p1 = getelementptr * %p0, i32 1 + %v1 = load * %p1, align 4 + %p2 = getelementptr * %p0, i32 2 + %v2 = load * %p2, align 4 + call void @__aos_to_soa3_float`'WIDTH ( %v0, %v1, + %v2, * %out0, * %out1, + * %out2) ret void } define void @__aos_to_soa3_int32(i32 * noalias %base, - <$1 x i32> * noalias %out0, <$1 x i32> * noalias %out1, - <$1 x i32> * noalias %out2) nounwind alwaysinline { + * noalias %out0, * noalias %out1, + * noalias %out2) nounwind alwaysinline { %fbase = bitcast i32 * %base to float * - %fout0 = bitcast <$1 x i32> * %out0 to <$1 x float> * - %fout1 = bitcast <$1 x i32> * %out1 to <$1 x float> * - %fout2 = bitcast <$1 x i32> * %out2 to <$1 x float> * + %fout0 = bitcast * %out0 to * + %fout1 = bitcast * %out1 to * + %fout2 = bitcast * %out2 to * call void @__aos_to_soa3_float(float * %fbase, - <$1 x float> * %fout0, <$1 x float> * %fout1, <$1 x float> * %fout2) + * %fout0, * %fout1, * %fout2) ret void } define void 
-@__soa_to_aos3_float(<$1 x float> %v0, <$1 x float> %v1, <$1 x float> %v2, +@__soa_to_aos3_float( %v0, %v1, %v2, float * noalias %p) nounwind alwaysinline { - %out0 = bitcast float * %p to <$1 x float> * - %out1 = getelementptr <$1 x float> * %out0, i32 1 - %out2 = getelementptr <$1 x float> * %out0, i32 2 - call void @__soa_to_aos3_float$1(<$1 x float> %v0, <$1 x float> %v1, - <$1 x float> %v2, <$1 x float> * %out0, <$1 x float> * %out1, - <$1 x float> * %out2) + %out0 = bitcast float * %p to * + %out1 = getelementptr * %out0, i32 1 + %out2 = getelementptr * %out0, i32 2 + call void @__soa_to_aos3_float`'WIDTH ( %v0, %v1, + %v2, * %out0, * %out1, + * %out2) ret void } define void -@__soa_to_aos3_int32(<$1 x i32> %v0, <$1 x i32> %v1, <$1 x i32> %v2, +@__soa_to_aos3_int32( %v0, %v1, %v2, i32 * noalias %base) nounwind alwaysinline { - %fv0 = bitcast <$1 x i32> %v0 to <$1 x float> - %fv1 = bitcast <$1 x i32> %v1 to <$1 x float> - %fv2 = bitcast <$1 x i32> %v2 to <$1 x float> + %fv0 = bitcast %v0 to + %fv1 = bitcast %v1 to + %fv2 = bitcast %v2 to %fbase = bitcast i32 * %base to float * - call void @__soa_to_aos3_float(<$1 x float> %fv0, <$1 x float> %fv1, - <$1 x float> %fv2, float * %fbase) + call void @__soa_to_aos3_float( %fv0, %fv1, + %fv2, float * %fbase) ret void } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; prefetching - -ifelse(LLVM_VERSION, `LLVM_2_9', -` -declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality) - -define void @__prefetch_read_uniform_1(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 3) - ret void -} - -define void @__prefetch_read_uniform_2(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 2) - ret void -} - -define void @__prefetch_read_uniform_3(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 1) - ret void -} - -define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 0) - ret void -} -', ` -declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality, - i32 %cachetype) ; cachetype == 1 is dcache - -define void @__prefetch_read_uniform_1(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 3, i32 1) - ret void -} - -define void @__prefetch_read_uniform_2(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 2, i32 1) - ret void -} - -define void @__prefetch_read_uniform_3(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 1, i32 1) - ret void -} - -define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 0, i32 1) - ret void -} -') - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; assert declare i32 @printf(i8*, ...) 
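For reference, the __aos_to_soa3_float builtin above deinterleaves WIDTH packed (x, y, z) triples into three WIDTH-wide outputs, and __soa_to_aos3_float is its inverse. A scalar C++ sketch of the same data movement, assuming a 4-wide target; the function name and the fixed width are illustrative only, not part of ispc:

// Reference-only model of the shuffle __aos_to_soa3_float performs when WIDTH == 4.
// p holds 4 packed triples: x0 y0 z0 x1 y1 z1 x2 y2 z2 x3 y3 z3.
static void aos_to_soa3_float_ref(const float *p,
                                  float out0[4], float out1[4], float out2[4]) {
    for (int i = 0; i < 4; ++i) {
        out0[i] = p[3 * i + 0];   // all x components
        out1[i] = p[3 * i + 1];   // all y components
        out2[i] = p[3 * i + 2];   // all z components
    }
}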
declare void @abort() noreturn -define void @__do_assert_uniform(i8 *%str, i1 %test, <$1 x i32> %mask) { +define void @__do_assert_uniform(i8 *%str, i1 %test, %mask) { br i1 %test, label %ok, label %fail fail: @@ -1860,12 +1871,12 @@ ok: } -define void @__do_assert_varying(i8 *%str, <$1 x i32> %test, - <$1 x i32> %mask) { - %nottest = xor <$1 x i32> %test, - < forloop(i, 1, eval($1-1), `i32 -1, ') i32 -1 > - %nottest_and_mask = and <$1 x i32> %nottest, %mask - %mm = call i32 @__movmsk(<$1 x i32> %nottest_and_mask) +define void @__do_assert_varying(i8 *%str, %test, + %mask) { + %nottest = xor %test, + < forloop(i, 1, eval(WIDTH-1), `MASK -1, ') MASK -1 > + %nottest_and_mask = and %nottest, %mask + %mm = call i32 @__movmsk( %nottest_and_mask) %all_ok = icmp eq i32 %mm, 0 br i1 %all_ok, label %ok, label %fail @@ -2010,118 +2021,118 @@ define void @__memory_barrier() nounwind readnone alwaysinline { ret void } -global_atomic_associative($1, add, i32, int32, 0) -global_atomic_associative($1, sub, i32, int32, 0) -global_atomic_associative($1, and, i32, int32, -1) -global_atomic_associative($1, or, i32, int32, 0) -global_atomic_associative($1, xor, i32, int32, 0) -global_atomic_uniform($1, add, i32, int32) -global_atomic_uniform($1, sub, i32, int32) -global_atomic_uniform($1, and, i32, int32) -global_atomic_uniform($1, or, i32, int32) -global_atomic_uniform($1, xor, i32, int32) -global_atomic_uniform($1, min, i32, int32) -global_atomic_uniform($1, max, i32, int32) -global_atomic_uniform($1, umin, i32, uint32) -global_atomic_uniform($1, umax, i32, uint32) +global_atomic_associative(WIDTH, add, i32, int32, 0) +global_atomic_associative(WIDTH, sub, i32, int32, 0) +global_atomic_associative(WIDTH, and, i32, int32, -1) +global_atomic_associative(WIDTH, or, i32, int32, 0) +global_atomic_associative(WIDTH, xor, i32, int32, 0) +global_atomic_uniform(WIDTH, add, i32, int32) +global_atomic_uniform(WIDTH, sub, i32, int32) +global_atomic_uniform(WIDTH, and, i32, int32) +global_atomic_uniform(WIDTH, or, i32, int32) +global_atomic_uniform(WIDTH, xor, i32, int32) +global_atomic_uniform(WIDTH, min, i32, int32) +global_atomic_uniform(WIDTH, max, i32, int32) +global_atomic_uniform(WIDTH, umin, i32, uint32) +global_atomic_uniform(WIDTH, umax, i32, uint32) -global_atomic_associative($1, add, i64, int64, 0) -global_atomic_associative($1, sub, i64, int64, 0) -global_atomic_associative($1, and, i64, int64, -1) -global_atomic_associative($1, or, i64, int64, 0) -global_atomic_associative($1, xor, i64, int64, 0) -global_atomic_uniform($1, add, i64, int64) -global_atomic_uniform($1, sub, i64, int64) -global_atomic_uniform($1, and, i64, int64) -global_atomic_uniform($1, or, i64, int64) -global_atomic_uniform($1, xor, i64, int64) -global_atomic_uniform($1, min, i64, int64) -global_atomic_uniform($1, max, i64, int64) -global_atomic_uniform($1, umin, i64, uint64) -global_atomic_uniform($1, umax, i64, uint64) +global_atomic_associative(WIDTH, add, i64, int64, 0) +global_atomic_associative(WIDTH, sub, i64, int64, 0) +global_atomic_associative(WIDTH, and, i64, int64, -1) +global_atomic_associative(WIDTH, or, i64, int64, 0) +global_atomic_associative(WIDTH, xor, i64, int64, 0) +global_atomic_uniform(WIDTH, add, i64, int64) +global_atomic_uniform(WIDTH, sub, i64, int64) +global_atomic_uniform(WIDTH, and, i64, int64) +global_atomic_uniform(WIDTH, or, i64, int64) +global_atomic_uniform(WIDTH, xor, i64, int64) +global_atomic_uniform(WIDTH, min, i64, int64) +global_atomic_uniform(WIDTH, max, i64, int64) +global_atomic_uniform(WIDTH, 
umin, i64, uint64) +global_atomic_uniform(WIDTH, umax, i64, uint64) -global_swap($1, i32, int32) -global_swap($1, i64, int64) +global_swap(WIDTH, i32, int32) +global_swap(WIDTH, i64, int64) -define <$1 x float> @__atomic_swap_float_global(float * %ptr, <$1 x float> %val, - <$1 x i32> %mask) nounwind alwaysinline { +define @__atomic_swap_float_global(float * %ptr, %val, + %mask) nounwind alwaysinline { %iptr = bitcast float * %ptr to i32 * - %ival = bitcast <$1 x float> %val to <$1 x i32> - %iret = call <$1 x i32> @__atomic_swap_int32_global(i32 * %iptr, <$1 x i32> %ival, <$1 x i32> %mask) - %ret = bitcast <$1 x i32> %iret to <$1 x float> - ret <$1 x float> %ret + %ival = bitcast %val to + %iret = call @__atomic_swap_int32_global(i32 * %iptr, %ival, %mask) + %ret = bitcast %iret to + ret %ret } -define <$1 x double> @__atomic_swap_double_global(double * %ptr, <$1 x double> %val, - <$1 x i32> %mask) nounwind alwaysinline { +define @__atomic_swap_double_global(double * %ptr, %val, + %mask) nounwind alwaysinline { %iptr = bitcast double * %ptr to i64 * - %ival = bitcast <$1 x double> %val to <$1 x i64> - %iret = call <$1 x i64> @__atomic_swap_int64_global(i64 * %iptr, <$1 x i64> %ival, <$1 x i32> %mask) - %ret = bitcast <$1 x i64> %iret to <$1 x double> - ret <$1 x double> %ret + %ival = bitcast %val to + %iret = call @__atomic_swap_int64_global(i64 * %iptr, %ival, %mask) + %ret = bitcast %iret to + ret %ret } define float @__atomic_swap_uniform_float_global(float * %ptr, float %val, - <$1 x i32> %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { %iptr = bitcast float * %ptr to i32 * %ival = bitcast float %val to i32 - %iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival, <$1 x i32> %mask) + %iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival, %mask) %ret = bitcast i32 %iret to float ret float %ret } define double @__atomic_swap_uniform_double_global(double * %ptr, double %val, - <$1 x i32> %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { %iptr = bitcast double * %ptr to i64 * %ival = bitcast double %val to i64 - %iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival, <$1 x i32> %mask) + %iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival, %mask) %ret = bitcast i64 %iret to double ret double %ret } -global_atomic_exchange($1, i32, int32) -global_atomic_exchange($1, i64, int64) +global_atomic_exchange(WIDTH, i32, int32) +global_atomic_exchange(WIDTH, i64, int64) -define <$1 x float> @__atomic_compare_exchange_float_global(float * %ptr, - <$1 x float> %cmp, <$1 x float> %val, <$1 x i32> %mask) nounwind alwaysinline { +define @__atomic_compare_exchange_float_global(float * %ptr, + %cmp, %val, %mask) nounwind alwaysinline { %iptr = bitcast float * %ptr to i32 * - %icmp = bitcast <$1 x float> %cmp to <$1 x i32> - %ival = bitcast <$1 x float> %val to <$1 x i32> - %iret = call <$1 x i32> @__atomic_compare_exchange_int32_global(i32 * %iptr, <$1 x i32> %icmp, - <$1 x i32> %ival, <$1 x i32> %mask) - %ret = bitcast <$1 x i32> %iret to <$1 x float> - ret <$1 x float> %ret + %icmp = bitcast %cmp to + %ival = bitcast %val to + %iret = call @__atomic_compare_exchange_int32_global(i32 * %iptr, %icmp, + %ival, %mask) + %ret = bitcast %iret to + ret %ret } -define <$1 x double> @__atomic_compare_exchange_double_global(double * %ptr, - <$1 x double> %cmp, <$1 x double> %val, <$1 x i32> %mask) nounwind alwaysinline { +define @__atomic_compare_exchange_double_global(double * %ptr, + %cmp, %val, 
%mask) nounwind alwaysinline { %iptr = bitcast double * %ptr to i64 * - %icmp = bitcast <$1 x double> %cmp to <$1 x i64> - %ival = bitcast <$1 x double> %val to <$1 x i64> - %iret = call <$1 x i64> @__atomic_compare_exchange_int64_global(i64 * %iptr, <$1 x i64> %icmp, - <$1 x i64> %ival, <$1 x i32> %mask) - %ret = bitcast <$1 x i64> %iret to <$1 x double> - ret <$1 x double> %ret + %icmp = bitcast %cmp to + %ival = bitcast %val to + %iret = call @__atomic_compare_exchange_int64_global(i64 * %iptr, %icmp, + %ival, %mask) + %ret = bitcast %iret to + ret %ret } define float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp, float %val, - <$1 x i32> %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { %iptr = bitcast float * %ptr to i32 * %icmp = bitcast float %cmp to i32 %ival = bitcast float %val to i32 %iret = call i32 @__atomic_compare_exchange_uniform_int32_global(i32 * %iptr, i32 %icmp, - i32 %ival, <$1 x i32> %mask) + i32 %ival, %mask) %ret = bitcast i32 %iret to float ret float %ret } define double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp, - double %val, <$1 x i32> %mask) nounwind alwaysinline { + double %val, %mask) nounwind alwaysinline { %iptr = bitcast double * %ptr to i64 * %icmp = bitcast double %cmp to i64 %ival = bitcast double %val to i64 %iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp, - i64 %ival, <$1 x i32> %mask) + i64 %ival, %mask) %ret = bitcast i64 %iret to double ret double %ret } @@ -2168,10 +2179,10 @@ define <$1 x i64> @__$2_varying_$3(<$1 x i64>, <$1 x i64>) nounwind alwaysinline ;; vector width as a parameter define(`int64minmax', ` -i64minmax($1,min,int64,slt) -i64minmax($1,max,int64,sgt) -i64minmax($1,min,uint64,ult) -i64minmax($1,max,uint64,ugt) +i64minmax(WIDTH,min,int64,slt) +i64minmax(WIDTH,max,int64,sgt) +i64minmax(WIDTH,min,uint64,ult) +i64minmax(WIDTH,max,uint64,ugt) ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2410,24 +2421,24 @@ define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>, define(`packed_load_and_store', ` -define i32 @__packed_load_active(i32 * %startptr, <$1 x i32> * %val_ptr, - <$1 x i32> %full_mask) nounwind alwaysinline { +define i32 @__packed_load_active(i32 * %startptr, * %val_ptr, + %full_mask) nounwind alwaysinline { entry: - %mask = call i32 @__movmsk(<$1 x i32> %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask) + %mask = call i32 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: - %allon = icmp eq i32 %mask, eval((1 << $1) -1) + %allon = icmp eq i32 %mask, eval((1 << WIDTH) -1) br i1 %allon, label %all_on, label %unknown_mask all_on: ;; everyone wants to load, so just load an entire vector width in a single ;; vector load - %vecptr = bitcast i32 *%startptr to <$1 x i32> * - %vec_load = load <$1 x i32> *%vecptr, align 4 - store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr, align 4 - ret i32 $1 + %vecptr = bitcast i32 *%startptr to * + %vec_load = load *%vecptr, align 4 + store %vec_load, * %val_ptr, align 4 + ret i32 WIDTH unknown_mask: br label %loop @@ -2445,7 +2456,7 @@ loop: load: %loadptr = getelementptr i32 *%startptr, i32 %offset %loadval = load i32 *%loadptr - %val_ptr_i32 = bitcast <$1 x i32> * %val_ptr to i32 * + %val_ptr_i32 = bitcast * %val_ptr to i32 * %storeptr = getelementptr i32 *%val_ptr_i32, i32 %lane 
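The __packed_load_active builtin in this hunk reads consecutive values from %startptr into the lanes whose mask bit is set and returns how many values it consumed; when the mask is provably all-on it degenerates to a single full-width vector load that returns WIDTH. A scalar C++ sketch of that contract, assuming a 4-wide target; the name and width are illustrative only:

#include <cstdint>

// Reference-only model of __packed_load_active for WIDTH == 4.
static int packed_load_active_ref(const int32_t *startptr, int32_t vals[4],
                                  const bool mask[4]) {
    int offset = 0;
    for (int lane = 0; lane < 4; ++lane)
        if (mask[lane])
            vals[lane] = startptr[offset++];  // next unread element goes to this active lane
    return offset;                            // number of values consumed
}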
store i32 %loadval, i32 *%storeptr %offset1 = add i32 %offset, 1 @@ -2457,28 +2468,28 @@ loopend: %nextlanemask = mul i32 %lanemask, 2 ; are we done yet? - %test = icmp ne i32 %nextlane, $1 + %test = icmp ne i32 %nextlane, WIDTH br i1 %test, label %loop, label %done done: ret i32 %nextoffset } -define i32 @__packed_store_active(i32 * %startptr, <$1 x i32> %vals, - <$1 x i32> %full_mask) nounwind alwaysinline { +define i32 @__packed_store_active(i32 * %startptr, %vals, + %full_mask) nounwind alwaysinline { entry: - %mask = call i32 @__movmsk(<$1 x i32> %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask) + %mask = call i32 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: - %allon = icmp eq i32 %mask, eval((1 << $1) -1) + %allon = icmp eq i32 %mask, eval((1 << WIDTH) -1) br i1 %allon, label %all_on, label %unknown_mask all_on: - %vecptr = bitcast i32 *%startptr to <$1 x i32> * - store <$1 x i32> %vals, <$1 x i32> * %vecptr, align 4 - ret i32 $1 + %vecptr = bitcast i32 *%startptr to * + store %vals, * %vecptr, align 4 + ret i32 WIDTH unknown_mask: br label %loop @@ -2494,7 +2505,7 @@ loop: br i1 %do_store, label %store, label %loopend store: - %storeval = extractelement <$1 x i32> %vals, i32 %lane + %storeval = extractelement %vals, i32 %lane %storeptr = getelementptr i32 *%startptr, i32 %offset store i32 %storeval, i32 *%storeptr %offset1 = add i32 %offset, 1 @@ -2506,7 +2517,7 @@ loopend: %nextlanemask = mul i32 %lanemask, 2 ; are we done yet? - %test = icmp ne i32 %nextlane, $1 + %test = icmp ne i32 %nextlane, WIDTH br i1 %test, label %loop, label %done done: @@ -2613,7 +2624,7 @@ reduce_equal_aux($1, double, double, i64, fcmp, 64) define(`exclusive_scan', ` define <$1 x $2> @__exclusive_scan_$6(<$1 x $2> %v, - <$1 x i32> %mask) nounwind alwaysinline { + <$1 x MASK> %mask) nounwind alwaysinline { ; first, set the value of any off lanes to the identity value %ptr = alloca <$1 x $2> %idvec1 = bitcast $2 $5 to <1 x $2> @@ -2623,7 +2634,7 @@ define <$1 x $2> @__exclusive_scan_$6(<$1 x $2> %v, %ptr`'$3 = bitcast <$1 x $2> * %ptr to <$1 x i`'$3> * %vi = bitcast <$1 x $2> %v to <$1 x i`'$3> call void @__masked_store_blend_$3(<$1 x i`'$3> * %ptr`'$3, <$1 x i`'$3> %vi, - <$1 x i32> %mask) + <$1 x MASK> %mask) %v_id = load <$1 x $2> * %ptr ; extract elements of the vector to use in computing the scan @@ -2649,16 +2660,16 @@ define <$1 x $2> @__exclusive_scan_$6(<$1 x $2> %v, ') define(`scans', ` -exclusive_scan($1, i32, 32, add, 0, add_i32) -exclusive_scan($1, float, 32, fadd, zeroinitializer, add_float) -exclusive_scan($1, i64, 64, add, 0, add_i64) -exclusive_scan($1, double, 64, fadd, zeroinitializer, add_double) +exclusive_scan(WIDTH, i32, 32, add, 0, add_i32) +exclusive_scan(WIDTH, float, 32, fadd, zeroinitializer, add_float) +exclusive_scan(WIDTH, i64, 64, add, 0, add_i64) +exclusive_scan(WIDTH, double, 64, fadd, zeroinitializer, add_double) -exclusive_scan($1, i32, 32, and, -1, and_i32) -exclusive_scan($1, i64, 64, and, -1, and_i64) +exclusive_scan(WIDTH, i32, 32, and, -1, and_i32) +exclusive_scan(WIDTH, i64, 64, and, -1, and_i64) -exclusive_scan($1, i32, 32, or, 0, or_i32) -exclusive_scan($1, i64, 64, or, 0, or_i64) +exclusive_scan(WIDTH, i32, 32, or, 0, or_i32) +exclusive_scan(WIDTH, i64, 64, or, 0, or_i64) ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/ctx.cpp b/ctx.cpp index 043f7acc..694a3b1d 
100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -875,8 +875,11 @@ FunctionEmitContext::LaneMask(llvm::Value *v) { // into an i32 value std::vector mm; m->symbolTable->LookupFunction("__movmsk", &mm); - // There should be one with signed int signature, one unsigned int. - Assert(mm.size() == 2); + if (g->target.maskBitCount == 1) + Assert(mm.size() == 1); + else + // There should be one with signed int signature, one unsigned int. + Assert(mm.size() == 2); // We can actually call either one, since both are i32s as far as // LLVM's type system is concerned... llvm::Function *fmm = mm[0]->function; @@ -929,6 +932,9 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) { return NULL; } + if (g->target.maskBitCount == 1) + return b; + LLVM_TYPE_CONST llvm::ArrayType *at = llvm::dyn_cast(b->getType()); if (at) { diff --git a/func.cpp b/func.cpp index 61dfb784..4c8d2222 100644 --- a/func.cpp +++ b/func.cpp @@ -288,7 +288,10 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, bool checkMask = (type->isTask == true) || ((function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) && costEstimate > CHECK_MASK_AT_FUNCTION_START_COST); - if (checkMask && g->opt.disableCoherentControlFlow == false) { + checkMask &= (g->target.maskingIsFree == false); + checkMask &= (g->opt.disableCoherentControlFlow == false); + + if (checkMask) { llvm::Value *mask = ctx->GetFunctionMask(); llvm::Value *allOn = ctx->All(mask); llvm::BasicBlock *bbAllOn = ctx->CreateBasicBlock("all_on"); diff --git a/ispc.cpp b/ispc.cpp index 8bfc9a9d..8cc618c3 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -129,24 +129,60 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->nativeVectorWidth = 4; t->vectorWidth = 4; t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } else if (!strcasecmp(isa, "sse2-x2")) { t->isa = Target::SSE2; t->nativeVectorWidth = 4; t->vectorWidth = 8; t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } else if (!strcasecmp(isa, "sse4")) { t->isa = Target::SSE4; t->nativeVectorWidth = 4; t->vectorWidth = 4; t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } else if (!strcasecmp(isa, "sse4x2") || !strcasecmp(isa, "sse4-x2")) { t->isa = Target::SSE4; t->nativeVectorWidth = 4; t->vectorWidth = 8; t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; + } + else if (!strcasecmp(isa, "generic-4")) { + t->isa = Target::GENERIC; + t->nativeVectorWidth = 4; + t->vectorWidth = 4; + t->maskingIsFree = true; + t->allOffMaskIsSafe = true; + t->maskBitCount = 1; + } + else if (!strcasecmp(isa, "generic-8")) { + t->isa = Target::GENERIC; + t->nativeVectorWidth = 8; + t->vectorWidth = 8; + t->maskingIsFree = true; + t->allOffMaskIsSafe = true; + t->maskBitCount = 1; + } + else if (!strcasecmp(isa, "generic-16")) { + t->isa = Target::GENERIC; + t->nativeVectorWidth = 16; + t->vectorWidth = 16; + t->maskingIsFree = true; + t->allOffMaskIsSafe = true; + t->maskBitCount = 1; } #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn) else if (!strcasecmp(isa, "avx")) { @@ -154,12 +190,18 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, 
t->nativeVectorWidth = 8; t->vectorWidth = 8; t->attributes = "+avx,+popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } else if (!strcasecmp(isa, "avx-x2")) { t->isa = Target::AVX; t->nativeVectorWidth = 8; t->vectorWidth = 16; t->attributes = "+avx,+popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } #endif // LLVM 3.0+ #if defined(LLVM_3_1svn) @@ -168,12 +210,18 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->nativeVectorWidth = 8; t->vectorWidth = 8; t->attributes = "+avx2,+popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } else if (!strcasecmp(isa, "avx2-x2")) { t->isa = Target::AVX2; t->nativeVectorWidth = 16; t->vectorWidth = 16; t->attributes = "+avx2,+popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } #endif // LLVM 3.1 else { @@ -221,7 +269,7 @@ Target::SupportedTargetISAs() { #ifdef LLVM_3_1svn ", avx2, avx2-x2" #endif // LLVM_3_1svn - ; + ", generic-4, generic-8, generic-16"; } @@ -300,6 +348,8 @@ Target::GetISAString() const { return "avx"; case Target::AVX2: return "avx2"; + case Target::GENERIC: + return "generic"; default: FATAL("Unhandled target in GetISAString()"); } diff --git a/ispc.h b/ispc.h index 6eb2cdd9..254c8311 100644 --- a/ispc.h +++ b/ispc.h @@ -193,7 +193,7 @@ struct Target { flexible/performant of them will apear last in the enumerant. Note also that __best_available_isa() needs to be updated if ISAs are added or the enumerant values are reordered. */ - enum ISA { SSE2, SSE4, AVX, AVX2, NUM_ISAS }; + enum ISA { SSE2, SSE4, AVX, AVX2, GENERIC, NUM_ISAS }; /** Instruction set being compiled to. */ ISA isa; @@ -222,6 +222,23 @@ struct Target { /** Indicates whether position independent code should be generated. */ bool generatePIC; + + /** Is there overhead associated with masking on the target + architecture; e.g. there is on SSE, due to extra blends and the + like, but there isn't with an ISA that supports masking + natively. */ + bool maskingIsFree; + + /** Is it safe to run code with the mask all if: e.g. on SSE, the fast + gather trick assumes that at least one program instance is running + (so that it can safely assume that the array base pointer is + valid). */ + bool allOffMaskIsSafe; + + /** How many bits are used to store each element of the mask: e.g. this + is 32 on SSE/AVX, since that matches the HW better, but it's 1 for + the generic target. 
*/ + int maskBitCount; }; diff --git a/ispc.vcxproj b/ispc.vcxproj index fb56b96c..96a6855d 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -22,11 +22,15 @@ + + + - + + 4146;4800;4996;4355;4624;4005;4003;4018 @@ -40,15 +44,15 @@ 4146;4800;4996;4355;4624;4005;4065 4146;4800;4996;4355;4624;4005;4065 - - %LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c > gen-bitcode-c-32.cpp; -%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c > gen-bitcode-c-64.cpp - clang builtins-c.c - %LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c > gen-bitcode-c-32.cpp; -%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c > gen-bitcode-c-64.cpp - clang builtins-c.c - gen-bitcode-c-32.cpp;gen-bitcore-c-64.cpp - gen-bitcode-c-32.cpp;gen-bitcore-c-64.cpp + + %LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-32 > gen-bitcode-c-32.cpp; +%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-64 > gen-bitcode-c-64.cpp + Building builtins.c + %LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-32 > gen-bitcode-c-32.cpp; +%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-64 > gen-bitcode-c-64.cpp + Building builtins.c + gen-bitcode-c-32.cpp;gen-bitcode-c-64.cpp + gen-bitcode-c-32.cpp;gen-bitcode-c-64.cpp @@ -75,105 +79,148 @@ Document - %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp - gen-stdlib.cpp - %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp - gen-stdlib.cpp - Building gen-stdlib.cpp - Building gen-stdlib.cpp + %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py x86 > gen-stdlib-x86.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DISPC_TARGET_GENERIC=1 -DPI=3.1415926535 | python stdlib2cpp.py generic > gen-stdlib-generic.cpp; + + gen-stdlib-generic.cpp;gen-stdlib-x86.cpp + %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py x86 > gen-stdlib-x86.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DISPC_TARGET_GENERIC=1 -DPI=3.1415926535 | python stdlib2cpp.py generic > gen-stdlib-generic.cpp; + + gen-stdlib-generic.cpp;gen-stdlib-x86.cpp + Building gen-stdlib-{generic,x86}.cpp + Building gen-stdlib-{generic,x86}.cpp - + Document - m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll > gen-bitcode-sse4.cpp - gen-bitcode-sse4.cpp - builtins.m4;builtins-sse4-common.ll - m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll > gen-bitcode-sse4.cpp - gen-bitcode-sse4.cpp - builtins.m4;builtins-sse4-common.ll - Building gen-bitcode-sse4.cpp - Building gen-bitcode-sse4.cpp - - - - - Document - m4 builtins.m4 builtins-dispatch.ll | python bitcode2cpp.py builtins-dispatch.ll > gen-bitcode-dispatch.cpp + m4 -Ibuiltins/ 
-DLLVM_VERSION=%LLVM_VERSION% builtins\dispatch.ll | python bitcode2cpp.py dispatch.ll > gen-bitcode-dispatch.cpp gen-bitcode-dispatch.cpp - builtins.m4 - m4 builtins.m4 builtins-dispatch.ll | python bitcode2cpp.py builtins-dispatch.ll > gen-bitcode-dispatch.cpp + builtins\util.m4 + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\dispatch.ll | python bitcode2cpp.py dispatch.ll > gen-bitcode-dispatch.cpp gen-bitcode-dispatch.cpp - builtins.m4 + builtins\util.m4 Building gen-bitcode-dispatch.cpp Building gen-bitcode-dispatch.cpp - + Document - m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll > gen-bitcode-sse4-x2.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll > gen-bitcode-sse4.cpp + gen-bitcode-sse4.cpp + builtins\util.m4;builtins\target-sse4-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll > gen-bitcode-sse4.cpp + gen-bitcode-sse4.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4.cpp + Building gen-bitcode-sse4.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll > gen-bitcode-sse4-x2.cpp gen-bitcode-sse4-x2.cpp - builtins.m4;builtins-sse4-common.ll - m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll > gen-bitcode-sse4-x2.cpp + builtins\util.m4;builtins\target-sse4-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll > gen-bitcode-sse4-x2.cpp gen-bitcode-sse4-x2.cpp - builtins.m4;builtins-sse4-common.ll + builtins\util.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-x2.cpp Building gen-bitcode-sse4-x2.cpp - + Document - m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll > gen-bitcode-sse2.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll > gen-bitcode-sse2.cpp gen-bitcode-sse2.cpp - builtins.m4;builtins-sse2-common.ll - m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll > gen-bitcode-sse2.cpp + builtins\util.m4;builtins\target-sse2-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll > gen-bitcode-sse2.cpp gen-bitcode-sse2.cpp - builtins.m4;builtins-sse2-common.ll + builtins\util.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2.cpp Building gen-bitcode-sse2.cpp - + Document - m4 builtins.m4 builtins-sse2-x2.ll | python bitcode2cpp.py builtins-sse2-x2.ll > gen-bitcode-sse2-x2.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll > gen-bitcode-sse2-x2.cpp gen-bitcode-sse2-x2.cpp - builtins.m4;builtins-sse2-common.ll - m4 builtins.m4 builtins-sse2-x2.ll | python bitcode2cpp.py builtins-sse2-x2.ll > gen-bitcode-sse2-x2.cpp + builtins\util.m4;builtins\target-sse2-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll > gen-bitcode-sse2-x2.cpp gen-bitcode-sse2-x2.cpp - builtins.m4;builtins-sse2-common.ll + builtins\util.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-x2.cpp Building gen-bitcode-sse2-x2.cpp - + Document - m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll > 
gen-bitcode-avx.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx.ll | python bitcode2cpp.py builtins\target-avx.ll > gen-bitcode-avx.cpp gen-bitcode-avx.cpp - builtins.m4;builtins-avx-common.ll - m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll > gen-bitcode-avx.cpp + builtins\util.m4;builtins\target-avx-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx.ll | python bitcode2cpp.py builtins\target-avx.ll > gen-bitcode-avx.cpp gen-bitcode-avx.cpp - builtins.m4;builtins-avx-common.ll + builtins\util.m4;builtins\target-avx-common.ll Building gen-bitcode-avx.cpp Building gen-bitcode-avx.cpp - + Document - m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll > gen-bitcode-avx-x2.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx-x2.ll | python bitcode2cpp.py builtins\target-avx-x2.ll > gen-bitcode-avx-x2.cpp gen-bitcode-avx-x2.cpp - builtins.m4;builtins-sse.ll - m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll > gen-bitcode-avx-x2.cpp + builtins\util.m4;builtins\target-avx-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx-x2.ll | python bitcode2cpp.py builtins\target-avx-x2.ll > gen-bitcode-avx-x2.cpp gen-bitcode-avx-x2.cpp - builtins.m4;builtins-sse.ll + builtins\util.m4;builtins\target-avx-common.ll Building gen-bitcode-avx-x2.cpp Building gen-bitcode-avx-x2.cpp + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll > gen-bitcode-generic-4.cpp + gen-bitcode-generic-4.cpp + builtins\util.m4;builtins\target-generic-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll > gen-bitcode-generic-4.cpp + gen-bitcode-generic-4.cpp + builtins\util.m4;builtins\target-generic-common.ll + Building gen-bitcode-generic-4.cpp + Building gen-bitcode-generic-4.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll > gen-bitcode-generic-8.cpp + gen-bitcode-generic-8.cpp + builtins\util.m4;builtins\target-generic-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll > gen-bitcode-generic-8.cpp + gen-bitcode-generic-8.cpp + builtins\util.m4;builtins\target-generic-common.ll + Building gen-bitcode-generic-8.cpp + Building gen-bitcode-generic-8.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll > gen-bitcode-generic-16.cpp + gen-bitcode-generic-16.cpp + builtins\util.m4;builtins\target-generic-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll > gen-bitcode-generic-16.cpp + gen-bitcode-generic-16.cpp + builtins\util.m4;builtins\target-generic-common.ll + Building gen-bitcode-generic-16.cpp + Building gen-bitcode-generic-16.cpp + + Document diff --git a/llvmutil.cpp b/llvmutil.cpp index 6c440a91..4a50e337 100644 --- a/llvmutil.cpp +++ b/llvmutil.cpp @@ -105,11 +105,14 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) { LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0); LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0); - // Note that both the mask and 
bool vectors are vector of int32s - // (not i1s). LLVM ends up generating much better SSE code with - // this representation. - LLVMTypes::MaskType = LLVMTypes::BoolVectorType = - llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.vectorWidth); + if (target.maskBitCount == 1) + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.vectorWidth); + else { + assert(target.maskBitCount == 32); + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.vectorWidth); + } LLVMTypes::Int1VectorType = llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.vectorWidth); @@ -141,7 +144,11 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) { std::vector maskOnes; llvm::Constant *onMask = NULL; - onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1, + if (target.maskBitCount == 1) + onMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 1, + false /*unsigned*/); // 0x1 + else + onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1, true /*signed*/); // 0xffffffff for (int i = 0; i < target.vectorWidth; ++i) @@ -150,8 +157,12 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) { std::vector maskZeros; llvm::Constant *offMask = NULL; - offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0, - true /*signed*/); + if (target.maskBitCount == 1) + offMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 0, + true /*signed*/); + else + offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0, + true /*signed*/); for (int i = 0; i < target.vectorWidth; ++i) maskZeros.push_back(offMask); diff --git a/module.cpp b/module.cpp index 9fade4b9..5dc9b160 100644 --- a/module.cpp +++ b/module.cpp @@ -1158,22 +1158,14 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre opts.addMacroDef("PI=3.1415926535"); // Add #define for current compilation target - switch (g->target.isa) { - case Target::SSE2: - opts.addMacroDef("ISPC_TARGET_SSE2"); - break; - case Target::SSE4: - opts.addMacroDef("ISPC_TARGET_SSE4"); - break; - case Target::AVX: - opts.addMacroDef("ISPC_TARGET_AVX"); - break; - case Target::AVX2: - opts.addMacroDef("ISPC_TARGET_AVX2"); - break; - default: - FATAL("Unhandled target ISA in preprocessor symbol definition"); + char targetMacro[128]; + sprintf(targetMacro, "ISPC_TARGET_%s", g->target.GetISAString()); + char *p = targetMacro; + while (*p) { + *p = toupper(*p); + ++p; } + opts.addMacroDef(targetMacro); if (g->target.is32Bit) opts.addMacroDef("ISPC_POINTER_SIZE=32"); diff --git a/opt.cpp b/opt.cpp index c77a76f7..17458a06 100644 --- a/opt.cpp +++ b/opt.cpp @@ -2444,7 +2444,7 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { int count = sizeof(names) / sizeof(names[0]); for (int i = 0; i < count; ++i) { llvm::Function *f = m->module->getFunction(names[i]); - if (f != NULL) { + if (f != NULL && f->empty() == false) { f->setLinkage(llvm::GlobalValue::InternalLinkage); modifiedAny = true; } diff --git a/parse.yy b/parse.yy index 8510244a..70cb2b3f 100644 --- a/parse.yy +++ b/parse.yy @@ -1605,7 +1605,8 @@ lAddFunctionParams(Declarator *decl) { /** Add a symbol for the built-in mask variable to the symbol table */ static void lAddMaskToSymbolTable(SourcePos pos) { - const Type *t = AtomicType::VaryingConstUInt32; + const Type *t = g->target.isa == Target::GENERIC ? 
+ AtomicType::VaryingConstBool : AtomicType::VaryingConstUInt32; Symbol *maskSymbol = new Symbol("__mask", pos, t); m->symbolTable->AddVariable(maskSymbol); } diff --git a/stdlib.ispc b/stdlib.ispc index 1a804733..c3b02fa7 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -38,6 +38,14 @@ ispc code */ +#ifdef ISPC_TARGET_GENERIC +#define IntMaskType bool +#define UIntMaskType bool +#else +#define IntMaskType int32 +#define UIntMaskType unsigned int32 +#endif + /////////////////////////////////////////////////////////////////////////// // Low level primitives @@ -274,13 +282,21 @@ static inline int32 sign_extend(bool v) { static inline uniform bool any(bool v) { // We only care about whether "any" is true for the active program instances, // so we have to make v with the current program mask. +#ifdef ISPC_TARGET_GENERIC + return __movmsk(v & __mask) != 0; +#else return __movmsk(__sext_varying_bool(v) & __mask) != 0; +#endif } static inline uniform bool all(bool v) { // As with any(), we need to explicitly mask v with the current program mask // so we're only looking at the current lanes +#ifdef ISPC_TARGET_GENERIC + bool match = ((v & __mask) == __mask); +#else int32 match = __sext_varying_bool((__sext_varying_bool(v) & __mask) == __mask); +#endif return __movmsk(match) == (1 << programCount) - 1; } @@ -308,7 +324,11 @@ static inline int popcnt(int64 v) { static inline uniform int popcnt(bool v) { // As with any() and all(), only count across the active lanes +#ifdef ISPC_TARGET_GENERIC + return __popcnt_int32(__movmsk(v & __mask)); +#else return __popcnt_int32(__movmsk(__sext_varying_bool(v) & __mask)); +#endif } static inline uniform int lanemask() { @@ -672,19 +692,19 @@ static inline uniform bool reduce_equal(TYPE v, uniform TYPE * uniform value) { return __reduce_equal_##FUNCTYPE(v, value, (MASKTYPE)__mask); \ } -REDUCE_EQUAL(int32, int32, int32) -REDUCE_EQUAL(unsigned int32, int32, unsigned int32) -REDUCE_EQUAL(float, float, int32) -REDUCE_EQUAL(int64, int64, int32) -REDUCE_EQUAL(unsigned int64, int64, unsigned int32) -REDUCE_EQUAL(double, double, int32) +REDUCE_EQUAL(int32, int32, IntMaskType) +REDUCE_EQUAL(unsigned int32, int32, UIntMaskType) +REDUCE_EQUAL(float, float, IntMaskType) +REDUCE_EQUAL(int64, int64, IntMaskType) +REDUCE_EQUAL(unsigned int64, int64, UIntMaskType) +REDUCE_EQUAL(double, double, IntMaskType) static int32 exclusive_scan_add(int32 v) { - return __exclusive_scan_add_i32(v, (int32)__mask); + return __exclusive_scan_add_i32(v, (IntMaskType)__mask); } static unsigned int32 exclusive_scan_add(unsigned int32 v) { - return __exclusive_scan_add_i32(v, __mask); + return __exclusive_scan_add_i32((int32)v, (IntMaskType)__mask); } static float exclusive_scan_add(float v) { @@ -692,11 +712,11 @@ static float exclusive_scan_add(float v) { } static int64 exclusive_scan_add(int64 v) { - return __exclusive_scan_add_i64(v, (int32)__mask); + return __exclusive_scan_add_i64(v, (IntMaskType)__mask); } static unsigned int64 exclusive_scan_add(unsigned int64 v) { - return __exclusive_scan_add_i64(v, __mask); + return __exclusive_scan_add_i64(v, (UIntMaskType)__mask); } static double exclusive_scan_add(double v) { @@ -704,35 +724,35 @@ static double exclusive_scan_add(double v) { } static int32 exclusive_scan_and(int32 v) { - return __exclusive_scan_and_i32(v, (int32)__mask); + return __exclusive_scan_and_i32(v, (IntMaskType)__mask); } static unsigned int32 exclusive_scan_and(unsigned int32 v) { - return __exclusive_scan_and_i32(v, __mask); + return __exclusive_scan_and_i32(v, 
(UIntMaskType)__mask); } static int64 exclusive_scan_and(int64 v) { - return __exclusive_scan_and_i64(v, (int32)__mask); + return __exclusive_scan_and_i64(v, (IntMaskType)__mask); } static unsigned int64 exclusive_scan_and(unsigned int64 v) { - return __exclusive_scan_and_i64(v, __mask); + return __exclusive_scan_and_i64(v, (UIntMaskType)__mask); } static int32 exclusive_scan_or(int32 v) { - return __exclusive_scan_or_i32(v, (int32)__mask); + return __exclusive_scan_or_i32(v, (IntMaskType)__mask); } static unsigned int32 exclusive_scan_or(unsigned int32 v) { - return __exclusive_scan_or_i32(v, __mask); + return __exclusive_scan_or_i32(v, (UIntMaskType)__mask); } static int64 exclusive_scan_or(int64 v) { - return __exclusive_scan_or_i64(v, (int32)__mask); + return __exclusive_scan_or_i64(v, (IntMaskType)__mask); } static unsigned int64 exclusive_scan_or(unsigned int64 v) { - return __exclusive_scan_or_i64(v, __mask); + return __exclusive_scan_or_i64(v, (UIntMaskType)__mask); } /////////////////////////////////////////////////////////////////////////// @@ -741,23 +761,23 @@ static unsigned int64 exclusive_scan_or(unsigned int64 v) { static inline uniform int packed_load_active(uniform unsigned int * uniform a, unsigned int * uniform vals) { - return __packed_load_active(a, vals, (unsigned int32)__mask); + return __packed_load_active(a, vals, (UIntMaskType)__mask); } static inline uniform int packed_store_active(uniform unsigned int * uniform a, unsigned int vals) { - return __packed_store_active(a, vals, (unsigned int32)__mask); + return __packed_store_active(a, vals, (UIntMaskType)__mask); } static inline uniform int packed_load_active(uniform int * uniform a, int * uniform vals) { - return __packed_load_active(a, vals, (int32)__mask); + return __packed_load_active(a, vals, (IntMaskType)__mask); } static inline uniform int packed_store_active(uniform int * uniform a, int vals) { - return __packed_store_active(a, vals, (int32)__mask); + return __packed_store_active(a, vals, (IntMaskType)__mask); } /////////////////////////////////////////////////////////////////////////// @@ -848,49 +868,49 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \ return ret; \ } -DEFINE_ATOMIC_OP(int32,int32,add,add,int32) -DEFINE_ATOMIC_OP(int32,int32,subtract,sub,int32) -DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,int32) -DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,int32) -DEFINE_ATOMIC_OP(int32,int32,and,and,int32) -DEFINE_ATOMIC_OP(int32,int32,or,or,int32) -DEFINE_ATOMIC_OP(int32,int32,xor,xor,int32) -DEFINE_ATOMIC_OP(int32,int32,swap,swap,int32) +DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType) +DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType) +DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType) +DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType) +DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType) +DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType) +DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType) +DEFINE_ATOMIC_OP(int32,int32,swap,swap,IntMaskType) // For everything but atomic min and max, we can use the same // implementations for unsigned as for signed. 
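The comment above holds because two's-complement add, subtract, and, or, xor, swap, and compare-exchange produce identical bit patterns regardless of signedness; only the min/max comparisons differ, which is why the separate umin/umax builtins exist. A small C++ illustration of that distinction:

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
    int32_t  sa = -1,          sb = 1;    // same bit patterns as ua, ub below
    uint32_t ua = 0xFFFFFFFFu, ub = 1u;
    assert(uint32_t(sa + sb) == ua + ub); // add matches bit-for-bit (both 0)
    assert(std::min(sa, sb) == -1);       // signed min picks the 0xFFFFFFFF lane value
    assert(std::min(ua, ub) == 1u);       // unsigned min picks 0x00000001 instead
    return 0;
}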
-DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,unsigned int32) -DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,unsigned int32) -DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,unsigned int32) +DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType) +DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType) +DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,UIntMaskType) -DEFINE_ATOMIC_OP(float,float,swap,swap,int32) +DEFINE_ATOMIC_OP(float,float,swap,swap,IntMaskType) -DEFINE_ATOMIC_OP(int64,int64,add,add,int32) -DEFINE_ATOMIC_OP(int64,int64,subtract,sub,int32) -DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,int32) -DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,int32) -DEFINE_ATOMIC_OP(int64,int64,and,and,int32) -DEFINE_ATOMIC_OP(int64,int64,or,or,int32) -DEFINE_ATOMIC_OP(int64,int64,xor,xor,int32) -DEFINE_ATOMIC_OP(int64,int64,swap,swap,int32) +DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType) +DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType) +DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType) +DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType) +DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType) +DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType) +DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType) +DEFINE_ATOMIC_OP(int64,int64,swap,swap,IntMaskType) // For everything but atomic min and max, we can use the same // implementations for unsigned as for signed. 
-DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,unsigned int32) -DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,unsigned int32) -DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,unsigned int32) +DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType) +DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType) +DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,UIntMaskType) -DEFINE_ATOMIC_OP(double,double,swap,swap,int32) +DEFINE_ATOMIC_OP(double,double,swap,swap,IntMaskType) #undef DEFINE_ATOMIC_OP @@ -913,12 +933,12 @@ static inline uniform TA atomic_compare_exchange_global( \ return ret; \ } -ATOMIC_DECL_CMPXCHG(int32, int32, int32) -ATOMIC_DECL_CMPXCHG(unsigned int32, int32, unsigned int32) -ATOMIC_DECL_CMPXCHG(float, float, int32) -ATOMIC_DECL_CMPXCHG(int64, int64, int32) -ATOMIC_DECL_CMPXCHG(unsigned int64, int64, unsigned int32) -ATOMIC_DECL_CMPXCHG(double, double, int32) +ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType) +ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType) +ATOMIC_DECL_CMPXCHG(float, float, IntMaskType) +ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType) +ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType) +ATOMIC_DECL_CMPXCHG(double, double, IntMaskType) #undef ATOMIC_DECL_CMPXCHG diff --git a/stdlib2cpp.py b/stdlib2cpp.py index 132f8257..6fa5fc2e 100755 --- a/stdlib2cpp.py +++ b/stdlib2cpp.py @@ -2,7 +2,9 @@ import sys -print "char stdlib_code[] = { " +t=str(sys.argv[1]) + +print "char stdlib_" + t + "_code[] = { " for line in sys.stdin: for c in line: diff --git a/stmt.cpp b/stmt.cpp index e799fc0b..95142abe 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ -622,9 +622,6 @@ IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask, /** Given an AST node, check to see if it's safe if we happen to run the code for that node with the execution mask all off. - - FIXME: this is actually a target-specific thing; for non SSE/AVX - targets with more complete masking support, some of this won't apply... */ static bool lCheckAllOffSafety(ASTNode *node, void *data) { @@ -648,6 +645,11 @@ lCheckAllOffSafety(ASTNode *node, void *data) { return false; } + if (g->target.allOffMaskIsSafe == true) + // Don't worry about memory accesses if we have a target that can + // safely run them with the mask all off + return true; + IndexExpr *ie; if ((ie = dynamic_cast(node)) != NULL && ie->baseExpr != NULL) { const Type *type = ie->baseExpr->GetType();
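On the allOffMaskIsSafe early return added to lCheckAllOffSafety above: the hazard it sidesteps on the generic targets is that SSE/AVX-style code emulates a masked memory access with unconditional loads followed by a blend, so running a statement with the mask all off can still dereference a pointer that the program only guarantees for active instances; a target with real per-lane masking never issues the inactive accesses. A hedged C++ sketch of the emulated form; the function and parameter names are illustrative only:

// What a blend-emulated masked load boils down to on a target without native
// masking: the load happens whether or not the lane is active, and the result
// is only discarded afterwards. With an all-off mask this can still fault.
float emulated_masked_load(const float *base, int index, bool lane_active,
                           float old_value) {
    float loaded = base[index];              // executes even if !lane_active
    return lane_active ? loaded : old_value; // blend selects after the fact
}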