diff --git a/Makefile b/Makefile
index 54734f39..f2e18543 100644
--- a/Makefile
+++ b/Makefile
@@ -62,14 +62,17 @@
 CXX_SRC=ast.cpp builtins.cpp ctx.cpp decl.cpp expr.cpp func.cpp ispc.cpp \
 	util.cpp
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
-BUILTINS_SRC=builtins-avx.ll builtins-avx-x2.ll builtins-sse2.ll builtins-sse2-x2.ll \
-	builtins-sse4.ll builtins-sse4-x2.ll builtins-dispatch.ll
+TARGETS=avx avx-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 generic-16
+BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \
+	builtins/dispatch.ll
+BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) \
+	builtins-c-32.cpp builtins-c-64.cpp
 BISON_SRC=parse.yy
 FLEX_SRC=lex.ll
 
-OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_SRC:.ll=.o) \
-	builtins-c-32.o builtins-c-64.o stdlib_ispc.o $(BISON_SRC:.yy=.o) \
-	$(FLEX_SRC:.ll=.o))
+OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \
+	stdlib_generic_ispc.o stdlib_x86_ispc.o \
+	$(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))
 
 default: ispc
 
@@ -104,6 +107,10 @@ objs/%.o: %.cpp
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<
 
+objs/%.o: objs/%.cpp
+	@echo Compiling $<
+	@$(CXX) $(CXXFLAGS) -o $@ -c $<
+
 objs/parse.cc: parse.yy
 	@echo Running bison on $<
 	@$(YACC) -o $@ $<
@@ -120,41 +127,24 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<
 
-objs/builtins-%.cpp: builtins-%.ll
-	@echo Creating C++ source from builtin definitions file $<
-	@m4 -DLLVM_VERSION=$(LLVM_VERSION) builtins.m4 $< | ./bitcode2cpp.py $< > $@
-
-objs/builtins-%.o: objs/builtins-%.cpp
-	@echo Compiling $<
-	@$(CXX) $(CXXFLAGS) -o $@ -c $<
-
-objs/builtins-c-32.cpp: builtins-c.c
+objs/builtins-%.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll)
 	@echo Creating C++ source from builtins definition file $<
-	@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-32.c > $@
+	@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) $< | ./bitcode2cpp.py $< > $@
 
-objs/builtins-c-32.o: objs/builtins-c-32.cpp
-	@echo Compiling $<
-	@$(CXX) $(CXXFLAGS) -o $@ -c $<
-
-objs/builtins-c-64.cpp: builtins-c.c
+objs/builtins-c-32.cpp: builtins/builtins.c
 	@echo Creating C++ source from builtins definition file $<
-	@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py builtins-c-64.c > $@
+	@$(CLANG) -m32 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py c-32 > $@
 
-objs/builtins-c-64.o: objs/builtins-c-64.cpp
-	@echo Compiling $<
-	@$(CXX) $(CXXFLAGS) -o $@ -c $<
+objs/builtins-c-64.cpp: builtins/builtins.c
+	@echo Creating C++ source from builtins definition file $<
+	@$(CLANG) -m64 -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py c-64 > $@
 
-objs/stdlib_ispc.cpp: stdlib.ispc
-	@echo Creating C++ source from $<
-	@$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | ./stdlib2cpp.py > $@
+objs/stdlib_generic_ispc.cpp: stdlib.ispc
+	@echo Creating C++ source from $< for generic
+	@$(CLANG) -E -x c -DISPC_TARGET_GENERIC=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \
+		./stdlib2cpp.py generic > $@
 
-objs/stdlib_ispc.o: objs/stdlib_ispc.cpp
-	@echo Compiling $<
-	@$(CXX) $(CXXFLAGS) -o $@ -c $<
-
-objs/builtins-sse2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2.ll
-objs/builtins-sse2-x2.cpp: builtins.m4 builtins-sse2-common.ll builtins-sse2-x2.ll
-objs/builtins-sse4.cpp: builtins.m4 builtins-sse4-common.ll builtins-sse4.ll
-objs/builtins-sse4-x2.cpp: builtins.m4
builtins-sse4-common.ll builtins-sse4-x2.ll -objs/builtins-avx.cpp: builtins.m4 builtins-avx-common.ll builtins-avx.ll -objs/builtins-avx-x2.cpp: builtins.m4 builtins-avx-common.ll builtins-avx-x2.ll +objs/stdlib_x86_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for x86 + @$(CLANG) -E -x c -DISPC=1 -DPI=3.1415926536 $< -o - | \ + ./stdlib2cpp.py x86 > $@ diff --git a/bitcode2cpp.py b/bitcode2cpp.py index fa7d4782..a1a5d2bf 100755 --- a/bitcode2cpp.py +++ b/bitcode2cpp.py @@ -11,7 +11,8 @@ length=0 src=str(sys.argv[1]) -target = re.sub(".*builtins-", "", src) +target = re.sub("builtins/target-", "", src) +target = re.sub("builtins/", "", target) target = re.sub("\.ll$", "", target) target = re.sub("\.c$", "", target) target = re.sub("-", "_", target) diff --git a/builtins.cpp b/builtins.cpp index 5358e789..9bd41e8f 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -99,6 +99,9 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) { return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64; // varying + if (LLVMTypes::MaskType != LLVMTypes::Int32VectorType && + t == LLVMTypes::MaskType) + return AtomicType::VaryingBool; else if (t == LLVMTypes::Int8VectorType) return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8; else if (t == LLVMTypes::Int16VectorType) @@ -194,7 +197,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) { // symbol creation code below assumes that any LLVM vector of i32s is a // varying int32. Here, we need that to be interpreted as a varying // bool, so just have a one-off override for that one... - if (name == "__sext_varying_bool") { + if (g->target.maskBitCount != 1 && name == "__sext_varying_bool") { const Type *returnType = AtomicType::VaryingInt32; std::vector argTypes; argTypes.push_back(AtomicType::VaryingBool); @@ -556,7 +559,7 @@ lSetInternalFunctions(llvm::Module *module) { int count = sizeof(names) / sizeof(names[0]); for (int i = 0; i < count; ++i) { llvm::Function *f = module->getFunction(names[i]); - if (f != NULL) + if (f != NULL && f->empty() == false) f->setLinkage(llvm::GlobalValue::InternalLinkage); } } @@ -744,6 +747,33 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod FATAL("logic error in DefineStdlib"); } break; + case Target::GENERIC: + switch (g->target.vectorWidth) { + case 4: + extern unsigned char builtins_bitcode_generic_4[]; + extern int builtins_bitcode_generic_4_length; + AddBitcodeToModule(builtins_bitcode_generic_4, + builtins_bitcode_generic_4_length, + module, symbolTable); + break; + case 8: + extern unsigned char builtins_bitcode_generic_8[]; + extern int builtins_bitcode_generic_8_length; + AddBitcodeToModule(builtins_bitcode_generic_8, + builtins_bitcode_generic_8_length, + module, symbolTable); + break; + case 16: + extern unsigned char builtins_bitcode_generic_16[]; + extern int builtins_bitcode_generic_16_length; + AddBitcodeToModule(builtins_bitcode_generic_16, + builtins_bitcode_generic_16_length, + module, symbolTable); + break; + default: + FATAL("logic error in DefineStdlib"); + } + break; default: FATAL("logic error"); } @@ -771,11 +801,16 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod if (includeStdlibISPC) { // If the user wants the standard library to be included, parse the // serialized version of the stdlib.ispc file to get its - // definitions added. 
Disable emission of performance warnings for - // now, since the user doesn't care about any of that in the stdlib - // implementation... - extern char stdlib_code[]; - yy_scan_string(stdlib_code); - yyparse(); + // definitions added. + if (g->target.isa == Target::GENERIC) { + extern char stdlib_generic_code[]; + yy_scan_string(stdlib_generic_code); + yyparse(); + } + else { + extern char stdlib_x86_code[]; + yy_scan_string(stdlib_x86_code); + yyparse(); + } } } diff --git a/builtins-c.c b/builtins/builtins.c similarity index 100% rename from builtins-c.c rename to builtins/builtins.c diff --git a/builtins-dispatch.ll b/builtins/dispatch.ll similarity index 100% rename from builtins-dispatch.ll rename to builtins/dispatch.ll diff --git a/builtins-avx-common.ll b/builtins/target-avx-common.ll similarity index 99% rename from builtins-avx-common.ll rename to builtins/target-avx-common.ll index 6b08466d..07fb12b4 100644 --- a/builtins-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -32,6 +32,9 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; AVX target implementation. +ctlztz() +define_prefetches() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins-avx-x2.ll b/builtins/target-avx-x2.ll similarity index 99% rename from builtins-avx-x2.ll rename to builtins/target-avx-x2.ll index 6254c405..90e2680c 100644 --- a/builtins-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -32,12 +32,16 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Basic 16-wide definitions -stdlib_core(16) -packed_load_and_store(16) -scans(16) -int64minmax(16) +define(`WIDTH',`16') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-avx-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-avx-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins-avx.ll b/builtins/target-avx.ll similarity index 99% rename from builtins-avx.ll rename to builtins/target-avx.ll index a00a527e..dc7339bd 100644 --- a/builtins-avx.ll +++ b/builtins/target-avx.ll @@ -32,12 +32,16 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Basic 8-wide definitions -stdlib_core(8) -packed_load_and_store(8) -scans(8) -int64minmax(8) +define(`WIDTH',`8') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-avx-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-avx-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-generic-16.ll b/builtins/target-generic-16.ll new file mode 100644 index 00000000..807fd242 --- /dev/null +++ b/builtins/target-generic-16.ll @@ -0,0 +1,34 @@ +;; Copyright (c) 2010-2011, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
+;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`16') +include(`target-generic-common.ll') + diff --git a/builtins/target-generic-4.ll b/builtins/target-generic-4.ll new file mode 100644 index 00000000..7eb1f300 --- /dev/null +++ b/builtins/target-generic-4.ll @@ -0,0 +1,34 @@ +;; Copyright (c) 2010-2011, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`4') +include(`target-generic-common.ll') + diff --git a/builtins/target-generic-8.ll b/builtins/target-generic-8.ll new file mode 100644 index 00000000..bd9261ff --- /dev/null +++ b/builtins/target-generic-8.ll @@ -0,0 +1,34 @@ +;; Copyright (c) 2010-2011, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. 
+;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`8') +include(`target-generic-common.ll') + diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll new file mode 100644 index 00000000..b59e8d53 --- /dev/null +++ b/builtins/target-generic-common.ll @@ -0,0 +1,277 @@ +;; Copyright (c) 2010-2011, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +define(`MASK',`i1') +include(`util.m4') + +stdlib_core() + +scans() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +declare void @__fastmath() nounwind + +;; round/floor/ceil + +declare float @__round_uniform_float(float) nounwind readnone +declare float @__floor_uniform_float(float) nounwind readnone +declare float @__ceil_uniform_float(float) nounwind readnone + +declare double @__round_uniform_double(double) nounwind readnone +declare double @__floor_uniform_double(double) nounwind readnone +declare double @__ceil_uniform_double(double) nounwind readnone + +declare @__round_varying_float() nounwind readnone +declare @__floor_varying_float() nounwind readnone +declare @__ceil_varying_float() nounwind readnone +declare @__round_varying_double() nounwind readnone +declare @__floor_varying_double() nounwind readnone +declare @__ceil_varying_double() nounwind readnone + +;; min/max + +declare float @__max_uniform_float(float, float) nounwind readnone +declare float @__min_uniform_float(float, float) nounwind readnone +declare i32 @__min_uniform_int32(i32, i32) nounwind readnone +declare i32 @__max_uniform_int32(i32, i32) nounwind readnone +declare i32 @__min_uniform_uint32(i32, i32) nounwind readnone +declare i32 @__max_uniform_uint32(i32, i32) nounwind readnone +declare i64 @__min_uniform_int64(i64, i64) nounwind readnone +declare i64 @__max_uniform_int64(i64, i64) nounwind readnone +declare i64 @__min_uniform_uint64(i64, i64) nounwind readnone +declare i64 @__max_uniform_uint64(i64, i64) nounwind readnone +declare double @__min_uniform_double(double, double) nounwind readnone +declare double @__max_uniform_double(double, double) nounwind readnone + +declare @__max_varying_float(, + ) nounwind readnone +declare @__min_varying_float(, + ) nounwind readnone +declare @__min_varying_int32(, ) nounwind readnone +declare @__max_varying_int32(, ) nounwind readnone +declare @__min_varying_uint32(, ) nounwind readnone +declare @__max_varying_uint32(, ) nounwind readnone +declare @__min_varying_int64(, ) nounwind readnone +declare @__max_varying_int64(, ) nounwind readnone +declare @__min_varying_uint64(, ) nounwind readnone +declare @__max_varying_uint64(, ) nounwind readnone +declare @__min_varying_double(, + ) nounwind readnone +declare @__max_varying_double(, + ) nounwind readnone + +;; sqrt/rsqrt/rcp + +declare float @__rsqrt_uniform_float(float) nounwind readnone +declare float @__rcp_uniform_float(float) nounwind readnone +declare float @__sqrt_uniform_float(float) nounwind readnone +declare @__rcp_varying_float() nounwind readnone +declare @__rsqrt_varying_float( %v) nounwind readnone +declare @__sqrt_varying_float() nounwind readnone + +declare double @__sqrt_uniform_double(double) nounwind readnone +declare @__sqrt_varying_double() nounwind readnone + +;; bit ops + +declare i32 @__popcnt_int32(i32) nounwind readnone +declare i64 @__popcnt_int64(i64) nounwind readnone + +declare i32 @__count_trailing_zeros_i32(i32) nounwind readnone +declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone +declare i32 @__count_leading_zeros_i32(i32) nounwind readnone +declare i64 @__count_leading_zeros_i64(i64) nounwind readnone + +;; svml + +; FIXME: need either to wire these up to the 8-wide SVML entrypoints, +; or, use the macro to call the 4-wide ones twice with our 8-wide +; vectors... 
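;; Illustrative sketch, not part of the patch: the declarations in this file are
;; written against the m4 WIDTH macro (with MASK set to i1 above), so the same
;; source serves the 4-, 8-, and 16-wide generic targets. Assuming WIDTH has been
;; set to 4, a varying declaration such as __round_varying_float comes out of the
;; m4 expansion along these lines:

declare <4 x float> @__round_varying_float(<4 x float>) nounwind readnone
declare <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readnone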
+ +declare @__svml_sin() +declare @__svml_cos() +declare void @__svml_sincos(, *, *) +declare @__svml_tan() +declare @__svml_atan() +declare @__svml_atan2(, ) +declare @__svml_exp() +declare @__svml_log() +declare @__svml_pow(, ) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; reductions + +declare i32 @__movmsk() nounwind readnone + +declare float @__reduce_add_float() nounwind readnone +declare float @__reduce_min_float() nounwind readnone +declare float @__reduce_max_float() nounwind readnone + +declare i32 @__reduce_add_int32() nounwind readnone +declare i32 @__reduce_min_int32() nounwind readnone +declare i32 @__reduce_max_int32() nounwind readnone + +declare i32 @__reduce_add_uint32( %v) nounwind readnone +declare i32 @__reduce_min_uint32() nounwind readnone +declare i32 @__reduce_max_uint32() nounwind readnone + +declare double @__reduce_add_double() nounwind readnone +declare double @__reduce_min_double() nounwind readnone +declare double @__reduce_max_double() nounwind readnone + +declare i64 @__reduce_add_int64() nounwind readnone +declare i64 @__reduce_min_int64() nounwind readnone +declare i64 @__reduce_max_int64() nounwind readnone + +declare i64 @__reduce_add_uint64( %v) nounwind readnone +declare i64 @__reduce_min_uint64() nounwind readnone +declare i64 @__reduce_max_uint64() nounwind readnone + +declare i1 @__reduce_equal_int32( %v, i32 * nocapture %samevalue, + %mask) nounwind +declare i1 @__reduce_equal_float( %v, float * nocapture %samevalue, + %mask) nounwind +declare i1 @__reduce_equal_int64( %v, i64 * nocapture %samevalue, + %mask) nounwind +declare i1 @__reduce_equal_double( %v, double * nocapture %samevalue, + %mask) nounwind + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +load_and_broadcast(WIDTH, i8, 8) +load_and_broadcast(WIDTH, i16, 16) +load_and_broadcast(WIDTH, i32, 32) +load_and_broadcast(WIDTH, i64, 64) + +declare @__load_masked_8(i8 * nocapture, %mask) nounwind readonly +declare @__load_masked_16(i8 * nocapture, %mask) nounwind readonly +declare @__load_masked_32(i8 * nocapture, %mask) nounwind readonly +declare @__load_masked_64(i8 * nocapture, %mask) nounwind readonly + +declare void @__masked_store_8(* nocapture, , + ) nounwind +declare void @__masked_store_16(* nocapture, , + ) nounwind +declare void @__masked_store_32(* nocapture, , + ) nounwind +declare void @__masked_store_64(* nocapture, , + %mask) nounwind + +ifelse(LLVM_VERSION,LLVM_3_1svn,` +define void @__masked_store_blend_8(* nocapture, , + ) nounwind { + %v = load * %0 + %v1 = select %2, %1, %v + store %v1, * %0 + ret void +} + +define void @__masked_store_blend_16(* nocapture, , + ) nounwind { + %v = load * %0 + %v1 = select %2, %1, %v + store %v1, * %0 + ret void +} + +define void @__masked_store_blend_32(* nocapture, , + ) nounwind { + %v = load * %0 + %v1 = select %2, %1, %v + store %v1, * %0 + ret void +} + +define void @__masked_store_blend_64(* nocapture, + , ) nounwind { + %v = load * %0 + %v1 = select %2, %1, %v + store %v1, * %0 + ret void +} +',` +declare void @__masked_store_blend_8(* nocapture, , + ) nounwind +declare void @__masked_store_blend_16(* nocapture, , + ) nounwind +declare void @__masked_store_blend_32(* nocapture, , + ) nounwind +declare void @__masked_store_blend_64(* nocapture %ptr, + %new, + %mask) nounwind +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +define(`gather_scatter', ` +declare 
@__gather_base_offsets32_$1(i8 * nocapture %ptr, %offsets, + i32 %offset_scale, %vecmask) nounwind readonly +declare @__gather_base_offsets64_$1(i8 * nocapture %ptr, %offsets, + i32 %offset_scale, %vecmask) nounwind readonly +declare @__gather32_$1( %ptrs, + %vecmask) nounwind readonly +declare @__gather64_$1( %ptrs, + %vecmask) nounwind readonly + +declare void @__scatter_base_offsets32_$1(i8* nocapture %base, %offsets, + i32 %offset_scale, %values, %mask) nounwind +declare void @__scatter_base_offsets64_$1(i8* nocapture %base, %offsets, + i32 %offset_scale, %values, %mask) nounwind +declare void @__scatter32_$1( %ptrs, %values, + %mask) nounwind +declare void @__scatter64_$1( %ptrs, %values, + %mask) nounwind +') + +gather_scatter(i8) +gather_scatter(i16) +gather_scatter(i32) +gather_scatter(i64) + +declare i32 @__packed_load_active(i32 * nocapture %startptr, * nocapture %val_ptr, + %full_mask) nounwind +declare i32 @__packed_store_active(i32 * %startptr, %vals, + %full_mask) nounwind + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefetch + +declare void @__prefetch_read_uniform_1(i8 *) nounwind readnone +declare void @__prefetch_read_uniform_2(i8 *) nounwind readnone +declare void @__prefetch_read_uniform_3(i8 *) nounwind readnone +declare void @__prefetch_read_uniform_nt(i8 *) nounwind readnone + diff --git a/builtins-sse2-common.ll b/builtins/target-sse2-common.ll similarity index 99% rename from builtins-sse2-common.ll rename to builtins/target-sse2-common.ll index 659bdda7..80c34afb 100644 --- a/builtins-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -29,6 +29,9 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ctlztz() +define_prefetches() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins-sse2-x2.ll b/builtins/target-sse2-x2.ll similarity index 99% rename from builtins-sse2-x2.ll rename to builtins/target-sse2-x2.ll index b5eaa889..a9d71ea9 100644 --- a/builtins-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -36,12 +36,16 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; standard 8-wide definitions from m4 macros -stdlib_core(8) -packed_load_and_store(8) -scans(8) -int64minmax(8) +define(`WIDTH',`8') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-sse2-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse2-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins-sse2.ll b/builtins/target-sse2.ll similarity index 99% rename from builtins-sse2.ll rename to builtins/target-sse2.ll index c49d6b2c..1a297199 100644 --- a/builtins-sse2.ll +++ b/builtins/target-sse2.ll @@ -33,12 +33,16 @@ ;; Define the standard library builtins for the SSE2 target ; Define some basics for a 4-wide target -stdlib_core(4) -packed_load_and_store(4) -scans(4) -int64minmax(4) +define(`WIDTH',`4') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-sse2-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse2-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding diff --git a/builtins-sse4-common.ll b/builtins/target-sse4-common.ll similarity index 99% rename from builtins-sse4-common.ll rename to builtins/target-sse4-common.ll index f1ee95dc..19d31ce4 100644 --- a/builtins-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -29,6 +29,9 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
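;; Illustrative sketch, not part of the patch: every per-target file now fixes
;; the vector width and mask type up front and then includes the shared macro
;; file, instead of passing the width to each macro call. For an 8-wide target
;; with a 32-bit mask this amounts to
;;
;;   define(`WIDTH',`8')
;;   define(`MASK',`i32')
;;   include(`util.m4')
;;   stdlib_core()            ;; was stdlib_core(8)
;;
;; so a type written as <WIDTH x MASK> inside util.m4 expands to <8 x i32>,
;; while the generic targets set MASK to i1 instead.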
+ctlztz() +define_prefetches() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins-sse4-x2.ll b/builtins/target-sse4-x2.ll similarity index 99% rename from builtins-sse4-x2.ll rename to builtins/target-sse4-x2.ll index fd399884..764f8613 100644 --- a/builtins-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -36,12 +36,16 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; standard 8-wide definitions from m4 macros -stdlib_core(8) -packed_load_and_store(8) -scans(8) -int64minmax(8) +define(`WIDTH',`8') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-sse4-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins-sse4.ll b/builtins/target-sse4.ll similarity index 99% rename from builtins-sse4.ll rename to builtins/target-sse4.ll index 68c44a0e..7eadde4b 100644 --- a/builtins-sse4.ll +++ b/builtins/target-sse4.ll @@ -33,12 +33,16 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Define common 4-wide stuff -stdlib_core(4) -packed_load_and_store(4) -scans(4) -int64minmax(4) +define(`WIDTH',`4') +define(`MASK',`i32') +include(`util.m4') -include(`builtins-sse4-common.ll') +stdlib_core() +packed_load_and_store() +scans() +int64minmax() + +include(`target-sse4-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins.m4 b/builtins/util.m4 similarity index 82% rename from builtins.m4 rename to builtins/util.m4 index f83bdbff..8853e81c 100644 --- a/builtins.m4 +++ b/builtins/util.m4 @@ -550,103 +550,103 @@ divert`'dnl ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; stdlib_core ;; -;; This macro defines a bunch of helper routines that only depend on the -;; target's vector width, which it takes as its first parameter. 
+;; This macro defines a bunch of helper routines that depend on the +;; target's vector width ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; define(`shuffles', ` -define <$1 x $2> @__broadcast_$3(<$1 x $2>, i32) nounwind readnone alwaysinline { - %v = extractelement <$1 x $2> %0, i32 %1 - %r_0 = insertelement <$1 x $2> undef, $2 %v, i32 0 -forloop(i, 1, eval($1-1), ` %r_`'i = insertelement <$1 x $2> %r_`'eval(i-1), $2 %v, i32 i +define @__broadcast_$2(, i32) nounwind readnone alwaysinline { + %v = extractelement %0, i32 %1 + %r_0 = insertelement undef, $1 %v, i32 0 +forloop(i, 1, eval(WIDTH-1), ` %r_`'i = insertelement %r_`'eval(i-1), $1 %v, i32 i ') - ret <$1 x $2> %r_`'eval($1-1) + ret %r_`'eval(WIDTH-1) } -define <$1 x $2> @__rotate_$3(<$1 x $2>, i32) nounwind readnone alwaysinline { +define @__rotate_$2(, i32) nounwind readnone alwaysinline { %isc = call i1 @__is_compile_time_constant_uniform_int32(i32 %1) br i1 %isc, label %is_const, label %not_const is_const: ; though verbose, this turms into tight code if %1 is a constant -forloop(i, 0, eval($1-1), ` +forloop(i, 0, eval(WIDTH-1), ` %delta_`'i = add i32 %1, i - %delta_clamped_`'i = and i32 %delta_`'i, eval($1-1) - %v_`'i = extractelement <$1 x $2> %0, i32 %delta_clamped_`'i') + %delta_clamped_`'i = and i32 %delta_`'i, eval(WIDTH-1) + %v_`'i = extractelement %0, i32 %delta_clamped_`'i') - %ret_0 = insertelement <$1 x $2> undef, $2 %v_0, i32 0 -forloop(i, 1, eval($1-1), ` %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1), $2 %v_`'i, i32 i + %ret_0 = insertelement undef, $1 %v_0, i32 0 +forloop(i, 1, eval(WIDTH-1), ` %ret_`'i = insertelement %ret_`'eval(i-1), $1 %v_`'i, i32 i ') - ret <$1 x $2> %ret_`'eval($1-1) + ret %ret_`'eval(WIDTH-1) not_const: ; store two instances of the vector into memory - %ptr = alloca <$1 x $2>, i32 2 - %ptr0 = getelementptr <$1 x $2> * %ptr, i32 0 - store <$1 x $2> %0, <$1 x $2> * %ptr0 - %ptr1 = getelementptr <$1 x $2> * %ptr, i32 1 - store <$1 x $2> %0, <$1 x $2> * %ptr1 + %ptr = alloca , i32 2 + %ptr0 = getelementptr * %ptr, i32 0 + store %0, * %ptr0 + %ptr1 = getelementptr * %ptr, i32 1 + store %0, * %ptr1 ; compute offset in [0,vectorwidth-1], then index into the doubled-up vector - %offset = and i32 %1, eval($1-1) - %ptr_as_elt_array = bitcast <$1 x $2> * %ptr to [eval(2*$1) x $2] * - %load_ptr = getelementptr [eval(2*$1) x $2] * %ptr_as_elt_array, i32 0, i32 %offset - %load_ptr_vec = bitcast $2 * %load_ptr to <$1 x $2> * - %result = load <$1 x $2> * %load_ptr_vec, align $4 - ret <$1 x $2> %result + %offset = and i32 %1, eval(WIDTH-1) + %ptr_as_elt_array = bitcast * %ptr to [eval(2*WIDTH) x $1] * + %load_ptr = getelementptr [eval(2*WIDTH) x $1] * %ptr_as_elt_array, i32 0, i32 %offset + %load_ptr_vec = bitcast $1 * %load_ptr to * + %result = load * %load_ptr_vec, align $3 + ret %result } -define <$1 x $2> @__shuffle_$3(<$1 x $2>, <$1 x i32>) nounwind readnone alwaysinline { -forloop(i, 0, eval($1-1), ` - %index_`'i = extractelement <$1 x i32> %1, i32 i') -forloop(i, 0, eval($1-1), ` - %v_`'i = extractelement <$1 x $2> %0, i32 %index_`'i') +define @__shuffle_$2(, ) nounwind readnone alwaysinline { +forloop(i, 0, eval(WIDTH-1), ` + %index_`'i = extractelement %1, i32 i') +forloop(i, 0, eval(WIDTH-1), ` + %v_`'i = extractelement %0, i32 %index_`'i') - %ret_0 = insertelement <$1 x $2> undef, $2 %v_0, i32 0 -forloop(i, 1, eval($1-1), ` %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1), $2 %v_`'i, i32 i + %ret_0 = insertelement undef, $1 %v_0, i32 0 +forloop(i, 1, 
eval(WIDTH-1), ` %ret_`'i = insertelement %ret_`'eval(i-1), $1 %v_`'i, i32 i ') - ret <$1 x $2> %ret_`'eval($1-1) + ret %ret_`'eval(WIDTH-1) } -define <$1 x $2> @__shuffle2_$3(<$1 x $2>, <$1 x $2>, <$1 x i32>) nounwind readnone alwaysinline { - %v2 = shufflevector <$1 x $2> %0, <$1 x $2> %1, < - forloop(i, 0, eval(2*$1-2), `i32 i, ') i32 eval(2*$1-1) +define @__shuffle2_$2(, , ) nounwind readnone alwaysinline { + %v2 = shufflevector %0, %1, < + forloop(i, 0, eval(2*WIDTH-2), `i32 i, ') i32 eval(2*WIDTH-1) > -forloop(i, 0, eval($1-1), ` - %index_`'i = extractelement <$1 x i32> %2, i32 i') +forloop(i, 0, eval(WIDTH-1), ` + %index_`'i = extractelement %2, i32 i') - %isc = call i1 @__is_compile_time_constant_varying_int32(<$1 x i32> %2) + %isc = call i1 @__is_compile_time_constant_varying_int32( %2) br i1 %isc, label %is_const, label %not_const is_const: ; extract from the requested lanes and insert into the result; LLVM turns ; this into good code in the end -forloop(i, 0, eval($1-1), ` - %v_`'i = extractelement %v2, i32 %index_`'i') +forloop(i, 0, eval(WIDTH-1), ` + %v_`'i = extractelement %v2, i32 %index_`'i') - %ret_0 = insertelement <$1 x $2> undef, $2 %v_0, i32 0 -forloop(i, 1, eval($1-1), ` %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1), $2 %v_`'i, i32 i + %ret_0 = insertelement undef, $1 %v_0, i32 0 +forloop(i, 1, eval(WIDTH-1), ` %ret_`'i = insertelement %ret_`'eval(i-1), $1 %v_`'i, i32 i ') - ret <$1 x $2> %ret_`'eval($1-1) + ret %ret_`'eval(WIDTH-1) not_const: ; otherwise store the two vectors onto the stack and then use the given ; permutation vector to get indices into that array... - %ptr = alloca - store %v2, * %ptr - %baseptr = bitcast * %ptr to $2 * + %ptr = alloca + store %v2, * %ptr + %baseptr = bitcast * %ptr to $1 * - %ptr_0 = getelementptr $2 * %baseptr, i32 %index_0 - %val_0 = load $2 * %ptr_0 - %result_0 = insertelement <$1 x $2> undef, $2 %val_0, i32 0 + %ptr_0 = getelementptr $1 * %baseptr, i32 %index_0 + %val_0 = load $1 * %ptr_0 + %result_0 = insertelement undef, $1 %val_0, i32 0 -forloop(i, 1, eval($1-1), ` - %ptr_`'i = getelementptr $2 * %baseptr, i32 %index_`'i - %val_`'i = load $2 * %ptr_`'i - %result_`'i = insertelement <$1 x $2> %result_`'eval(i-1), $2 %val_`'i, i32 i +forloop(i, 1, eval(WIDTH-1), ` + %ptr_`'i = getelementptr $1 * %baseptr, i32 %index_`'i + %val_`'i = load $1 * %ptr_`'i + %result_`'i = insertelement %result_`'eval(i-1), $1 %val_`'i, i32 i ') - ret <$1 x $2> %result_`'eval($1-1) + ret %result_`'eval(WIDTH-1) } ') @@ -676,18 +676,20 @@ forloop(i, 1, eval($1-1), ` define(`global_atomic_associative', ` define <$1 x $3> @__atomic_$2_$4_global($3 * %ptr, <$1 x $3> %val, - <$1 x i32> %m) nounwind alwaysinline { + <$1 x MASK> %m) nounwind alwaysinline { ; first, for any lanes where the mask is off, compute a vector where those lanes ; hold the identity value.. ; for the bit tricks below, we need the mask to be sign extended to be ; the size of the element type. - ifelse($3, `i64', `%mask = sext <$1 x i32> %m to <$1 x i64>') - ifelse($3, `i32', ` - ; silly workaround to do %mask = %m, which is not possible directly.. - %maskmem = alloca <$1 x i32> - store <$1 x i32> %m, <$1 x i32> * %maskmem - %mask = load <$1 x i32> * %maskmem' + ifelse( + MASK,i1,`%mask = sext <$1 x MASK> %m to <$1 x $3>', + $3,i64, `%mask = sext <$1 x MASK> %m to <$1 x i64>', + $3,i32, ` + ; silly workaround to do %mask = %m, which is not possible directly.. 
+ %maskmem = alloca <$1 x i32> + store <$1 x i32> %m, <$1 x i32> * %maskmem + %mask = load <$1 x i32> * %maskmem' ) ; zero out any lanes that are off %valoff = and <$1 x $3> %val, %mask @@ -751,13 +753,13 @@ ifelse(LLVM_VERSION, `LLVM_2_9',` declare $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %delta) define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val, - <$1 x i32> %mask) nounwind alwaysinline { + <$1 x MASK> %mask) nounwind alwaysinline { %r = call $3 @llvm.atomic.load.$2.$3.p0$3($3 * %ptr, $3 %val) ret $3 %r } ', ` define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val, - <$1 x i32> %mask) nounwind alwaysinline { + <$1 x MASK> %mask) nounwind alwaysinline { %r = atomicrmw $2 $3 * %ptr, $3 %val seq_cst ret $3 %r } @@ -778,11 +780,11 @@ declare i64 @llvm.atomic.swap.i64.p0i64(i64 * %ptr, i64 %val)') define(`global_swap', ` define <$1 x $2> @__atomic_swap_$3_global($2* %ptr, <$1 x $2> %val, - <$1 x i32> %mask) nounwind alwaysinline { + <$1 x MASK> %mask) nounwind alwaysinline { %rptr = alloca <$1 x $2> %rptr32 = bitcast <$1 x $2> * %rptr to $2 * - per_lane($1, <$1 x i32> %mask, ` + per_lane($1, <$1 x MASK> %mask, ` %val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE ifelse(LLVM_VERSION, `LLVM_2_9',` %r_LANE_ID = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val_LANE_ID)', ` @@ -795,7 +797,7 @@ ifelse(LLVM_VERSION, `LLVM_2_9',` } define $2 @__atomic_swap_uniform_$3_global($2* %ptr, $2 %val, - <$1 x i32> %mask) nounwind alwaysinline { + <$1 x MASK> %mask) nounwind alwaysinline { ifelse(LLVM_VERSION, `LLVM_2_9',` %r = call $2 @llvm.atomic.swap.$2.p0$2($2 * %ptr, $2 %val)', ` %r = atomicrmw xchg $2 * %ptr, $2 %val seq_cst') @@ -816,11 +818,11 @@ ifelse(LLVM_VERSION, `LLVM_2_9',` declare $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)') define <$1 x $2> @__atomic_compare_exchange_$3_global($2* %ptr, <$1 x $2> %cmp, - <$1 x $2> %val, <$1 x i32> %mask) nounwind alwaysinline { + <$1 x $2> %val, <$1 x MASK> %mask) nounwind alwaysinline { %rptr = alloca <$1 x $2> %rptr32 = bitcast <$1 x $2> * %rptr to $2 * - per_lane($1, <$1 x i32> %mask, ` + per_lane($1, <$1 x MASK> %mask, ` %cmp_LANE_ID = extractelement <$1 x $2> %cmp, i32 LANE %val_LANE_ID = extractelement <$1 x $2> %val, i32 LANE ifelse(LLVM_VERSION, `LLVM_2_9',` @@ -835,7 +837,7 @@ ifelse(LLVM_VERSION, `LLVM_2_9',` } define $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp, - $2 %val, <$1 x i32> %mask) nounwind alwaysinline { + $2 %val, <$1 x MASK> %mask) nounwind alwaysinline { ifelse(LLVM_VERSION, `LLVM_2_9',` %r = call $2 @llvm.atomic.cmp.swap.$2.p0$2($2 * %ptr, $2 %cmp, $2 %val)', ` %r = cmpxchg $2 * %ptr, $2 %cmp, $2 %val seq_cst') @@ -844,6 +846,85 @@ ifelse(LLVM_VERSION, `LLVM_2_9',` ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; count trailing zeros + +define(`ctlztz', ` +define i32 @__count_trailing_zeros_i32(i32) nounwind readnone alwaysinline { + %c = call i32 @llvm.cttz.i32(i32 %0) + ret i32 %c +} + +define i64 @__count_trailing_zeros_i64(i64) nounwind readnone alwaysinline { + %c = call i64 @llvm.cttz.i64(i64 %0) + ret i64 %c +} + +define i32 @__count_leading_zeros_i32(i32) nounwind readnone alwaysinline { + %c = call i32 @llvm.ctlz.i32(i32 %0) + ret i32 %c +} + +define i64 @__count_leading_zeros_i64(i64) nounwind readnone alwaysinline { + %c = call i64 @llvm.ctlz.i64(i64 %0) + ret i64 %c +} +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefetching + +define(`define_prefetches', ` +ifelse(LLVM_VERSION, 
`LLVM_2_9', +` +declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality) + +define void @__prefetch_read_uniform_1(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 3) + ret void +} + +define void @__prefetch_read_uniform_2(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 2) + ret void +} + +define void @__prefetch_read_uniform_3(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 1) + ret void +} + +define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 0) + ret void +} +', ` +declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality, + i32 %cachetype) ; cachetype == 1 is dcache + +define void @__prefetch_read_uniform_1(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 3, i32 1) + ret void +} + +define void @__prefetch_read_uniform_2(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 2, i32 1) + ret void +} + +define void @__prefetch_read_uniform_3(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 1, i32 1) + ret void +} + +define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 0, i32 1) + ret void +} +') +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + define(`stdlib_core', ` @@ -854,8 +935,8 @@ declare void @ISPCLaunch(i8**, i8*, i8*, i32) nounwind declare void @ISPCSync(i8*) nounwind declare void @ISPCInstrument(i8*, i8*, i32, i32) nounwind -declare i1 @__is_compile_time_constant_mask(<$1 x i32> %mask) -declare i1 @__is_compile_time_constant_varying_int32(<$1 x i32>) +declare i1 @__is_compile_time_constant_mask( %mask) +declare i1 @__is_compile_time_constant_varying_int32() ; This function declares placeholder masked store functions for the ; front-end to use. @@ -869,10 +950,10 @@ declare i1 @__is_compile_time_constant_varying_int32(<$1 x i32>) ; stores (if the mask is all on) by the MaskedStoreOptPass optimization ; pass. -declare void @__pseudo_masked_store_8(<$1 x i8> * nocapture, <$1 x i8>, <$1 x i32>) -declare void @__pseudo_masked_store_16(<$1 x i16> * nocapture, <$1 x i16>, <$1 x i32>) -declare void @__pseudo_masked_store_32(<$1 x i32> * nocapture, <$1 x i32>, <$1 x i32>) -declare void @__pseudo_masked_store_64(<$1 x i64> * nocapture, <$1 x i64>, <$1 x i32>) +declare void @__pseudo_masked_store_8( * nocapture, , ) +declare void @__pseudo_masked_store_16( * nocapture, , ) +declare void @__pseudo_masked_store_32( * nocapture, , ) +declare void @__pseudo_masked_store_64( * nocapture, , ) ; Declare the pseudo-gather functions. When the ispc front-end needs ; to perform a gather, it generates a call to one of these functions, @@ -904,33 +985,33 @@ declare void @__pseudo_masked_store_64(<$1 x i64> * nocapture, <$1 x i64>, <$1 x ; converts them to native gather functions or converts them to vector ; loads, if equivalent. 
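;; Illustrative sketch, not part of the patch: with WIDTH set to 4 and MASK set
;; to i32, the parameterized pseudo-gather declarations below expand to the same
;; signatures the old <$1 x ...> forms produced for a 4-wide target, e.g.:

declare <4 x i32> @__pseudo_gather32_32(<4 x i32>, <4 x i32>) nounwind readonly
declare <4 x i32> @__pseudo_gather_base_offsets32_32(i8 *, <4 x i32>, i32,
                                                     <4 x i32>) nounwind readonly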
-declare <$1 x i8> @__pseudo_gather32_8(<$1 x i32>, <$1 x i32>) nounwind readonly -declare <$1 x i16> @__pseudo_gather32_16(<$1 x i32>, <$1 x i32>) nounwind readonly -declare <$1 x i32> @__pseudo_gather32_32(<$1 x i32>, <$1 x i32>) nounwind readonly -declare <$1 x i64> @__pseudo_gather32_64(<$1 x i32>, <$1 x i32>) nounwind readonly +declare @__pseudo_gather32_8(, ) nounwind readonly +declare @__pseudo_gather32_16(, ) nounwind readonly +declare @__pseudo_gather32_32(, ) nounwind readonly +declare @__pseudo_gather32_64(, ) nounwind readonly -declare <$1 x i8> @__pseudo_gather64_8(<$1 x i64>, <$1 x i32>) nounwind readonly -declare <$1 x i16> @__pseudo_gather64_16(<$1 x i64>, <$1 x i32>) nounwind readonly -declare <$1 x i32> @__pseudo_gather64_32(<$1 x i64>, <$1 x i32>) nounwind readonly -declare <$1 x i64> @__pseudo_gather64_64(<$1 x i64>, <$1 x i32>) nounwind readonly +declare @__pseudo_gather64_8(, ) nounwind readonly +declare @__pseudo_gather64_16(, ) nounwind readonly +declare @__pseudo_gather64_32(, ) nounwind readonly +declare @__pseudo_gather64_64(, ) nounwind readonly -declare <$1 x i8> @__pseudo_gather_base_offsets32_8(i8 *, <$1 x i32>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i16> @__pseudo_gather_base_offsets32_16(i8 *, <$1 x i32>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i32> @__pseudo_gather_base_offsets32_32(i8 *, <$1 x i32>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i64> @__pseudo_gather_base_offsets32_64(i8 *, <$1 x i32>, i32, - <$1 x i32>) nounwind readonly +declare @__pseudo_gather_base_offsets32_8(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets32_16(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets32_32(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets32_64(i8 *, , i32, + ) nounwind readonly -declare <$1 x i8> @__pseudo_gather_base_offsets64_8(i8 *, <$1 x i64>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i16> @__pseudo_gather_base_offsets64_16(i8 *, <$1 x i64>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i32> @__pseudo_gather_base_offsets64_32(i8 *, <$1 x i64>, i32, - <$1 x i32>) nounwind readonly -declare <$1 x i64> @__pseudo_gather_base_offsets64_64(i8 *, <$1 x i64>, i32, - <$1 x i32>) nounwind readonly +declare @__pseudo_gather_base_offsets64_8(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets64_16(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets64_32(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets64_64(i8 *, , i32, + ) nounwind readonly ; Similarly to the pseudo-gathers defined above, we also declare undefined ; pseudo-scatter instructions with signatures: @@ -955,94 +1036,94 @@ declare <$1 x i64> @__pseudo_gather_base_offsets64_64(i8 *, <$1 x i64>, i32, ; And the GSImprovementsPass in turn converts these to actual native ; scatters or masked stores. 
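;; Illustrative sketch, not part of the patch: the corresponding pseudo-scatter
;; declarations, again assuming WIDTH=4 and MASK=i32, expand to e.g.:

declare void @__pseudo_scatter32_32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind
declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <4 x i32>, i32,
                                                 <4 x i32>, <4 x i32>) nounwind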
-declare void @__pseudo_scatter32_8(<$1 x i32>, <$1 x i8>, <$1 x i32>) nounwind -declare void @__pseudo_scatter32_16(<$1 x i32>, <$1 x i16>, <$1 x i32>) nounwind -declare void @__pseudo_scatter32_32(<$1 x i32>, <$1 x i32>, <$1 x i32>) nounwind -declare void @__pseudo_scatter32_64(<$1 x i32>, <$1 x i64>, <$1 x i32>) nounwind +declare void @__pseudo_scatter32_8(, , ) nounwind +declare void @__pseudo_scatter32_16(, , ) nounwind +declare void @__pseudo_scatter32_32(, , ) nounwind +declare void @__pseudo_scatter32_64(, , ) nounwind -declare void @__pseudo_scatter64_8(<$1 x i64>, <$1 x i8>, <$1 x i32>) nounwind -declare void @__pseudo_scatter64_16(<$1 x i64>, <$1 x i16>, <$1 x i32>) nounwind -declare void @__pseudo_scatter64_32(<$1 x i64>, <$1 x i32>, <$1 x i32>) nounwind -declare void @__pseudo_scatter64_64(<$1 x i64>, <$1 x i64>, <$1 x i32>) nounwind +declare void @__pseudo_scatter64_8(, , ) nounwind +declare void @__pseudo_scatter64_16(, , ) nounwind +declare void @__pseudo_scatter64_32(, , ) nounwind +declare void @__pseudo_scatter64_64(, , ) nounwind -declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <$1 x i32>, i32, - <$1 x i8>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <$1 x i32>, i32, - <$1 x i16>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <$1 x i32>, i32, - <$1 x i32>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <$1 x i32>, i32, - <$1 x i64>, <$1 x i32>) nounwind +declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, , i32, + , ) nounwind -declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <$1 x i64>, i32, - <$1 x i8>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <$1 x i64>, i32, - <$1 x i16>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <$1 x i64>, i32, - <$1 x i32>, <$1 x i32>) nounwind -declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <$1 x i64>, i32, - <$1 x i64>, <$1 x i32>) nounwind +declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, , i32, + , ) nounwind ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; vector ops -define i8 @__extract_int8(<$1 x i8>, i32) nounwind readnone alwaysinline { - %extract = extractelement <$1 x i8> %0, i32 %1 +define i8 @__extract_int8(, i32) nounwind readnone alwaysinline { + %extract = extractelement %0, i32 %1 ret i8 %extract } -define <$1 x i8> @__insert_int8(<$1 x i8>, i32, +define @__insert_int8(, i32, i8) nounwind readnone alwaysinline { - %insert = insertelement <$1 x i8> %0, i8 %2, i32 %1 - ret <$1 x i8> %insert + %insert = insertelement %0, i8 %2, i32 %1 + ret %insert } -define i16 @__extract_int16(<$1 x i16>, i32) nounwind readnone alwaysinline { - %extract = extractelement <$1 x i16> %0, i32 %1 +define i16 @__extract_int16(, i32) nounwind readnone alwaysinline { + %extract = extractelement %0, i32 %1 ret i16 %extract 
} -define <$1 x i16> @__insert_int16(<$1 x i16>, i32, +define @__insert_int16(, i32, i16) nounwind readnone alwaysinline { - %insert = insertelement <$1 x i16> %0, i16 %2, i32 %1 - ret <$1 x i16> %insert + %insert = insertelement %0, i16 %2, i32 %1 + ret %insert } -define i32 @__extract_int32(<$1 x i32>, i32) nounwind readnone alwaysinline { - %extract = extractelement <$1 x i32> %0, i32 %1 +define i32 @__extract_int32(, i32) nounwind readnone alwaysinline { + %extract = extractelement %0, i32 %1 ret i32 %extract } -define <$1 x i32> @__insert_int32(<$1 x i32>, i32, +define @__insert_int32(, i32, i32) nounwind readnone alwaysinline { - %insert = insertelement <$1 x i32> %0, i32 %2, i32 %1 - ret <$1 x i32> %insert + %insert = insertelement %0, i32 %2, i32 %1 + ret %insert } -define i64 @__extract_int64(<$1 x i64>, i32) nounwind readnone alwaysinline { - %extract = extractelement <$1 x i64> %0, i32 %1 +define i64 @__extract_int64(, i32) nounwind readnone alwaysinline { + %extract = extractelement %0, i32 %1 ret i64 %extract } -define <$1 x i64> @__insert_int64(<$1 x i64>, i32, +define @__insert_int64(, i32, i64) nounwind readnone alwaysinline { - %insert = insertelement <$1 x i64> %0, i64 %2, i32 %1 - ret <$1 x i64> %insert + %insert = insertelement %0, i64 %2, i32 %1 + ret %insert } -shuffles($1, i8, int8, 1) -shuffles($1, i16, int16, 2) -shuffles($1, float, float, 4) -shuffles($1, i32, int32, 4) -shuffles($1, double, double, 8) -shuffles($1, i64, int64, 8) +shuffles(i8, int8, 1) +shuffles(i16, int16, 2) +shuffles(float, float, 4) +shuffles(i32, int32, 4) +shuffles(double, double, 8) +shuffles(i64, int64, 8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; various bitcasts from one type to another -define <$1 x i32> @__intbits_varying_float(<$1 x float>) nounwind readnone alwaysinline { - %float_to_int_bitcast = bitcast <$1 x float> %0 to <$1 x i32> - ret <$1 x i32> %float_to_int_bitcast +define @__intbits_varying_float() nounwind readnone alwaysinline { + %float_to_int_bitcast = bitcast %0 to + ret %float_to_int_bitcast } define i32 @__intbits_uniform_float(float) nounwind readnone alwaysinline { @@ -1050,9 +1131,9 @@ define i32 @__intbits_uniform_float(float) nounwind readnone alwaysinline { ret i32 %float_to_int_bitcast } -define <$1 x i64> @__intbits_varying_double(<$1 x double>) nounwind readnone alwaysinline { - %double_to_int_bitcast = bitcast <$1 x double> %0 to <$1 x i64> - ret <$1 x i64> %double_to_int_bitcast +define @__intbits_varying_double() nounwind readnone alwaysinline { + %double_to_int_bitcast = bitcast %0 to + ret %double_to_int_bitcast } define i64 @__intbits_uniform_double(double) nounwind readnone alwaysinline { @@ -1060,9 +1141,9 @@ define i64 @__intbits_uniform_double(double) nounwind readnone alwaysinline { ret i64 %double_to_int_bitcast } -define <$1 x float> @__floatbits_varying_int32(<$1 x i32>) nounwind readnone alwaysinline { - %int_to_float_bitcast = bitcast <$1 x i32> %0 to <$1 x float> - ret <$1 x float> %int_to_float_bitcast +define @__floatbits_varying_int32() nounwind readnone alwaysinline { + %int_to_float_bitcast = bitcast %0 to + ret %int_to_float_bitcast } define float @__floatbits_uniform_int32(i32) nounwind readnone alwaysinline { @@ -1070,9 +1151,9 @@ define float @__floatbits_uniform_int32(i32) nounwind readnone alwaysinline { ret float %int_to_float_bitcast } -define <$1 x double> @__doublebits_varying_int64(<$1 x i64>) nounwind readnone alwaysinline { - %int_to_double_bitcast = bitcast <$1 x i64> %0 to <$1 x 
double> - ret <$1 x double> %int_to_double_bitcast +define @__doublebits_varying_int64() nounwind readnone alwaysinline { + %int_to_double_bitcast = bitcast %0 to + ret %int_to_double_bitcast } define double @__doublebits_uniform_int64(i64) nounwind readnone alwaysinline { @@ -1080,8 +1161,8 @@ define double @__doublebits_uniform_int64(i64) nounwind readnone alwaysinline { ret double %int_to_double_bitcast } -define <$1 x float> @__undef_varying() nounwind readnone alwaysinline { - ret <$1 x float> undef +define @__undef_varying() nounwind readnone alwaysinline { + ret undef } define float @__undef_uniform() nounwind readnone alwaysinline { @@ -1096,31 +1177,12 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline { ret i32 %r } -define <$1 x i32> @__sext_varying_bool(<$1 x i32>) nounwind readnone alwaysinline { - ret <$1 x i32> %0 -} - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; count trailing zeros - -define i32 @__count_trailing_zeros_i32(i32) nounwind readnone alwaysinline { - %c = call i32 @llvm.cttz.i32(i32 %0) - ret i32 %c -} - -define i64 @__count_trailing_zeros_i64(i64) nounwind readnone alwaysinline { - %c = call i64 @llvm.cttz.i64(i64 %0) - ret i64 %c -} - -define i32 @__count_leading_zeros_i32(i32) nounwind readnone alwaysinline { - %c = call i32 @llvm.ctlz.i32(i32 %0) - ret i32 %c -} - -define i64 @__count_leading_zeros_i64(i64) nounwind readnone alwaysinline { - %c = call i64 @llvm.ctlz.i64(i64 %0) - ret i64 %c +define @__sext_varying_bool() nounwind readnone alwaysinline { + ifelse(MASK,i1, ` + %se = sext %0 to + ret %se + ', ` + ret %0') } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -1670,184 +1732,133 @@ define void define void @__aos_to_soa4_float(float * noalias %p, - <$1 x float> * noalias %out0, <$1 x float> * noalias %out1, - <$1 x float> * noalias %out2, <$1 x float> * noalias %out3) + * noalias %out0, * noalias %out1, + * noalias %out2, * noalias %out3) nounwind alwaysinline { - %p0 = bitcast float * %p to <$1 x float> * - %v0 = load <$1 x float> * %p0, align 4 - %p1 = getelementptr <$1 x float> * %p0, i32 1 - %v1 = load <$1 x float> * %p1, align 4 - %p2 = getelementptr <$1 x float> * %p0, i32 2 - %v2 = load <$1 x float> * %p2, align 4 - %p3 = getelementptr <$1 x float> * %p0, i32 3 - %v3 = load <$1 x float> * %p3, align 4 - call void @__aos_to_soa4_float$1(<$1 x float> %v0, <$1 x float> %v1, - <$1 x float> %v2, <$1 x float> %v3, <$1 x float> * %out0, - <$1 x float> * %out1, <$1 x float> * %out2, <$1 x float> * %out3) + %p0 = bitcast float * %p to * + %v0 = load * %p0, align 4 + %p1 = getelementptr * %p0, i32 1 + %v1 = load * %p1, align 4 + %p2 = getelementptr * %p0, i32 2 + %v2 = load * %p2, align 4 + %p3 = getelementptr * %p0, i32 3 + %v3 = load * %p3, align 4 + call void @__aos_to_soa4_float`'WIDTH ( %v0, %v1, + %v2, %v3, * %out0, + * %out1, * %out2, * %out3) ret void } define void @__aos_to_soa4_int32(i32 * noalias %ptr, - <$1 x i32> * noalias %out0, <$1 x i32> * noalias %out1, - <$1 x i32> * noalias %out2, <$1 x i32> * noalias %out3) + * noalias %out0, * noalias %out1, + * noalias %out2, * noalias %out3) nounwind alwaysinline { %fptr = bitcast i32 * %ptr to float * - %fout0 = bitcast <$1 x i32> * %out0 to <$1 x float> * - %fout1 = bitcast <$1 x i32> * %out1 to <$1 x float> * - %fout2 = bitcast <$1 x i32> * %out2 to <$1 x float> * - %fout3 = bitcast <$1 x i32> * %out3 to <$1 x float> * + %fout0 = bitcast * %out0 to * + %fout1 = bitcast * %out1 to * + %fout2 = bitcast * %out2 
to * + %fout3 = bitcast * %out3 to * call void @__aos_to_soa4_float(float * %fptr, - <$1 x float> * %fout0, <$1 x float> * %fout1, <$1 x float> * %fout2, - <$1 x float> * %fout3) + * %fout0, * %fout1, * %fout2, + * %fout3) ret void } define void -@__soa_to_aos4_float(<$1 x float> %v0, <$1 x float> %v1, <$1 x float> %v2, - <$1 x float> %v3, float * noalias %p) nounwind alwaysinline { - %out0 = bitcast float * %p to <$1 x float> * - %out1 = getelementptr <$1 x float> * %out0, i32 1 - %out2 = getelementptr <$1 x float> * %out0, i32 2 - %out3 = getelementptr <$1 x float> * %out0, i32 3 - call void @__soa_to_aos4_float$1(<$1 x float> %v0, <$1 x float> %v1, - <$1 x float> %v2, <$1 x float> %v3, <$1 x float> * %out0, - <$1 x float> * %out1, <$1 x float> * %out2, <$1 x float> * %out3) +@__soa_to_aos4_float( %v0, %v1, %v2, + %v3, float * noalias %p) nounwind alwaysinline { + %out0 = bitcast float * %p to * + %out1 = getelementptr * %out0, i32 1 + %out2 = getelementptr * %out0, i32 2 + %out3 = getelementptr * %out0, i32 3 + call void @__soa_to_aos4_float`'WIDTH ( %v0, %v1, + %v2, %v3, * %out0, + * %out1, * %out2, * %out3) ret void } define void -@__soa_to_aos4_int32(<$1 x i32> %v0, <$1 x i32> %v1, <$1 x i32> %v2, - <$1 x i32> %v3, i32 * noalias %base) nounwind alwaysinline { - %fv0 = bitcast <$1 x i32> %v0 to <$1 x float> - %fv1 = bitcast <$1 x i32> %v1 to <$1 x float> - %fv2 = bitcast <$1 x i32> %v2 to <$1 x float> - %fv3 = bitcast <$1 x i32> %v3 to <$1 x float> +@__soa_to_aos4_int32( %v0, %v1, %v2, + %v3, i32 * noalias %base) nounwind alwaysinline { + %fv0 = bitcast %v0 to + %fv1 = bitcast %v1 to + %fv2 = bitcast %v2 to + %fv3 = bitcast %v3 to %fbase = bitcast i32 * %base to float * - call void @__soa_to_aos4_float(<$1 x float> %fv0, <$1 x float> %fv1, - <$1 x float> %fv2, <$1 x float> %fv3, float * %fbase) + call void @__soa_to_aos4_float( %fv0, %fv1, + %fv2, %fv3, float * %fbase) ret void } define void @__aos_to_soa3_float(float * noalias %p, - <$1 x float> * %out0, <$1 x float> * %out1, - <$1 x float> * %out2) nounwind alwaysinline { - %p0 = bitcast float * %p to <$1 x float> * - %v0 = load <$1 x float> * %p0, align 4 - %p1 = getelementptr <$1 x float> * %p0, i32 1 - %v1 = load <$1 x float> * %p1, align 4 - %p2 = getelementptr <$1 x float> * %p0, i32 2 - %v2 = load <$1 x float> * %p2, align 4 - call void @__aos_to_soa3_float$1(<$1 x float> %v0, <$1 x float> %v1, - <$1 x float> %v2, <$1 x float> * %out0, <$1 x float> * %out1, - <$1 x float> * %out2) + * %out0, * %out1, + * %out2) nounwind alwaysinline { + %p0 = bitcast float * %p to * + %v0 = load * %p0, align 4 + %p1 = getelementptr * %p0, i32 1 + %v1 = load * %p1, align 4 + %p2 = getelementptr * %p0, i32 2 + %v2 = load * %p2, align 4 + call void @__aos_to_soa3_float`'WIDTH ( %v0, %v1, + %v2, * %out0, * %out1, + * %out2) ret void } define void @__aos_to_soa3_int32(i32 * noalias %base, - <$1 x i32> * noalias %out0, <$1 x i32> * noalias %out1, - <$1 x i32> * noalias %out2) nounwind alwaysinline { + * noalias %out0, * noalias %out1, + * noalias %out2) nounwind alwaysinline { %fbase = bitcast i32 * %base to float * - %fout0 = bitcast <$1 x i32> * %out0 to <$1 x float> * - %fout1 = bitcast <$1 x i32> * %out1 to <$1 x float> * - %fout2 = bitcast <$1 x i32> * %out2 to <$1 x float> * + %fout0 = bitcast * %out0 to * + %fout1 = bitcast * %out1 to * + %fout2 = bitcast * %out2 to * call void @__aos_to_soa3_float(float * %fbase, - <$1 x float> * %fout0, <$1 x float> * %fout1, <$1 x float> * %fout2) + * %fout0, * %fout1, * %fout2) ret void } define void 
-@__soa_to_aos3_float(<$1 x float> %v0, <$1 x float> %v1, <$1 x float> %v2, +@__soa_to_aos3_float( %v0, %v1, %v2, float * noalias %p) nounwind alwaysinline { - %out0 = bitcast float * %p to <$1 x float> * - %out1 = getelementptr <$1 x float> * %out0, i32 1 - %out2 = getelementptr <$1 x float> * %out0, i32 2 - call void @__soa_to_aos3_float$1(<$1 x float> %v0, <$1 x float> %v1, - <$1 x float> %v2, <$1 x float> * %out0, <$1 x float> * %out1, - <$1 x float> * %out2) + %out0 = bitcast float * %p to * + %out1 = getelementptr * %out0, i32 1 + %out2 = getelementptr * %out0, i32 2 + call void @__soa_to_aos3_float`'WIDTH ( %v0, %v1, + %v2, * %out0, * %out1, + * %out2) ret void } define void -@__soa_to_aos3_int32(<$1 x i32> %v0, <$1 x i32> %v1, <$1 x i32> %v2, +@__soa_to_aos3_int32( %v0, %v1, %v2, i32 * noalias %base) nounwind alwaysinline { - %fv0 = bitcast <$1 x i32> %v0 to <$1 x float> - %fv1 = bitcast <$1 x i32> %v1 to <$1 x float> - %fv2 = bitcast <$1 x i32> %v2 to <$1 x float> + %fv0 = bitcast %v0 to + %fv1 = bitcast %v1 to + %fv2 = bitcast %v2 to %fbase = bitcast i32 * %base to float * - call void @__soa_to_aos3_float(<$1 x float> %fv0, <$1 x float> %fv1, - <$1 x float> %fv2, float * %fbase) + call void @__soa_to_aos3_float( %fv0, %fv1, + %fv2, float * %fbase) ret void } -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; prefetching - -ifelse(LLVM_VERSION, `LLVM_2_9', -` -declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality) - -define void @__prefetch_read_uniform_1(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 3) - ret void -} - -define void @__prefetch_read_uniform_2(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 2) - ret void -} - -define void @__prefetch_read_uniform_3(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 1) - ret void -} - -define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 0) - ret void -} -', ` -declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality, - i32 %cachetype) ; cachetype == 1 is dcache - -define void @__prefetch_read_uniform_1(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 3, i32 1) - ret void -} - -define void @__prefetch_read_uniform_2(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 2, i32 1) - ret void -} - -define void @__prefetch_read_uniform_3(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 1, i32 1) - ret void -} - -define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { - call void @llvm.prefetch(i8 * %0, i32 0, i32 0, i32 1) - ret void -} -') - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; assert declare i32 @printf(i8*, ...) 
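[Editor's note — illustrative sketch, not part of the patch. The `__aos_to_soa4_*` and `__soa_to_aos3_*` builtins above are rewritten against the m4 `WIDTH` symbol instead of taking the vector width as the `$1` macro argument; the data movement itself is unchanged. For readers who do not want to decode the IR, the scalar C++ reference below shows the layout transform the 4-wide AOS-to-SOA helper performs; the function name `aos_to_soa4_float_ref` and the explicit `width` parameter are hypothetical, and the real builtins use WIDTH-wide vector loads followed by a target-specific transpose.]

#include <cstddef>

// Scalar reference of the AOS -> SOA transform done by __aos_to_soa4_float.
// p holds width structs of 4 contiguous floats; out0..out3 receive one
// component each, densely packed.
static void aos_to_soa4_float_ref(const float *p, size_t width,
                                  float *out0, float *out1,
                                  float *out2, float *out3) {
    for (size_t i = 0; i < width; ++i) {
        out0[i] = p[4 * i + 0];   // x components
        out1[i] = p[4 * i + 1];   // y components
        out2[i] = p[4 * i + 2];   // z components
        out3[i] = p[4 * i + 3];   // w components
    }
}

[End of editor's note.]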
declare void @abort() noreturn -define void @__do_assert_uniform(i8 *%str, i1 %test, <$1 x i32> %mask) { +define void @__do_assert_uniform(i8 *%str, i1 %test, %mask) { br i1 %test, label %ok, label %fail fail: @@ -1860,12 +1871,12 @@ ok: } -define void @__do_assert_varying(i8 *%str, <$1 x i32> %test, - <$1 x i32> %mask) { - %nottest = xor <$1 x i32> %test, - < forloop(i, 1, eval($1-1), `i32 -1, ') i32 -1 > - %nottest_and_mask = and <$1 x i32> %nottest, %mask - %mm = call i32 @__movmsk(<$1 x i32> %nottest_and_mask) +define void @__do_assert_varying(i8 *%str, %test, + %mask) { + %nottest = xor %test, + < forloop(i, 1, eval(WIDTH-1), `MASK -1, ') MASK -1 > + %nottest_and_mask = and %nottest, %mask + %mm = call i32 @__movmsk( %nottest_and_mask) %all_ok = icmp eq i32 %mm, 0 br i1 %all_ok, label %ok, label %fail @@ -2010,118 +2021,118 @@ define void @__memory_barrier() nounwind readnone alwaysinline { ret void } -global_atomic_associative($1, add, i32, int32, 0) -global_atomic_associative($1, sub, i32, int32, 0) -global_atomic_associative($1, and, i32, int32, -1) -global_atomic_associative($1, or, i32, int32, 0) -global_atomic_associative($1, xor, i32, int32, 0) -global_atomic_uniform($1, add, i32, int32) -global_atomic_uniform($1, sub, i32, int32) -global_atomic_uniform($1, and, i32, int32) -global_atomic_uniform($1, or, i32, int32) -global_atomic_uniform($1, xor, i32, int32) -global_atomic_uniform($1, min, i32, int32) -global_atomic_uniform($1, max, i32, int32) -global_atomic_uniform($1, umin, i32, uint32) -global_atomic_uniform($1, umax, i32, uint32) +global_atomic_associative(WIDTH, add, i32, int32, 0) +global_atomic_associative(WIDTH, sub, i32, int32, 0) +global_atomic_associative(WIDTH, and, i32, int32, -1) +global_atomic_associative(WIDTH, or, i32, int32, 0) +global_atomic_associative(WIDTH, xor, i32, int32, 0) +global_atomic_uniform(WIDTH, add, i32, int32) +global_atomic_uniform(WIDTH, sub, i32, int32) +global_atomic_uniform(WIDTH, and, i32, int32) +global_atomic_uniform(WIDTH, or, i32, int32) +global_atomic_uniform(WIDTH, xor, i32, int32) +global_atomic_uniform(WIDTH, min, i32, int32) +global_atomic_uniform(WIDTH, max, i32, int32) +global_atomic_uniform(WIDTH, umin, i32, uint32) +global_atomic_uniform(WIDTH, umax, i32, uint32) -global_atomic_associative($1, add, i64, int64, 0) -global_atomic_associative($1, sub, i64, int64, 0) -global_atomic_associative($1, and, i64, int64, -1) -global_atomic_associative($1, or, i64, int64, 0) -global_atomic_associative($1, xor, i64, int64, 0) -global_atomic_uniform($1, add, i64, int64) -global_atomic_uniform($1, sub, i64, int64) -global_atomic_uniform($1, and, i64, int64) -global_atomic_uniform($1, or, i64, int64) -global_atomic_uniform($1, xor, i64, int64) -global_atomic_uniform($1, min, i64, int64) -global_atomic_uniform($1, max, i64, int64) -global_atomic_uniform($1, umin, i64, uint64) -global_atomic_uniform($1, umax, i64, uint64) +global_atomic_associative(WIDTH, add, i64, int64, 0) +global_atomic_associative(WIDTH, sub, i64, int64, 0) +global_atomic_associative(WIDTH, and, i64, int64, -1) +global_atomic_associative(WIDTH, or, i64, int64, 0) +global_atomic_associative(WIDTH, xor, i64, int64, 0) +global_atomic_uniform(WIDTH, add, i64, int64) +global_atomic_uniform(WIDTH, sub, i64, int64) +global_atomic_uniform(WIDTH, and, i64, int64) +global_atomic_uniform(WIDTH, or, i64, int64) +global_atomic_uniform(WIDTH, xor, i64, int64) +global_atomic_uniform(WIDTH, min, i64, int64) +global_atomic_uniform(WIDTH, max, i64, int64) +global_atomic_uniform(WIDTH, 
umin, i64, uint64) +global_atomic_uniform(WIDTH, umax, i64, uint64) -global_swap($1, i32, int32) -global_swap($1, i64, int64) +global_swap(WIDTH, i32, int32) +global_swap(WIDTH, i64, int64) -define <$1 x float> @__atomic_swap_float_global(float * %ptr, <$1 x float> %val, - <$1 x i32> %mask) nounwind alwaysinline { +define @__atomic_swap_float_global(float * %ptr, %val, + %mask) nounwind alwaysinline { %iptr = bitcast float * %ptr to i32 * - %ival = bitcast <$1 x float> %val to <$1 x i32> - %iret = call <$1 x i32> @__atomic_swap_int32_global(i32 * %iptr, <$1 x i32> %ival, <$1 x i32> %mask) - %ret = bitcast <$1 x i32> %iret to <$1 x float> - ret <$1 x float> %ret + %ival = bitcast %val to + %iret = call @__atomic_swap_int32_global(i32 * %iptr, %ival, %mask) + %ret = bitcast %iret to + ret %ret } -define <$1 x double> @__atomic_swap_double_global(double * %ptr, <$1 x double> %val, - <$1 x i32> %mask) nounwind alwaysinline { +define @__atomic_swap_double_global(double * %ptr, %val, + %mask) nounwind alwaysinline { %iptr = bitcast double * %ptr to i64 * - %ival = bitcast <$1 x double> %val to <$1 x i64> - %iret = call <$1 x i64> @__atomic_swap_int64_global(i64 * %iptr, <$1 x i64> %ival, <$1 x i32> %mask) - %ret = bitcast <$1 x i64> %iret to <$1 x double> - ret <$1 x double> %ret + %ival = bitcast %val to + %iret = call @__atomic_swap_int64_global(i64 * %iptr, %ival, %mask) + %ret = bitcast %iret to + ret %ret } define float @__atomic_swap_uniform_float_global(float * %ptr, float %val, - <$1 x i32> %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { %iptr = bitcast float * %ptr to i32 * %ival = bitcast float %val to i32 - %iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival, <$1 x i32> %mask) + %iret = call i32 @__atomic_swap_uniform_int32_global(i32 * %iptr, i32 %ival, %mask) %ret = bitcast i32 %iret to float ret float %ret } define double @__atomic_swap_uniform_double_global(double * %ptr, double %val, - <$1 x i32> %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { %iptr = bitcast double * %ptr to i64 * %ival = bitcast double %val to i64 - %iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival, <$1 x i32> %mask) + %iret = call i64 @__atomic_swap_uniform_int64_global(i64 * %iptr, i64 %ival, %mask) %ret = bitcast i64 %iret to double ret double %ret } -global_atomic_exchange($1, i32, int32) -global_atomic_exchange($1, i64, int64) +global_atomic_exchange(WIDTH, i32, int32) +global_atomic_exchange(WIDTH, i64, int64) -define <$1 x float> @__atomic_compare_exchange_float_global(float * %ptr, - <$1 x float> %cmp, <$1 x float> %val, <$1 x i32> %mask) nounwind alwaysinline { +define @__atomic_compare_exchange_float_global(float * %ptr, + %cmp, %val, %mask) nounwind alwaysinline { %iptr = bitcast float * %ptr to i32 * - %icmp = bitcast <$1 x float> %cmp to <$1 x i32> - %ival = bitcast <$1 x float> %val to <$1 x i32> - %iret = call <$1 x i32> @__atomic_compare_exchange_int32_global(i32 * %iptr, <$1 x i32> %icmp, - <$1 x i32> %ival, <$1 x i32> %mask) - %ret = bitcast <$1 x i32> %iret to <$1 x float> - ret <$1 x float> %ret + %icmp = bitcast %cmp to + %ival = bitcast %val to + %iret = call @__atomic_compare_exchange_int32_global(i32 * %iptr, %icmp, + %ival, %mask) + %ret = bitcast %iret to + ret %ret } -define <$1 x double> @__atomic_compare_exchange_double_global(double * %ptr, - <$1 x double> %cmp, <$1 x double> %val, <$1 x i32> %mask) nounwind alwaysinline { +define @__atomic_compare_exchange_double_global(double * %ptr, + %cmp, %val, 
%mask) nounwind alwaysinline { %iptr = bitcast double * %ptr to i64 * - %icmp = bitcast <$1 x double> %cmp to <$1 x i64> - %ival = bitcast <$1 x double> %val to <$1 x i64> - %iret = call <$1 x i64> @__atomic_compare_exchange_int64_global(i64 * %iptr, <$1 x i64> %icmp, - <$1 x i64> %ival, <$1 x i32> %mask) - %ret = bitcast <$1 x i64> %iret to <$1 x double> - ret <$1 x double> %ret + %icmp = bitcast %cmp to + %ival = bitcast %val to + %iret = call @__atomic_compare_exchange_int64_global(i64 * %iptr, %icmp, + %ival, %mask) + %ret = bitcast %iret to + ret %ret } define float @__atomic_compare_exchange_uniform_float_global(float * %ptr, float %cmp, float %val, - <$1 x i32> %mask) nounwind alwaysinline { + %mask) nounwind alwaysinline { %iptr = bitcast float * %ptr to i32 * %icmp = bitcast float %cmp to i32 %ival = bitcast float %val to i32 %iret = call i32 @__atomic_compare_exchange_uniform_int32_global(i32 * %iptr, i32 %icmp, - i32 %ival, <$1 x i32> %mask) + i32 %ival, %mask) %ret = bitcast i32 %iret to float ret float %ret } define double @__atomic_compare_exchange_uniform_double_global(double * %ptr, double %cmp, - double %val, <$1 x i32> %mask) nounwind alwaysinline { + double %val, %mask) nounwind alwaysinline { %iptr = bitcast double * %ptr to i64 * %icmp = bitcast double %cmp to i64 %ival = bitcast double %val to i64 %iret = call i64 @__atomic_compare_exchange_uniform_int64_global(i64 * %iptr, i64 %icmp, - i64 %ival, <$1 x i32> %mask) + i64 %ival, %mask) %ret = bitcast i64 %iret to double ret double %ret } @@ -2168,10 +2179,10 @@ define <$1 x i64> @__$2_varying_$3(<$1 x i64>, <$1 x i64>) nounwind alwaysinline ;; vector width as a parameter define(`int64minmax', ` -i64minmax($1,min,int64,slt) -i64minmax($1,max,int64,sgt) -i64minmax($1,min,uint64,ult) -i64minmax($1,max,uint64,ugt) +i64minmax(WIDTH,min,int64,slt) +i64minmax(WIDTH,max,int64,sgt) +i64minmax(WIDTH,min,uint64,ult) +i64minmax(WIDTH,max,uint64,ugt) ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2410,24 +2421,24 @@ define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>, define(`packed_load_and_store', ` -define i32 @__packed_load_active(i32 * %startptr, <$1 x i32> * %val_ptr, - <$1 x i32> %full_mask) nounwind alwaysinline { +define i32 @__packed_load_active(i32 * %startptr, * %val_ptr, + %full_mask) nounwind alwaysinline { entry: - %mask = call i32 @__movmsk(<$1 x i32> %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask) + %mask = call i32 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: - %allon = icmp eq i32 %mask, eval((1 << $1) -1) + %allon = icmp eq i32 %mask, eval((1 << WIDTH) -1) br i1 %allon, label %all_on, label %unknown_mask all_on: ;; everyone wants to load, so just load an entire vector width in a single ;; vector load - %vecptr = bitcast i32 *%startptr to <$1 x i32> * - %vec_load = load <$1 x i32> *%vecptr, align 4 - store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr, align 4 - ret i32 $1 + %vecptr = bitcast i32 *%startptr to * + %vec_load = load *%vecptr, align 4 + store %vec_load, * %val_ptr, align 4 + ret i32 WIDTH unknown_mask: br label %loop @@ -2445,7 +2456,7 @@ loop: load: %loadptr = getelementptr i32 *%startptr, i32 %offset %loadval = load i32 *%loadptr - %val_ptr_i32 = bitcast <$1 x i32> * %val_ptr to i32 * + %val_ptr_i32 = bitcast * %val_ptr to i32 * %storeptr = getelementptr i32 *%val_ptr_i32, i32 %lane 
store i32 %loadval, i32 *%storeptr %offset1 = add i32 %offset, 1 @@ -2457,28 +2468,28 @@ loopend: %nextlanemask = mul i32 %lanemask, 2 ; are we done yet? - %test = icmp ne i32 %nextlane, $1 + %test = icmp ne i32 %nextlane, WIDTH br i1 %test, label %loop, label %done done: ret i32 %nextoffset } -define i32 @__packed_store_active(i32 * %startptr, <$1 x i32> %vals, - <$1 x i32> %full_mask) nounwind alwaysinline { +define i32 @__packed_store_active(i32 * %startptr, %vals, + %full_mask) nounwind alwaysinline { entry: - %mask = call i32 @__movmsk(<$1 x i32> %full_mask) - %mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask) + %mask = call i32 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) br i1 %mask_known, label %known_mask, label %unknown_mask known_mask: - %allon = icmp eq i32 %mask, eval((1 << $1) -1) + %allon = icmp eq i32 %mask, eval((1 << WIDTH) -1) br i1 %allon, label %all_on, label %unknown_mask all_on: - %vecptr = bitcast i32 *%startptr to <$1 x i32> * - store <$1 x i32> %vals, <$1 x i32> * %vecptr, align 4 - ret i32 $1 + %vecptr = bitcast i32 *%startptr to * + store %vals, * %vecptr, align 4 + ret i32 WIDTH unknown_mask: br label %loop @@ -2494,7 +2505,7 @@ loop: br i1 %do_store, label %store, label %loopend store: - %storeval = extractelement <$1 x i32> %vals, i32 %lane + %storeval = extractelement %vals, i32 %lane %storeptr = getelementptr i32 *%startptr, i32 %offset store i32 %storeval, i32 *%storeptr %offset1 = add i32 %offset, 1 @@ -2506,7 +2517,7 @@ loopend: %nextlanemask = mul i32 %lanemask, 2 ; are we done yet? - %test = icmp ne i32 %nextlane, $1 + %test = icmp ne i32 %nextlane, WIDTH br i1 %test, label %loop, label %done done: @@ -2613,7 +2624,7 @@ reduce_equal_aux($1, double, double, i64, fcmp, 64) define(`exclusive_scan', ` define <$1 x $2> @__exclusive_scan_$6(<$1 x $2> %v, - <$1 x i32> %mask) nounwind alwaysinline { + <$1 x MASK> %mask) nounwind alwaysinline { ; first, set the value of any off lanes to the identity value %ptr = alloca <$1 x $2> %idvec1 = bitcast $2 $5 to <1 x $2> @@ -2623,7 +2634,7 @@ define <$1 x $2> @__exclusive_scan_$6(<$1 x $2> %v, %ptr`'$3 = bitcast <$1 x $2> * %ptr to <$1 x i`'$3> * %vi = bitcast <$1 x $2> %v to <$1 x i`'$3> call void @__masked_store_blend_$3(<$1 x i`'$3> * %ptr`'$3, <$1 x i`'$3> %vi, - <$1 x i32> %mask) + <$1 x MASK> %mask) %v_id = load <$1 x $2> * %ptr ; extract elements of the vector to use in computing the scan @@ -2649,16 +2660,16 @@ define <$1 x $2> @__exclusive_scan_$6(<$1 x $2> %v, ') define(`scans', ` -exclusive_scan($1, i32, 32, add, 0, add_i32) -exclusive_scan($1, float, 32, fadd, zeroinitializer, add_float) -exclusive_scan($1, i64, 64, add, 0, add_i64) -exclusive_scan($1, double, 64, fadd, zeroinitializer, add_double) +exclusive_scan(WIDTH, i32, 32, add, 0, add_i32) +exclusive_scan(WIDTH, float, 32, fadd, zeroinitializer, add_float) +exclusive_scan(WIDTH, i64, 64, add, 0, add_i64) +exclusive_scan(WIDTH, double, 64, fadd, zeroinitializer, add_double) -exclusive_scan($1, i32, 32, and, -1, and_i32) -exclusive_scan($1, i64, 64, and, -1, and_i64) +exclusive_scan(WIDTH, i32, 32, and, -1, and_i32) +exclusive_scan(WIDTH, i64, 64, and, -1, and_i64) -exclusive_scan($1, i32, 32, or, 0, or_i32) -exclusive_scan($1, i64, 64, or, 0, or_i64) +exclusive_scan(WIDTH, i32, 32, or, 0, or_i32) +exclusive_scan(WIDTH, i64, 64, or, 0, or_i64) ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/ctx.cpp b/ctx.cpp index 043f7acc..694a3b1d 
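[Editor's note — hedged sketch, not part of the patch. Once parameterized by `WIDTH`, `__packed_load_active` and `__packed_store_active` above keep the same two-path shape: if the mask is known at compile time to be all-on, a single full-width vector load/store is issued and `WIDTH` is returned; otherwise the code walks the lanes, moving one i32 per active lane and counting how many were transferred. The C++ below is only a reference for the store path's semantics; `packed_store_active_ref` and the `bool *mask` representation are illustrative, not the emitted code.]

#include <cstdint>

// Reference semantics for __packed_store_active with a width-lane mask:
// store vals[lane] for each active lane, densely packed at startptr,
// and return how many values were written (the builtin's %nextoffset).
static int packed_store_active_ref(int32_t *startptr, const int32_t *vals,
                                   const bool *mask, int width) {
    int offset = 0;
    for (int lane = 0; lane < width; ++lane)
        if (mask[lane])
            startptr[offset++] = vals[lane];
    return offset;
}

// The real builtin additionally calls __is_compile_time_constant_mask and,
// when the mask is provably all-on, replaces the loop with one vector store.

[End of editor's note.]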
100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -875,8 +875,11 @@ FunctionEmitContext::LaneMask(llvm::Value *v) { // into an i32 value std::vector mm; m->symbolTable->LookupFunction("__movmsk", &mm); - // There should be one with signed int signature, one unsigned int. - Assert(mm.size() == 2); + if (g->target.maskBitCount == 1) + Assert(mm.size() == 1); + else + // There should be one with signed int signature, one unsigned int. + Assert(mm.size() == 2); // We can actually call either one, since both are i32s as far as // LLVM's type system is concerned... llvm::Function *fmm = mm[0]->function; @@ -929,6 +932,9 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) { return NULL; } + if (g->target.maskBitCount == 1) + return b; + LLVM_TYPE_CONST llvm::ArrayType *at = llvm::dyn_cast(b->getType()); if (at) { diff --git a/func.cpp b/func.cpp index 61dfb784..4c8d2222 100644 --- a/func.cpp +++ b/func.cpp @@ -288,7 +288,10 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, bool checkMask = (type->isTask == true) || ((function->hasFnAttr(llvm::Attribute::AlwaysInline) == false) && costEstimate > CHECK_MASK_AT_FUNCTION_START_COST); - if (checkMask && g->opt.disableCoherentControlFlow == false) { + checkMask &= (g->target.maskingIsFree == false); + checkMask &= (g->opt.disableCoherentControlFlow == false); + + if (checkMask) { llvm::Value *mask = ctx->GetFunctionMask(); llvm::Value *allOn = ctx->All(mask); llvm::BasicBlock *bbAllOn = ctx->CreateBasicBlock("all_on"); diff --git a/ispc.cpp b/ispc.cpp index 8bfc9a9d..8cc618c3 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -129,24 +129,60 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->nativeVectorWidth = 4; t->vectorWidth = 4; t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } else if (!strcasecmp(isa, "sse2-x2")) { t->isa = Target::SSE2; t->nativeVectorWidth = 4; t->vectorWidth = 8; t->attributes = "+sse,+sse2,-sse3,-sse41,-sse42,-sse4a,-ssse3,-popcnt"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } else if (!strcasecmp(isa, "sse4")) { t->isa = Target::SSE4; t->nativeVectorWidth = 4; t->vectorWidth = 4; t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } else if (!strcasecmp(isa, "sse4x2") || !strcasecmp(isa, "sse4-x2")) { t->isa = Target::SSE4; t->nativeVectorWidth = 4; t->vectorWidth = 8; t->attributes = "+sse,+sse2,+sse3,+sse41,-sse42,-sse4a,+ssse3,-popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; + } + else if (!strcasecmp(isa, "generic-4")) { + t->isa = Target::GENERIC; + t->nativeVectorWidth = 4; + t->vectorWidth = 4; + t->maskingIsFree = true; + t->allOffMaskIsSafe = true; + t->maskBitCount = 1; + } + else if (!strcasecmp(isa, "generic-8")) { + t->isa = Target::GENERIC; + t->nativeVectorWidth = 8; + t->vectorWidth = 8; + t->maskingIsFree = true; + t->allOffMaskIsSafe = true; + t->maskBitCount = 1; + } + else if (!strcasecmp(isa, "generic-16")) { + t->isa = Target::GENERIC; + t->nativeVectorWidth = 16; + t->vectorWidth = 16; + t->maskingIsFree = true; + t->allOffMaskIsSafe = true; + t->maskBitCount = 1; } #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn) else if (!strcasecmp(isa, "avx")) { @@ -154,12 +190,18 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, 
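[Editor's note — condensed sketch, not the literal patch. Each ISA entry in Target::GetTarget now records three mask-related properties alongside the vector widths: maskingIsFree, allOffMaskIsSafe, and maskBitCount (32 for the SSE/AVX targets, 1 for the new generic-4/8/16 targets). Later hunks use maskBitCount to decide whether the LLVM mask vector is built from i32 or i1 elements. The struct and helper below (MaskPolicy, policyFor) are hypothetical names used only to summarize that decision.]

// How the new per-target fields drive mask handling; values shown are the
// ones the patch assigns in ispc.cpp.
struct MaskPolicy {
    bool maskingIsFree;     // true -> no blend cost, skip "all on?" fast paths
    bool allOffMaskIsSafe;  // true -> memory ops may run with no lanes active
    int  maskBitCount;      // 32 -> <N x i32> mask, 1 -> <N x i1> mask
};

static MaskPolicy policyFor(bool isGenericTarget) {
    // SSE/AVX-style targets keep the 32-bit-per-lane mask; the generic
    // targets assume native masking support and use one bit per lane.
    return isGenericTarget ? MaskPolicy{true, true, 1}
                           : MaskPolicy{false, false, 32};
}

[End of editor's note.]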
t->nativeVectorWidth = 8; t->vectorWidth = 8; t->attributes = "+avx,+popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } else if (!strcasecmp(isa, "avx-x2")) { t->isa = Target::AVX; t->nativeVectorWidth = 8; t->vectorWidth = 16; t->attributes = "+avx,+popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } #endif // LLVM 3.0+ #if defined(LLVM_3_1svn) @@ -168,12 +210,18 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->nativeVectorWidth = 8; t->vectorWidth = 8; t->attributes = "+avx2,+popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } else if (!strcasecmp(isa, "avx2-x2")) { t->isa = Target::AVX2; t->nativeVectorWidth = 16; t->vectorWidth = 16; t->attributes = "+avx2,+popcnt,+cmov"; + t->maskingIsFree = false; + t->allOffMaskIsSafe = false; + t->maskBitCount = 32; } #endif // LLVM 3.1 else { @@ -221,7 +269,7 @@ Target::SupportedTargetISAs() { #ifdef LLVM_3_1svn ", avx2, avx2-x2" #endif // LLVM_3_1svn - ; + ", generic-4, generic-8, generic-16"; } @@ -300,6 +348,8 @@ Target::GetISAString() const { return "avx"; case Target::AVX2: return "avx2"; + case Target::GENERIC: + return "generic"; default: FATAL("Unhandled target in GetISAString()"); } diff --git a/ispc.h b/ispc.h index 6eb2cdd9..254c8311 100644 --- a/ispc.h +++ b/ispc.h @@ -193,7 +193,7 @@ struct Target { flexible/performant of them will apear last in the enumerant. Note also that __best_available_isa() needs to be updated if ISAs are added or the enumerant values are reordered. */ - enum ISA { SSE2, SSE4, AVX, AVX2, NUM_ISAS }; + enum ISA { SSE2, SSE4, AVX, AVX2, GENERIC, NUM_ISAS }; /** Instruction set being compiled to. */ ISA isa; @@ -222,6 +222,23 @@ struct Target { /** Indicates whether position independent code should be generated. */ bool generatePIC; + + /** Is there overhead associated with masking on the target + architecture; e.g. there is on SSE, due to extra blends and the + like, but there isn't with an ISA that supports masking + natively. */ + bool maskingIsFree; + + /** Is it safe to run code with the mask all if: e.g. on SSE, the fast + gather trick assumes that at least one program instance is running + (so that it can safely assume that the array base pointer is + valid). */ + bool allOffMaskIsSafe; + + /** How many bits are used to store each element of the mask: e.g. this + is 32 on SSE/AVX, since that matches the HW better, but it's 1 for + the generic target. 
*/ + int maskBitCount; }; diff --git a/ispc.vcxproj b/ispc.vcxproj index fb56b96c..96a6855d 100755 --- a/ispc.vcxproj +++ b/ispc.vcxproj @@ -22,11 +22,15 @@ + + + - + + 4146;4800;4996;4355;4624;4005;4003;4018 @@ -40,15 +44,15 @@ 4146;4800;4996;4355;4624;4005;4065 4146;4800;4996;4355;4624;4005;4065 - - %LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c > gen-bitcode-c-32.cpp; -%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c > gen-bitcode-c-64.cpp - clang builtins-c.c - %LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-32.c > gen-bitcode-c-32.cpp; -%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py builtins-c-64.c > gen-bitcode-c-64.cpp - clang builtins-c.c - gen-bitcode-c-32.cpp;gen-bitcore-c-64.cpp - gen-bitcode-c-32.cpp;gen-bitcore-c-64.cpp + + %LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-32 > gen-bitcode-c-32.cpp; +%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-64 > gen-bitcode-c-64.cpp + Building builtins.c + %LLVM_INSTALL_DIR%\bin\clang -m32 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-32 > gen-bitcode-c-32.cpp; +%LLVM_INSTALL_DIR%\bin\clang -m64 -emit-llvm builtins\builtins.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py c-64 > gen-bitcode-c-64.cpp + Building builtins.c + gen-bitcode-c-32.cpp;gen-bitcode-c-64.cpp + gen-bitcode-c-32.cpp;gen-bitcode-c-64.cpp @@ -75,105 +79,148 @@ Document - %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp - gen-stdlib.cpp - %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp - gen-stdlib.cpp - Building gen-stdlib.cpp - Building gen-stdlib.cpp + %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py x86 > gen-stdlib-x86.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DISPC_TARGET_GENERIC=1 -DPI=3.1415926535 | python stdlib2cpp.py generic > gen-stdlib-generic.cpp; + + gen-stdlib-generic.cpp;gen-stdlib-x86.cpp + %LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DPI=3.1415926535 | python stdlib2cpp.py x86 > gen-stdlib-x86.cpp; +%LLVM_INSTALL_DIR%\bin\clang -E -x c %(Filename).ispc -DISPC=1 -DISPC_TARGET_GENERIC=1 -DPI=3.1415926535 | python stdlib2cpp.py generic > gen-stdlib-generic.cpp; + + gen-stdlib-generic.cpp;gen-stdlib-x86.cpp + Building gen-stdlib-{generic,x86}.cpp + Building gen-stdlib-{generic,x86}.cpp - + Document - m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll > gen-bitcode-sse4.cpp - gen-bitcode-sse4.cpp - builtins.m4;builtins-sse4-common.ll - m4 builtins.m4 builtins-sse4.ll | python bitcode2cpp.py builtins-sse4.ll > gen-bitcode-sse4.cpp - gen-bitcode-sse4.cpp - builtins.m4;builtins-sse4-common.ll - Building gen-bitcode-sse4.cpp - Building gen-bitcode-sse4.cpp - - - - - Document - m4 builtins.m4 builtins-dispatch.ll | python bitcode2cpp.py builtins-dispatch.ll > gen-bitcode-dispatch.cpp + m4 -Ibuiltins/ 
-DLLVM_VERSION=%LLVM_VERSION% builtins\dispatch.ll | python bitcode2cpp.py dispatch.ll > gen-bitcode-dispatch.cpp gen-bitcode-dispatch.cpp - builtins.m4 - m4 builtins.m4 builtins-dispatch.ll | python bitcode2cpp.py builtins-dispatch.ll > gen-bitcode-dispatch.cpp + builtins\util.m4 + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\dispatch.ll | python bitcode2cpp.py dispatch.ll > gen-bitcode-dispatch.cpp gen-bitcode-dispatch.cpp - builtins.m4 + builtins\util.m4 Building gen-bitcode-dispatch.cpp Building gen-bitcode-dispatch.cpp - + Document - m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll > gen-bitcode-sse4-x2.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll > gen-bitcode-sse4.cpp + gen-bitcode-sse4.cpp + builtins\util.m4;builtins\target-sse4-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4.ll | python bitcode2cpp.py builtins\target-sse4.ll > gen-bitcode-sse4.cpp + gen-bitcode-sse4.cpp + builtins\util.m4;builtins\target-sse4-common.ll + Building gen-bitcode-sse4.cpp + Building gen-bitcode-sse4.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll > gen-bitcode-sse4-x2.cpp gen-bitcode-sse4-x2.cpp - builtins.m4;builtins-sse4-common.ll - m4 builtins.m4 builtins-sse4-x2.ll | python bitcode2cpp.py builtins-sse4-x2.ll > gen-bitcode-sse4-x2.cpp + builtins\util.m4;builtins\target-sse4-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse4-x2.ll | python bitcode2cpp.py builtins\target-sse4-x2.ll > gen-bitcode-sse4-x2.cpp gen-bitcode-sse4-x2.cpp - builtins.m4;builtins-sse4-common.ll + builtins\util.m4;builtins\target-sse4-common.ll Building gen-bitcode-sse4-x2.cpp Building gen-bitcode-sse4-x2.cpp - + Document - m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll > gen-bitcode-sse2.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll > gen-bitcode-sse2.cpp gen-bitcode-sse2.cpp - builtins.m4;builtins-sse2-common.ll - m4 builtins.m4 builtins-sse2.ll | python bitcode2cpp.py builtins-sse2.ll > gen-bitcode-sse2.cpp + builtins\util.m4;builtins\target-sse2-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2.ll | python bitcode2cpp.py builtins\target-sse2.ll > gen-bitcode-sse2.cpp gen-bitcode-sse2.cpp - builtins.m4;builtins-sse2-common.ll + builtins\util.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2.cpp Building gen-bitcode-sse2.cpp - + Document - m4 builtins.m4 builtins-sse2-x2.ll | python bitcode2cpp.py builtins-sse2-x2.ll > gen-bitcode-sse2-x2.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll > gen-bitcode-sse2-x2.cpp gen-bitcode-sse2-x2.cpp - builtins.m4;builtins-sse2-common.ll - m4 builtins.m4 builtins-sse2-x2.ll | python bitcode2cpp.py builtins-sse2-x2.ll > gen-bitcode-sse2-x2.cpp + builtins\util.m4;builtins\target-sse2-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-sse2-x2.ll | python bitcode2cpp.py builtins\target-sse2-x2.ll > gen-bitcode-sse2-x2.cpp gen-bitcode-sse2-x2.cpp - builtins.m4;builtins-sse2-common.ll + builtins\util.m4;builtins\target-sse2-common.ll Building gen-bitcode-sse2-x2.cpp Building gen-bitcode-sse2-x2.cpp - + Document - m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll > 
gen-bitcode-avx.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx.ll | python bitcode2cpp.py builtins\target-avx.ll > gen-bitcode-avx.cpp gen-bitcode-avx.cpp - builtins.m4;builtins-avx-common.ll - m4 builtins.m4 builtins-avx.ll | python bitcode2cpp.py builtins-avx.ll > gen-bitcode-avx.cpp + builtins\util.m4;builtins\target-avx-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx.ll | python bitcode2cpp.py builtins\target-avx.ll > gen-bitcode-avx.cpp gen-bitcode-avx.cpp - builtins.m4;builtins-avx-common.ll + builtins\util.m4;builtins\target-avx-common.ll Building gen-bitcode-avx.cpp Building gen-bitcode-avx.cpp - + Document - m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll > gen-bitcode-avx-x2.cpp + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx-x2.ll | python bitcode2cpp.py builtins\target-avx-x2.ll > gen-bitcode-avx-x2.cpp gen-bitcode-avx-x2.cpp - builtins.m4;builtins-sse.ll - m4 builtins.m4 builtins-avx-x2.ll | python bitcode2cpp.py builtins-avx-x2.ll > gen-bitcode-avx-x2.cpp + builtins\util.m4;builtins\target-avx-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-avx-x2.ll | python bitcode2cpp.py builtins\target-avx-x2.ll > gen-bitcode-avx-x2.cpp gen-bitcode-avx-x2.cpp - builtins.m4;builtins-sse.ll + builtins\util.m4;builtins\target-avx-common.ll Building gen-bitcode-avx-x2.cpp Building gen-bitcode-avx-x2.cpp + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll > gen-bitcode-generic-4.cpp + gen-bitcode-generic-4.cpp + builtins\util.m4;builtins\target-generic-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-4.ll | python bitcode2cpp.py builtins\target-generic-4.ll > gen-bitcode-generic-4.cpp + gen-bitcode-generic-4.cpp + builtins\util.m4;builtins\target-generic-common.ll + Building gen-bitcode-generic-4.cpp + Building gen-bitcode-generic-4.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll > gen-bitcode-generic-8.cpp + gen-bitcode-generic-8.cpp + builtins\util.m4;builtins\target-generic-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-8.ll | python bitcode2cpp.py builtins\target-generic-8.ll > gen-bitcode-generic-8.cpp + gen-bitcode-generic-8.cpp + builtins\util.m4;builtins\target-generic-common.ll + Building gen-bitcode-generic-8.cpp + Building gen-bitcode-generic-8.cpp + + + + + Document + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll > gen-bitcode-generic-16.cpp + gen-bitcode-generic-16.cpp + builtins\util.m4;builtins\target-generic-common.ll + m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-16.ll | python bitcode2cpp.py builtins\target-generic-16.ll > gen-bitcode-generic-16.cpp + gen-bitcode-generic-16.cpp + builtins\util.m4;builtins\target-generic-common.ll + Building gen-bitcode-generic-16.cpp + Building gen-bitcode-generic-16.cpp + + Document diff --git a/llvmutil.cpp b/llvmutil.cpp index 6c440a91..4a50e337 100644 --- a/llvmutil.cpp +++ b/llvmutil.cpp @@ -105,11 +105,14 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) { LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0); LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0); - // Note that both the mask and 
bool vectors are vector of int32s - // (not i1s). LLVM ends up generating much better SSE code with - // this representation. - LLVMTypes::MaskType = LLVMTypes::BoolVectorType = - llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.vectorWidth); + if (target.maskBitCount == 1) + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.vectorWidth); + else { + assert(target.maskBitCount == 32); + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.vectorWidth); + } LLVMTypes::Int1VectorType = llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.vectorWidth); @@ -141,7 +144,11 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) { std::vector maskOnes; llvm::Constant *onMask = NULL; - onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1, + if (target.maskBitCount == 1) + onMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 1, + false /*unsigned*/); // 0x1 + else + onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1, true /*signed*/); // 0xffffffff for (int i = 0; i < target.vectorWidth; ++i) @@ -150,8 +157,12 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) { std::vector maskZeros; llvm::Constant *offMask = NULL; - offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0, - true /*signed*/); + if (target.maskBitCount == 1) + offMask = llvm::ConstantInt::get(llvm::Type::getInt1Ty(*ctx), 0, + true /*signed*/); + else + offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0, + true /*signed*/); for (int i = 0; i < target.vectorWidth; ++i) maskZeros.push_back(offMask); diff --git a/module.cpp b/module.cpp index 9fade4b9..5dc9b160 100644 --- a/module.cpp +++ b/module.cpp @@ -1158,22 +1158,14 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre opts.addMacroDef("PI=3.1415926535"); // Add #define for current compilation target - switch (g->target.isa) { - case Target::SSE2: - opts.addMacroDef("ISPC_TARGET_SSE2"); - break; - case Target::SSE4: - opts.addMacroDef("ISPC_TARGET_SSE4"); - break; - case Target::AVX: - opts.addMacroDef("ISPC_TARGET_AVX"); - break; - case Target::AVX2: - opts.addMacroDef("ISPC_TARGET_AVX2"); - break; - default: - FATAL("Unhandled target ISA in preprocessor symbol definition"); + char targetMacro[128]; + sprintf(targetMacro, "ISPC_TARGET_%s", g->target.GetISAString()); + char *p = targetMacro; + while (*p) { + *p = toupper(*p); + ++p; } + opts.addMacroDef(targetMacro); if (g->target.is32Bit) opts.addMacroDef("ISPC_POINTER_SIZE=32"); diff --git a/opt.cpp b/opt.cpp index c77a76f7..17458a06 100644 --- a/opt.cpp +++ b/opt.cpp @@ -2444,7 +2444,7 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { int count = sizeof(names) / sizeof(names[0]); for (int i = 0; i < count; ++i) { llvm::Function *f = m->module->getFunction(names[i]); - if (f != NULL) { + if (f != NULL && f->empty() == false) { f->setLinkage(llvm::GlobalValue::InternalLinkage); modifiedAny = true; } diff --git a/parse.yy b/parse.yy index 8510244a..70cb2b3f 100644 --- a/parse.yy +++ b/parse.yy @@ -1605,7 +1605,8 @@ lAddFunctionParams(Declarator *decl) { /** Add a symbol for the built-in mask variable to the symbol table */ static void lAddMaskToSymbolTable(SourcePos pos) { - const Type *t = AtomicType::VaryingConstUInt32; + const Type *t = g->target.isa == Target::GENERIC ? 
+ AtomicType::VaryingConstBool : AtomicType::VaryingConstUInt32; Symbol *maskSymbol = new Symbol("__mask", pos, t); m->symbolTable->AddVariable(maskSymbol); } diff --git a/stdlib.ispc b/stdlib.ispc index 1a804733..c3b02fa7 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -38,6 +38,14 @@ ispc code */ +#ifdef ISPC_TARGET_GENERIC +#define IntMaskType bool +#define UIntMaskType bool +#else +#define IntMaskType int32 +#define UIntMaskType unsigned int32 +#endif + /////////////////////////////////////////////////////////////////////////// // Low level primitives @@ -274,13 +282,21 @@ static inline int32 sign_extend(bool v) { static inline uniform bool any(bool v) { // We only care about whether "any" is true for the active program instances, // so we have to make v with the current program mask. +#ifdef ISPC_TARGET_GENERIC + return __movmsk(v & __mask) != 0; +#else return __movmsk(__sext_varying_bool(v) & __mask) != 0; +#endif } static inline uniform bool all(bool v) { // As with any(), we need to explicitly mask v with the current program mask // so we're only looking at the current lanes +#ifdef ISPC_TARGET_GENERIC + bool match = ((v & __mask) == __mask); +#else int32 match = __sext_varying_bool((__sext_varying_bool(v) & __mask) == __mask); +#endif return __movmsk(match) == (1 << programCount) - 1; } @@ -308,7 +324,11 @@ static inline int popcnt(int64 v) { static inline uniform int popcnt(bool v) { // As with any() and all(), only count across the active lanes +#ifdef ISPC_TARGET_GENERIC + return __popcnt_int32(__movmsk(v & __mask)); +#else return __popcnt_int32(__movmsk(__sext_varying_bool(v) & __mask)); +#endif } static inline uniform int lanemask() { @@ -672,19 +692,19 @@ static inline uniform bool reduce_equal(TYPE v, uniform TYPE * uniform value) { return __reduce_equal_##FUNCTYPE(v, value, (MASKTYPE)__mask); \ } -REDUCE_EQUAL(int32, int32, int32) -REDUCE_EQUAL(unsigned int32, int32, unsigned int32) -REDUCE_EQUAL(float, float, int32) -REDUCE_EQUAL(int64, int64, int32) -REDUCE_EQUAL(unsigned int64, int64, unsigned int32) -REDUCE_EQUAL(double, double, int32) +REDUCE_EQUAL(int32, int32, IntMaskType) +REDUCE_EQUAL(unsigned int32, int32, UIntMaskType) +REDUCE_EQUAL(float, float, IntMaskType) +REDUCE_EQUAL(int64, int64, IntMaskType) +REDUCE_EQUAL(unsigned int64, int64, UIntMaskType) +REDUCE_EQUAL(double, double, IntMaskType) static int32 exclusive_scan_add(int32 v) { - return __exclusive_scan_add_i32(v, (int32)__mask); + return __exclusive_scan_add_i32(v, (IntMaskType)__mask); } static unsigned int32 exclusive_scan_add(unsigned int32 v) { - return __exclusive_scan_add_i32(v, __mask); + return __exclusive_scan_add_i32((int32)v, (IntMaskType)__mask); } static float exclusive_scan_add(float v) { @@ -692,11 +712,11 @@ static float exclusive_scan_add(float v) { } static int64 exclusive_scan_add(int64 v) { - return __exclusive_scan_add_i64(v, (int32)__mask); + return __exclusive_scan_add_i64(v, (IntMaskType)__mask); } static unsigned int64 exclusive_scan_add(unsigned int64 v) { - return __exclusive_scan_add_i64(v, __mask); + return __exclusive_scan_add_i64(v, (UIntMaskType)__mask); } static double exclusive_scan_add(double v) { @@ -704,35 +724,35 @@ static double exclusive_scan_add(double v) { } static int32 exclusive_scan_and(int32 v) { - return __exclusive_scan_and_i32(v, (int32)__mask); + return __exclusive_scan_and_i32(v, (IntMaskType)__mask); } static unsigned int32 exclusive_scan_and(unsigned int32 v) { - return __exclusive_scan_and_i32(v, __mask); + return __exclusive_scan_and_i32(v, 
(UIntMaskType)__mask); } static int64 exclusive_scan_and(int64 v) { - return __exclusive_scan_and_i64(v, (int32)__mask); + return __exclusive_scan_and_i64(v, (IntMaskType)__mask); } static unsigned int64 exclusive_scan_and(unsigned int64 v) { - return __exclusive_scan_and_i64(v, __mask); + return __exclusive_scan_and_i64(v, (UIntMaskType)__mask); } static int32 exclusive_scan_or(int32 v) { - return __exclusive_scan_or_i32(v, (int32)__mask); + return __exclusive_scan_or_i32(v, (IntMaskType)__mask); } static unsigned int32 exclusive_scan_or(unsigned int32 v) { - return __exclusive_scan_or_i32(v, __mask); + return __exclusive_scan_or_i32(v, (UIntMaskType)__mask); } static int64 exclusive_scan_or(int64 v) { - return __exclusive_scan_or_i64(v, (int32)__mask); + return __exclusive_scan_or_i64(v, (IntMaskType)__mask); } static unsigned int64 exclusive_scan_or(unsigned int64 v) { - return __exclusive_scan_or_i64(v, __mask); + return __exclusive_scan_or_i64(v, (UIntMaskType)__mask); } /////////////////////////////////////////////////////////////////////////// @@ -741,23 +761,23 @@ static unsigned int64 exclusive_scan_or(unsigned int64 v) { static inline uniform int packed_load_active(uniform unsigned int * uniform a, unsigned int * uniform vals) { - return __packed_load_active(a, vals, (unsigned int32)__mask); + return __packed_load_active(a, vals, (UIntMaskType)__mask); } static inline uniform int packed_store_active(uniform unsigned int * uniform a, unsigned int vals) { - return __packed_store_active(a, vals, (unsigned int32)__mask); + return __packed_store_active(a, vals, (UIntMaskType)__mask); } static inline uniform int packed_load_active(uniform int * uniform a, int * uniform vals) { - return __packed_load_active(a, vals, (int32)__mask); + return __packed_load_active(a, vals, (IntMaskType)__mask); } static inline uniform int packed_store_active(uniform int * uniform a, int vals) { - return __packed_store_active(a, vals, (int32)__mask); + return __packed_store_active(a, vals, (IntMaskType)__mask); } /////////////////////////////////////////////////////////////////////////// @@ -848,49 +868,49 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \ return ret; \ } -DEFINE_ATOMIC_OP(int32,int32,add,add,int32) -DEFINE_ATOMIC_OP(int32,int32,subtract,sub,int32) -DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,int32) -DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,int32) -DEFINE_ATOMIC_OP(int32,int32,and,and,int32) -DEFINE_ATOMIC_OP(int32,int32,or,or,int32) -DEFINE_ATOMIC_OP(int32,int32,xor,xor,int32) -DEFINE_ATOMIC_OP(int32,int32,swap,swap,int32) +DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType) +DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType) +DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType) +DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType) +DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType) +DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType) +DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType) +DEFINE_ATOMIC_OP(int32,int32,swap,swap,IntMaskType) // For everything but atomic min and max, we can use the same // implementations for unsigned as for signed. 
-DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,unsigned int32) -DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,unsigned int32) -DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,unsigned int32) +DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType) +DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType) +DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int32,int32,swap,swap,UIntMaskType) -DEFINE_ATOMIC_OP(float,float,swap,swap,int32) +DEFINE_ATOMIC_OP(float,float,swap,swap,IntMaskType) -DEFINE_ATOMIC_OP(int64,int64,add,add,int32) -DEFINE_ATOMIC_OP(int64,int64,subtract,sub,int32) -DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,int32) -DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,int32) -DEFINE_ATOMIC_OP(int64,int64,and,and,int32) -DEFINE_ATOMIC_OP(int64,int64,or,or,int32) -DEFINE_ATOMIC_OP(int64,int64,xor,xor,int32) -DEFINE_ATOMIC_OP(int64,int64,swap,swap,int32) +DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType) +DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType) +DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType) +DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType) +DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType) +DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType) +DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType) +DEFINE_ATOMIC_OP(int64,int64,swap,swap,IntMaskType) // For everything but atomic min and max, we can use the same // implementations for unsigned as for signed. 
-DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,unsigned int32) -DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,unsigned int32) -DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,unsigned int32) -DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,unsigned int32) +DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType) +DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType) +DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType) +DEFINE_ATOMIC_OP(unsigned int64,int64,swap,swap,UIntMaskType) -DEFINE_ATOMIC_OP(double,double,swap,swap,int32) +DEFINE_ATOMIC_OP(double,double,swap,swap,IntMaskType) #undef DEFINE_ATOMIC_OP @@ -913,12 +933,12 @@ static inline uniform TA atomic_compare_exchange_global( \ return ret; \ } -ATOMIC_DECL_CMPXCHG(int32, int32, int32) -ATOMIC_DECL_CMPXCHG(unsigned int32, int32, unsigned int32) -ATOMIC_DECL_CMPXCHG(float, float, int32) -ATOMIC_DECL_CMPXCHG(int64, int64, int32) -ATOMIC_DECL_CMPXCHG(unsigned int64, int64, unsigned int32) -ATOMIC_DECL_CMPXCHG(double, double, int32) +ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType) +ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType) +ATOMIC_DECL_CMPXCHG(float, float, IntMaskType) +ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType) +ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType) +ATOMIC_DECL_CMPXCHG(double, double, IntMaskType) #undef ATOMIC_DECL_CMPXCHG diff --git a/stdlib2cpp.py b/stdlib2cpp.py index 132f8257..6fa5fc2e 100755 --- a/stdlib2cpp.py +++ b/stdlib2cpp.py @@ -2,7 +2,9 @@ import sys -print "char stdlib_code[] = { " +t=str(sys.argv[1]) + +print "char stdlib_" + t + "_code[] = { " for line in sys.stdin: for c in line: diff --git a/stmt.cpp b/stmt.cpp index e799fc0b..95142abe 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ -622,9 +622,6 @@ IfStmt::emitMaskedTrueAndFalse(FunctionEmitContext *ctx, llvm::Value *oldMask, /** Given an AST node, check to see if it's safe if we happen to run the code for that node with the execution mask all off. - - FIXME: this is actually a target-specific thing; for non SSE/AVX - targets with more complete masking support, some of this won't apply... */ static bool lCheckAllOffSafety(ASTNode *node, void *data) { @@ -648,6 +645,11 @@ lCheckAllOffSafety(ASTNode *node, void *data) { return false; } + if (g->target.allOffMaskIsSafe == true) + // Don't worry about memory accesses if we have a target that can + // safely run them with the mask all off + return true; + IndexExpr *ie; if ((ie = dynamic_cast(node)) != NULL && ie->baseExpr != NULL) { const Type *type = ie->baseExpr->GetType();
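[Editor's note — hedged sketch, not the literal function. The lCheckAllOffSafety() early-out above keys off the new allOffMaskIsSafe flag: on the generic targets it is safe to let masked memory operations execute with every lane off, so the walker can stop vetting index and pointer expressions; on SSE/AVX the conservative checks still apply, since (per the Target comment earlier in this patch) tricks like the fast gather dereference the base pointer unconditionally. The helper below condenses that decision; safeToRunWithMaskAllOff and canFaultWithAllLanesOff are illustrative names standing in for the real per-node checks.]

// Condensed view of the all-off-mask safety decision after this patch.
static bool safeToRunWithMaskAllOff(bool allOffMaskIsSafe,
                                    bool canFaultWithAllLanesOff) {
    if (allOffMaskIsSafe)
        return true;                  // generic targets: masking honored natively
    return !canFaultWithAllLanesOff;  // SSE/AVX: stay conservative
}

[End of editor's note.]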