diff --git a/Makefile b/Makefile
index ca55a734..01746fa4 100644
--- a/Makefile
+++ b/Makefile
@@ -85,7 +85,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
opt.h stmt.h sym.h type.h util.h
TARGETS=avx1 avx1-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 generic-4 generic-8 \
- generic-16 generic-1
+ generic-16 generic-32 generic-1
BUILTINS_SRC=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) \
builtins/dispatch.ll
BUILTINS_OBJS=$(addprefix builtins-, $(notdir $(BUILTINS_SRC:.ll=.o))) \
diff --git a/builtins.cpp b/builtins.cpp
index 1682db9a..b94fa04f 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -847,6 +847,13 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
builtins_bitcode_generic_16_length,
module, symbolTable);
break;
+ case 32:
+ extern unsigned char builtins_bitcode_generic_32[];
+ extern int builtins_bitcode_generic_32_length;
+ AddBitcodeToModule(builtins_bitcode_generic_32,
+ builtins_bitcode_generic_32_length,
+ module, symbolTable);
+ break;
case 1:
extern unsigned char builtins_bitcode_generic_1[];
extern int builtins_bitcode_generic_1_length;
diff --git a/builtins/target-generic-32.ll b/builtins/target-generic-32.ll
new file mode 100644
index 00000000..5f89bcdf
--- /dev/null
+++ b/builtins/target-generic-32.ll
@@ -0,0 +1,33 @@
+;; Copyright (c) 2010-2012, Intel Corporation
+;; All rights reserved.
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are
+;; met:
+;;
+;; * Redistributions of source code must retain the above copyright
+;; notice, this list of conditions and the following disclaimer.
+;;
+;; * Redistributions in binary form must reproduce the above copyright
+;; notice, this list of conditions and the following disclaimer in the
+;; documentation and/or other materials provided with the distribution.
+;;
+;; * Neither the name of Intel Corporation nor the names of its
+;; contributors may be used to endorse or promote products derived from
+;; this software without specific prior written permission.
+;;
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+define(`WIDTH',`32')
+include(`target-generic-common.ll')
diff --git a/ispc.cpp b/ispc.cpp
index bd832825..3a2134d1 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -257,6 +257,14 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->allOffMaskIsSafe = true;
t->maskBitCount = 1;
}
+ else if (!strcasecmp(isa, "generic-32")) {
+ t->isa = Target::GENERIC;
+ t->nativeVectorWidth = 32;
+ t->vectorWidth = 32;
+ t->maskingIsFree = true;
+ t->allOffMaskIsSafe = true;
+ t->maskBitCount = 1;
+ }
else if (!strcasecmp(isa, "generic-1")) {
t->isa = Target::GENERIC;
t->nativeVectorWidth = 1;
@@ -313,6 +321,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
llvm::TargetMachine *targetMachine = t->GetTargetMachine();
const llvm::TargetData *targetData = targetMachine->getTargetData();
t->is32Bit = (targetData->getPointerSize() == 4);
+ Assert(t->vectorWidth <= ISPC_MAX_NVEC);
}
return !error;
@@ -344,7 +353,7 @@ Target::SupportedTargetISAs() {
#ifndef LLVM_3_0
", avx2, avx2-x2"
#endif // !LLVM_3_0
- ", generic-4, generic-8, generic-16, generic-1";
+ ", generic-1, generic-4, generic-8, generic-16, generic-32";
}
diff --git a/ispc.h b/ispc.h
index d0d0c3f7..bb551a6d 100644
--- a/ispc.h
+++ b/ispc.h
@@ -71,7 +71,7 @@
/** @def ISPC_MAX_NVEC maximum vector size of any of the compliation
targets.
*/
-#define ISPC_MAX_NVEC 16
+#define ISPC_MAX_NVEC 32
// Forward declarations of a number of widely-used LLVM types
namespace llvm {
diff --git a/ispc.vcxproj b/ispc.vcxproj
index 6971ce9a..34ef9373 100755
--- a/ispc.vcxproj
+++ b/ispc.vcxproj
@@ -29,6 +29,7 @@
+
@@ -264,6 +265,19 @@
Building gen-bitcode-generic-16.cpp
+
+
+ Document
+ m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll > gen-bitcode-generic-32.cpp
+ gen-bitcode-generic-32.cpp
+ builtins\util.m4;builtins\target-generic-common.ll
+ m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-generic-32.ll | python bitcode2cpp.py builtins\target-generic-32.ll > gen-bitcode-generic-32.cpp
+ gen-bitcode-generic-32.cpp
+ builtins\util.m4;builtins\target-generic-common.ll
+ Building gen-bitcode-generic-32.cpp
+ Building gen-bitcode-generic-32.cpp
+
+
Document
diff --git a/opt.cpp b/opt.cpp
index 34cdab0f..063be681 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -265,6 +265,124 @@ lGEPInst(llvm::Value *ptr, llvm::Value *offset, const char *name,
}
+/** Given a vector of constant values (int, float, or bool) representing an
+ execution mask, convert it to a bitvector where the 0th bit corresponds
+ to the first vector value and so forth.
+*/
+static uint32_t
+lConstElementsToMask(const llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> &elements) {
+ Assert(elements.size() <= 32);
+
+ uint32_t mask = 0;
+ for (unsigned int i = 0; i < elements.size(); ++i) {
+ llvm::APInt intMaskValue;
+ // SSE has the "interesting" approach of encoding blending
+ // masks as <n x float>.
+ llvm::ConstantFP *cf = llvm::dyn_cast<llvm::ConstantFP>(elements[i]);
+ if (cf != NULL) {
+ llvm::APFloat apf = cf->getValueAPF();
+ intMaskValue = apf.bitcastToAPInt();
+ }
+ else {
+ // Otherwise get it as an int
+ llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(elements[i]);
+ Assert(ci != NULL); // vs return -1 if NULL?
+ intMaskValue = ci->getValue();
+ }
+ // Is the high-bit set? If so, OR in the appropriate bit in
+ // the result mask
+ if (intMaskValue.countLeadingOnes() > 0)
+ mask |= (1u << i);
+ }
+ return mask;
+}
+
+
+/** Given an llvm::Value representing a vector mask, see if the value is a
+ constant. If so, return true and set *bits to be the integer mask
+ found by taking the high bits of the mask values in turn and
+ concatenating them into a single integer. In other words, given the
+ 4-wide mask: < 0xffffffff, 0, 0, 0xffffffff >, we have 0b1001 = 9.
+ */
+static bool
+lGetMask(llvm::Value *factor, uint32_t *mask) {
+#ifndef LLVM_3_0
+ llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(factor);
+ if (cdv != NULL) {
+ llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
+ for (int i = 0; i < (int)cdv->getNumElements(); ++i)
+ elements.push_back(cdv->getElementAsConstant(i));
+ *mask = lConstElementsToMask(elements);
+ return true;
+ }
+#endif
+
+ llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(factor);
+ if (cv != NULL) {
+ llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
+#ifndef LLVM_3_0
+ for (int i = 0; i < (int)cv->getNumOperands(); ++i) {
+ llvm::Constant *c =
+ llvm::dyn_cast<llvm::Constant>(cv->getOperand(i));
+ if (c == NULL)
+ return false;
+ elements.push_back(c);
+ }
+#else
+ cv->getVectorElements(elements);
+#endif
+ *mask = lConstElementsToMask(elements);
+ return true;
+ }
+ else if (llvm::isa<llvm::ConstantAggregateZero>(factor)) {
+ *mask = 0;
+ return true;
+ }
+ else {
+#if 0
+ llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(factor);
+ if (ce != NULL) {
+ llvm::TargetMachine *targetMachine = g->target.GetTargetMachine();
+ const llvm::TargetData *td = targetMachine->getTargetData();
+ llvm::Constant *c = llvm::ConstantFoldConstantExpression(ce, td);
+ c->dump();
+ factor = c;
+ }
+ // else we should be able to handle it above...
+ Assert(!llvm::isa<llvm::ConstantExpr>(factor));
+#endif
+ return false;
+ }
+}
+
+
+enum MaskStatus { ALL_ON, ALL_OFF, MIXED, UNKNOWN };
+
+/** Determines if the given mask value is all on, all off, mixed, or
+ unknown at compile time.
+*/
+static MaskStatus
+lGetMaskStatus(llvm::Value *mask, int vecWidth = -1) {
+ uint32_t bits;
+ if (lGetMask(mask, &bits) == false)
+ return UNKNOWN;
+
+ if (bits == 0)
+ return ALL_OFF;
+
+ if (vecWidth == -1)
+ vecWidth = g->target.vectorWidth;
+ Assert(vecWidth <= 32);
+
+ for (int i = 0; i < vecWidth; ++i) {
+ if ((bits & (1ull << i)) == 0)
+ return MIXED;
+ }
+ return ALL_ON;
+}
+
+
///////////////////////////////////////////////////////////////////////////
void
@@ -559,12 +677,12 @@ private:
instruction for this optimization pass.
*/
struct BlendInstruction {
- BlendInstruction(llvm::Function *f, int ao, int o0, int o1, int of)
+ BlendInstruction(llvm::Function *f, uint32_t ao, int o0, int o1, int of)
: function(f), allOnMask(ao), op0(o0), op1(o1), opFactor(of) { }
/** Function pointer for the blend instruction */
llvm::Function *function;
/** Mask value for an "all on" mask for this instruction */
- int allOnMask;
+ uint32_t allOnMask;
/** The operand number in the llvm CallInst corresponds to the
first operand to blend with. */
int op0;
@@ -609,99 +727,6 @@ IntrinsicsOpt::IntrinsicsOpt()
}
-/** Given a vector of constant values (int, float, or bool) representing an
- execution mask, convert it to a bitvector where the 0th bit corresponds
- to the first vector value and so forth.
-*/
-static int
-lConstElementsToMask(const llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> &elements) {
- Assert(elements.size() <= 32);
-
- int mask = 0;
- for (unsigned int i = 0; i < elements.size(); ++i) {
- llvm::APInt intMaskValue;
- // SSE has the "interesting" approach of encoding blending
- // masks as <n x float>.
- llvm::ConstantFP *cf = llvm::dyn_cast<llvm::ConstantFP>(elements[i]);
- if (cf != NULL) {
- llvm::APFloat apf = cf->getValueAPF();
- intMaskValue = apf.bitcastToAPInt();
- }
- else {
- // Otherwise get it as an int
- llvm::ConstantInt *ci = llvm::dyn_cast<llvm::ConstantInt>(elements[i]);
- Assert(ci != NULL); // vs return -1 if NULL?
- intMaskValue = ci->getValue();
- }
- // Is the high-bit set? If so, OR in the appropriate bit in
- // the result mask
- if (intMaskValue.countLeadingOnes() > 0)
- mask |= (1 << i);
- }
- return mask;
-}
-
-
-/** Given an llvm::Value represinting a vector mask, see if the value is a
- constant. If so, return the integer mask found by taking the high bits
- of the mask values in turn and concatenating them into a single integer.
- In other words, given the 4-wide mask: < 0xffffffff, 0, 0, 0xffffffff >,
- we have 0b1001 = 9.
- */
-static int
-lGetMask(llvm::Value *factor) {
- /* FIXME: This will break if we ever do 32-wide compilation, in which case
- it don't be possible to distinguish between -1 for "don't know" and
- "known and all bits on". */
- Assert(g->target.vectorWidth < 32);
-
-#ifndef LLVM_3_0
- llvm::ConstantDataVector *cdv = llvm::dyn_cast<llvm::ConstantDataVector>(factor);
- if (cdv != NULL) {
- llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
- for (int i = 0; i < (int)cdv->getNumElements(); ++i)
- elements.push_back(cdv->getElementAsConstant(i));
- return lConstElementsToMask(elements);
- }
-#endif
-
- llvm::ConstantVector *cv = llvm::dyn_cast<llvm::ConstantVector>(factor);
- if (cv != NULL) {
- llvm::SmallVector<llvm::Constant *, ISPC_MAX_NVEC> elements;
-#ifndef LLVM_3_0
- for (int i = 0; i < (int)cv->getNumOperands(); ++i) {
- llvm::Constant *c =
- llvm::dyn_cast<llvm::Constant>(cv->getOperand(i));
- if (c == NULL)
- return NULL;
- elements.push_back(c);
- }
-#else
- cv->getVectorElements(elements);
-#endif
- return lConstElementsToMask(elements);
- }
- else if (llvm::isa<llvm::ConstantAggregateZero>(factor))
- return 0;
- else {
-#if 0
- llvm::ConstantExpr *ce = llvm::dyn_cast<llvm::ConstantExpr>(factor);
- if (ce != NULL) {
- llvm::TargetMachine *targetMachine = g->target.GetTargetMachine();
- const llvm::TargetData *td = targetMachine->getTargetData();
- llvm::Constant *c = llvm::ConstantFoldConstantExpression(ce, td);
- c->dump();
- factor = c;
- }
- // else we should be able to handle it above...
- Assert(!llvm::isa<llvm::ConstantExpr>(factor));
-#endif
- return -1;
- }
-}
-
-
/** Given an llvm::Value, return true if we can determine that it's an
undefined value. This only makes a weak attempt at chasing this down,
only detecting flat-out undef values, and bitcasts of undef values.
@@ -779,26 +804,28 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
goto restart;
}
- int mask = lGetMask(factor);
- llvm::Value *value = NULL;
- if (mask == 0)
- // Mask all off -> replace with the first blend value
- value = v[0];
- else if (mask == blend->allOnMask)
- // Mask all on -> replace with the second blend value
- value = v[1];
+ uint32_t mask;
+ if (lGetMask(factor, &mask) == true) {
+ llvm::Value *value = NULL;
+ if (mask == 0)
+ // Mask all off -> replace with the first blend value
+ value = v[0];
+ else if (mask == blend->allOnMask)
+ // Mask all on -> replace with the second blend value
+ value = v[1];
- if (value != NULL) {
- llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
- iter, value);
- modifiedAny = true;
- goto restart;
+ if (value != NULL) {
+ llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
+ iter, value);
+ modifiedAny = true;
+ goto restart;
+ }
}
}
else if (matchesMaskInstruction(callInst->getCalledFunction())) {
llvm::Value *factor = callInst->getArgOperand(0);
- int mask = lGetMask(factor);
- if (mask != -1) {
+ uint32_t mask;
+ if (lGetMask(factor, &mask) == true) {
// If the vector-valued mask has a known value, replace it
// with the corresponding integer mask from its elements
// high bits.
@@ -812,71 +839,75 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
else if (callInst->getCalledFunction() == avxMaskedLoad32 ||
callInst->getCalledFunction() == avxMaskedLoad64) {
llvm::Value *factor = callInst->getArgOperand(1);
- int mask = lGetMask(factor);
- if (mask == 0) {
- // nothing being loaded, replace with undef value
- llvm::Type *returnType = callInst->getType();
- Assert(llvm::isa<llvm::VectorType>(returnType));
- llvm::Value *undefValue = llvm::UndefValue::get(returnType);
- llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
- iter, undefValue);
- modifiedAny = true;
- goto restart;
- }
- else if (mask == 0xff) {
- // all lanes active; replace with a regular load
- llvm::Type *returnType = callInst->getType();
- Assert(llvm::isa<llvm::VectorType>(returnType));
- // cast the i8 * to the appropriate type
- const char *name = LLVMGetName(callInst->getArgOperand(0), "_cast");
- llvm::Value *castPtr =
- new llvm::BitCastInst(callInst->getArgOperand(0),
- llvm::PointerType::get(returnType, 0),
- name, callInst);
- lCopyMetadata(castPtr, callInst);
- int align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
- name = LLVMGetName(callInst->getArgOperand(0), "_load");
- llvm::Instruction *loadInst =
- new llvm::LoadInst(castPtr, name, false /* not volatile */,
- align, (llvm::Instruction *)NULL);
- lCopyMetadata(loadInst, callInst);
- llvm::ReplaceInstWithInst(callInst, loadInst);
- modifiedAny = true;
- goto restart;
+ uint32_t mask;
+ if (lGetMask(factor, &mask) == true) {
+ if (mask == 0) {
+ // nothing being loaded, replace with undef value
+ llvm::Type *returnType = callInst->getType();
+ Assert(llvm::isa<llvm::VectorType>(returnType));
+ llvm::Value *undefValue = llvm::UndefValue::get(returnType);
+ llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
+ iter, undefValue);
+ modifiedAny = true;
+ goto restart;
+ }
+ else if (mask == 0xff) {
+ // all lanes active; replace with a regular load
+ llvm::Type *returnType = callInst->getType();
+ Assert(llvm::isa<llvm::VectorType>(returnType));
+ // cast the i8 * to the appropriate type
+ const char *name = LLVMGetName(callInst->getArgOperand(0), "_cast");
+ llvm::Value *castPtr =
+ new llvm::BitCastInst(callInst->getArgOperand(0),
+ llvm::PointerType::get(returnType, 0),
+ name, callInst);
+ lCopyMetadata(castPtr, callInst);
+ int align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
+ name = LLVMGetName(callInst->getArgOperand(0), "_load");
+ llvm::Instruction *loadInst =
+ new llvm::LoadInst(castPtr, name, false /* not volatile */,
+ align, (llvm::Instruction *)NULL);
+ lCopyMetadata(loadInst, callInst);
+ llvm::ReplaceInstWithInst(callInst, loadInst);
+ modifiedAny = true;
+ goto restart;
+ }
}
}
else if (callInst->getCalledFunction() == avxMaskedStore32 ||
callInst->getCalledFunction() == avxMaskedStore64) {
// NOTE: mask is the 2nd parameter, not the 3rd one!!
llvm::Value *factor = callInst->getArgOperand(1);
- int mask = lGetMask(factor);
- if (mask == 0) {
- // nothing actually being stored, just remove the inst
- callInst->eraseFromParent();
- modifiedAny = true;
- goto restart;
- }
- else if (mask == 0xff) {
- // all lanes storing, so replace with a regular store
- llvm::Value *rvalue = callInst->getArgOperand(2);
- llvm::Type *storeType = rvalue->getType();
- const char *name = LLVMGetName(callInst->getArgOperand(0),
- "_ptrcast");
- llvm::Value *castPtr =
- new llvm::BitCastInst(callInst->getArgOperand(0),
- llvm::PointerType::get(storeType, 0),
- name, callInst);
- lCopyMetadata(castPtr, callInst);
+ uint32_t mask;
+ if (lGetMask(factor, &mask) == true) {
+ if (mask == 0) {
+ // nothing actually being stored, just remove the inst
+ callInst->eraseFromParent();
+ modifiedAny = true;
+ goto restart;
+ }
+ else if (mask == 0xff) {
+ // all lanes storing, so replace with a regular store
+ llvm::Value *rvalue = callInst->getArgOperand(2);
+ llvm::Type *storeType = rvalue->getType();
+ const char *name = LLVMGetName(callInst->getArgOperand(0),
+ "_ptrcast");
+ llvm::Value *castPtr =
+ new llvm::BitCastInst(callInst->getArgOperand(0),
+ llvm::PointerType::get(storeType, 0),
+ name, callInst);
+ lCopyMetadata(castPtr, callInst);
- llvm::StoreInst *storeInst =
- new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
- int align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
- storeInst->setAlignment(align);
- lCopyMetadata(storeInst, callInst);
- llvm::ReplaceInstWithInst(callInst, storeInst);
+ llvm::StoreInst *storeInst =
+ new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
+ int align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
+ storeInst->setAlignment(align);
+ lCopyMetadata(storeInst, callInst);
+ llvm::ReplaceInstWithInst(callInst, storeInst);
- modifiedAny = true;
- goto restart;
+ modifiedAny = true;
+ goto restart;
+ }
}
}
}
@@ -949,13 +980,13 @@ VSelMovmskOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
llvm::SelectInst *selectInst = llvm::dyn_cast<llvm::SelectInst>(&*iter);
if (selectInst != NULL && selectInst->getType()->isVectorTy()) {
llvm::Value *factor = selectInst->getOperand(0);
- int mask = lGetMask(factor);
- int allOnMask = (1 << g->target.vectorWidth) - 1;
+
+ MaskStatus maskStatus = lGetMaskStatus(factor);
llvm::Value *value = NULL;
- if (mask == allOnMask)
+ if (maskStatus == ALL_ON)
// Mask all on -> replace with the first select value
value = selectInst->getOperand(1);
- else if (mask == 0)
+ else if (maskStatus == ALL_OFF)
// Mask all off -> replace with the second select value
value = selectInst->getOperand(2);
@@ -976,8 +1007,8 @@ VSelMovmskOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
if (calledFunc == NULL || calledFunc != m->module->getFunction("__movmsk"))
continue;
- int mask = lGetMask(callInst->getArgOperand(0));
- if (mask != -1) {
+ uint32_t mask;
+ if (lGetMask(callInst->getArgOperand(0), &mask) == true) {
#if 0
fprintf(stderr, "mask %d\n", mask);
callInst->getArgOperand(0)->dump();
@@ -1964,10 +1995,8 @@ MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
llvm::Value *rvalue = callInst->getArgOperand(1);
llvm::Value *mask = callInst->getArgOperand(2);
- int allOnMask = (1 << g->target.vectorWidth) - 1;
-
- int maskAsInt = lGetMask(mask);
- if (maskAsInt == 0) {
+ MaskStatus maskStatus = lGetMaskStatus(mask);
+ if (maskStatus == ALL_OFF) {
// Zero mask - no-op, so remove the store completely. (This
// may in turn lead to being able to optimize out instructions
// that compute the rvalue...)
@@ -1975,11 +2004,10 @@ MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
modifiedAny = true;
goto restart;
}
- else if (maskAsInt == allOnMask) {
+ else if (maskStatus == ALL_ON) {
// The mask is all on, so turn this into a regular store
llvm::Type *rvalueType = rvalue->getType();
- llvm::Type *ptrType =
- llvm::PointerType::get(rvalueType, 0);
+ llvm::Type *ptrType = llvm::PointerType::get(rvalueType, 0);
lvalue = new llvm::BitCastInst(lvalue, ptrType, "lvalue_to_ptr_type", callInst);
lCopyMetadata(lvalue, callInst);
@@ -2072,20 +2100,18 @@ MaskedLoadOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
// Got one; grab the operands
llvm::Value *ptr = callInst->getArgOperand(0);
llvm::Value *mask = callInst->getArgOperand(1);
- int allOnMask = (1 << g->target.vectorWidth) - 1;
- int maskAsInt = lGetMask(mask);
- if (maskAsInt == 0) {
+ MaskStatus maskStatus = lGetMaskStatus(mask);
+ if (maskStatus == ALL_OFF) {
// Zero mask - no-op, so replace the load with an undef value
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
iter, llvm::UndefValue::get(callInst->getType()));
modifiedAny = true;
goto restart;
}
- else if (maskAsInt == allOnMask) {
+ else if (maskStatus == ALL_ON) {
// The mask is all on, so turn this into a regular load
- llvm::Type *ptrType =
- llvm::PointerType::get(callInst->getType(), 0);
+ llvm::Type *ptrType = llvm::PointerType::get(callInst->getType(), 0);
ptr = new llvm::BitCastInst(ptr, ptrType, "ptr_cast_for_load",
callInst);
llvm::Instruction *load =
@@ -2558,18 +2584,6 @@ public:
char GatherCoalescePass::ID = 0;
-/* Returns true if the mask is known at compile time to be "all on". */
-static bool
-lIsMaskAllOn(llvm::Value *mask) {
- int m = lGetMask(mask);
- if (m == -1)
- return false;
-
- int allOnMask = (1 << g->target.vectorWidth) - 1;
- return (m == allOnMask);
-}
-
-
/** Representation of a memory load that the gather coalescing code has
decided to generate.
*/
@@ -3497,7 +3511,7 @@ GatherCoalescePass::runOnBasicBlock(llvm::BasicBlock &bb) {
// Then and only then do we have a common base pointer with all
// offsets from that constants (in which case we can potentially
// coalesce).
- if (lIsMaskAllOn(mask) == false)
+ if (lGetMaskStatus(mask) != ALL_ON)
continue;
if (!LLVMVectorValuesAllEqual(variableOffsets))
diff --git a/run_tests.py b/run_tests.py
index ce5e98f1..79465267 100755
--- a/run_tests.py
+++ b/run_tests.py
@@ -33,7 +33,7 @@ parser.add_option("-r", "--random-shuffle", dest="random", help="Randomly order
parser.add_option("-g", "--generics-include", dest="include_file", help="Filename for header implementing functions for generics",
default=None)
parser.add_option('-t', '--target', dest='target',
- help='Set compilation target (sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2, generic-4, generic-8, generic-16)',
+ help='Set compilation target (sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)',
default="sse4")
parser.add_option('-a', '--arch', dest='arch',
help='Set architecture (x86, x86-64)',
@@ -69,6 +69,9 @@ if is_generic_target and options.include_file == None:
elif options.target == "generic-16":
sys.stderr.write("No generics #include specified; using examples/intrinsics/generic-16.h\n")
options.include_file = "examples/intrinsics/generic-16.h"
+ elif options.target == "generic-32":
+ sys.stderr.write("No generics #include specified and no default available for \"generic-32\" target.\n")
+ sys.exit(1)
if options.compiler_exe == None:
if is_windows: