diff --git a/builtins.cpp b/builtins.cpp
index d72bb371..d65c7308 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -151,6 +151,27 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
     const llvm::FunctionType *ftype = func->getFunctionType();
     std::string name = func->getName();
 
+    // An unfortunate hack: we want this builtin function to have the
+    // signature "int __sext_varying_bool(bool)", but the ispc function
+    // symbol creation code below assumes that any LLVM vector of i32s is a
+    // varying int32.  Here, the i32-vector parameter must instead be
+    // treated as a varying bool, so use a one-off override for this function.
+    if (name == "__sext_varying_bool") {
+        const Type *returnType = AtomicType::VaryingInt32;
+        std::vector<const Type *> argTypes;
+        argTypes.push_back(AtomicType::VaryingBool);
+        std::vector<ConstExpr *> defaults;
+        defaults.push_back(NULL);
+
+        FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
+        funcType->SetArgumentDefaults(defaults);
+
+        Symbol *sym = new Symbol(name, noPos, funcType);
+        sym->function = func;
+        symbolTable->AddFunction(sym);
+        return true;
+    }
+
     // If the function has any parameters with integer types, we'll make
     // two Symbols for two overloaded versions of the function, one with
     // all of the integer types treated as signed integers and one with all
diff --git a/docs/ispc.txt b/docs/ispc.txt
index 26d582e3..bee91ce6 100644
--- a/docs/ispc.txt
+++ b/docs/ispc.txt
@@ -509,11 +509,9 @@ is provided in parenthesis around the expression:
     double foo = 1. / 3.;
     int bar = (float)bar + (float)bar;  // 32-bit float addition
 
-Note: if a ``bool`` is converted to an integer numeric type (``int``,
-``int64``, etc.), then the conversion is done with sign extension, not zero
-extension.  Thus, the resulting value has all bits set if the ``bool`` is
-``true``; for example, ``0xffffffff`` for ``int32``.  This differs from C
-and C++, where a ``true`` bool is converted to the integer value one.
+If a ``bool`` is converted to an integer numeric type (``int``, ``int64``,
+etc.), then the result is one if the ``bool`` is ``true`` and zero
+otherwise, matching the behavior of C and C++.
 
 Variables can be declared with the ``const`` qualifier, which prohibits
 their modification.
@@ -1895,6 +1893,16 @@ code.
 Low-Level Bits
 --------------
 
+Sometimes it's useful to convert a ``bool`` value to an integer using sign
+extension so that the integer's bits are all on if the ``bool`` has the
+value ``true`` (rather than just having the value one).  The
+``sign_extend()`` functions provide this functionality:
+
+::
+
+    int sign_extend(bool value) 
+    uniform int sign_extend(uniform bool value) 
+
 ``ispc`` provides a number of bit/memory-level utility routines in its
 standard library as well.  It has routines that load from and store
 to 8-bit and 16-bit integer values stored in memory, converting to and from
@@ -1964,7 +1972,6 @@ It, it clears the high order bit, to ensure that the given floating-point
 value is positive.  This compiles down to a single ``andps`` instruction
 when used with an Intel® SSE target, for example.
 
-
 Interoperability with the Application
 =====================================
 
diff --git a/expr.cpp b/expr.cpp
index 0ce3fc16..6b20b7e3 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -3673,14 +3673,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
             if (fromType->IsVaryingType() && 
                 LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
                 exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
-            // FIXME: we're currently doing sign extension rather than zero
-            // extension here, which means that ints will have the value
-            // 0xffffffff for 'true' bools (versus the value 1).  There is
-            // some code in stdlib.ispc that depends on bool->int conversions
-            // having this behavior that needs to be cleaned up (e.g. to
-            // call a __sext() builtin to do bool->int conversions) if we
-            // are going to fix this here.
-            cast = ctx->SExtInst(exprVal, targetType, "bool2int");
+            cast = ctx->ZExtInst(exprVal, targetType, "bool2int");
             break;
         case AtomicType::TYPE_INT32:
         case AtomicType::TYPE_UINT32:
@@ -3712,9 +3705,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
             if (fromType->IsVaryingType() && 
                 LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
                 exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
-            // FIXME: See comments above w.r.t. fixing this to be a
-            // ZExtInst rather than an SExtInst...
-            cast = ctx->SExtInst(exprVal, targetType, "bool2uint");
+            cast = ctx->ZExtInst(exprVal, targetType, "bool2uint");
             break;
         case AtomicType::TYPE_INT32:
         case AtomicType::TYPE_UINT32:
@@ -3752,7 +3743,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
             if (fromType->IsVaryingType() &&
                 LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
                 exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
-            cast = ctx->SExtInst(exprVal, targetType, "bool2int64");
+            cast = ctx->ZExtInst(exprVal, targetType, "bool2int64");
             break;
         case AtomicType::TYPE_INT32:
             cast = ctx->SExtInst(exprVal, targetType, "int32_to_int64");
@@ -3786,7 +3777,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
             if (fromType->IsVaryingType() && 
                 LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
                 exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
-            cast = ctx->SExtInst(exprVal, targetType, "bool2uint");
+            cast = ctx->ZExtInst(exprVal, targetType, "bool2uint");
             break;
         case AtomicType::TYPE_INT32:
             cast = ctx->SExtInst(exprVal, targetType, "int32_to_uint64");
diff --git a/stdlib.ispc b/stdlib.ispc
index ec94c4c8..5baaa2f1 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -197,18 +197,25 @@ static inline unsigned int64 insert(unsigned int64 x, uniform int i,
     return __insert_int64(x, (unsigned int)i, v);
 }
 
+static inline uniform int32 sign_extend(uniform bool v) {
+    return __sext_uniform_bool(v);  // true -> all bits set (sext), unlike (int)v which gives 1
+}
+
+static inline int32 sign_extend(bool v) {
+    return __sext_varying_bool(v);  // varying counterpart of the uniform overload above
+}
 
 static inline uniform bool any(bool v) {
     // We only care about whether "any" is true for the active program instances,
     // so we have to make v with the current program mask.
-    return __movmsk(v & __mask) != 0;
+    return __movmsk(__sext_varying_bool(v) & __mask) != 0;
 }
 
 static inline uniform bool all(bool v) {
     // As with any(), we need to explicitly mask v with the current program mask
     // so we're only looking at the current lanes
-    bool match = ((v & __mask) == __mask);
-    return __movmsk((int)match) == (1 << programCount) - 1;
+    int32 match = __sext_varying_bool((__sext_varying_bool(v) & __mask) == __mask);
+    return __movmsk(match) == (1 << programCount) - 1;
 }
 
 static inline uniform int popcnt(uniform int v) {
@@ -235,7 +242,7 @@ static inline int popcnt(int64 v) {
 
 static inline uniform int popcnt(bool v) {
     // As with any() and all(), only count across the active lanes
-    return __popcnt_int32(__movmsk(v & __mask));
+    return __popcnt_int32(__movmsk(__sext_varying_bool(v) & __mask));
 }
 
 static inline uniform int lanemask() {
@@ -271,7 +278,7 @@ static inline uniform float reduce_max(float v) {
 
 static inline uniform int reduce_add(int x) {
     // Zero out the values for lanes that aren't running
-    return __reduce_add_int32(x & (int)__mask);
+    return __reduce_add_int32(x & __mask);
 }
 
 static inline uniform int reduce_min(int v) {
@@ -291,7 +298,7 @@ static inline uniform int reduce_max(int v) {
 static inline uniform unsigned int reduce_add(unsigned int x) {
     // Set values for non-running lanes to zero so they don't affect the
     // result.
-    return __reduce_add_uint32(x & (int)__mask);
+    return __reduce_add_uint32(x & __mask);
 }
 
 static inline uniform unsigned int reduce_min(unsigned int v) {
@@ -329,7 +336,7 @@ static inline uniform double reduce_max(double v) {
 
 static inline uniform int64 reduce_add(int64 x) {
     // Zero out the values for lanes that aren't running
-    return __reduce_add_int64(x & (int64)__mask);
+    return __reduce_add_int64(x & (int64)(__mask));
 }
 
 static inline uniform int64 reduce_min(int64 v) {
@@ -349,7 +356,7 @@ static inline uniform int64 reduce_max(int64 v) {
 static inline uniform unsigned int64 reduce_add(unsigned int64 x) {
     // Set values for non-running lanes to zero so they don't affect the
     // result.
-    return __reduce_add_int64(x & (int64)__mask);
+    return __reduce_add_int64(x & (int64)(__mask));
 }
 
 static inline uniform unsigned int64 reduce_min(unsigned int64 v) {
diff --git a/stdlib.m4 b/stdlib.m4
index 797aeb51..49184d85 100644
--- a/stdlib.m4
+++ b/stdlib.m4
@@ -644,6 +644,17 @@ define internal float @__undef_uniform() nounwind readnone alwaysinline {
   ret float undef
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sign extension
+;; Sign-extend a uniform i1 bool to i32: true becomes all ones and false becomes 0.
+define internal i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline {
+  %r = sext i1 %0 to i32
+  ret i32 %r
+}
+;; Varying bools arrive as i32 vectors and are returned as is; this presumes true lanes already hold all ones.
+define internal <$1 x i32> @__sext_varying_bool(<$1 x i32>) nounwind readnone alwaysinline {
+  ret <$1 x i32> %0
+}
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; stdlib transcendentals
diff --git a/tests/movmsk-opt.ispc b/tests/movmsk-opt.ispc
index 70ce5211..6a360dff 100644
--- a/tests/movmsk-opt.ispc
+++ b/tests/movmsk-opt.ispc
@@ -20,7 +20,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
 
     float v = float4(1,1,0,0);
     bool b = (v == 1.);
-    ret = __movmsk(((int)b));
+    ret = __movmsk((sign_extend(b)));
     RET[programIndex] = ret;
 }
 
diff --git a/tests/sign-extend-1.ispc b/tests/sign-extend-1.ispc
new file mode 100644
index 00000000..07e082c7
--- /dev/null
+++ b/tests/sign-extend-1.ispc
@@ -0,0 +1,15 @@
+// Checks that sign_extend() of a varying bool yields all bits set (0xffffffff) for true.
+export uniform int width() { return programCount; }
+
+// Writes 16 for program instances where sign_extend(a == 1) equals 0xffffffff, 2 otherwise.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    bool b = (a == 1.);
+    int32 s = sign_extend(b);
+    RET[programIndex] = (s == 0xffffffff) ? 16 : 2;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 2;
+    RET[0] = 16;  // presumably only aFOO[0] equals 1 in the test harness -- confirm
+}
diff --git a/tests/sign-extend.ispc b/tests/sign-extend.ispc
new file mode 100644
index 00000000..05521e4f
--- /dev/null
+++ b/tests/sign-extend.ispc
@@ -0,0 +1,14 @@
+// Checks that a plain bool-to-int cast yields 1 for true and 0 for false.
+export uniform int width() { return programCount; }
+
+// Stores (int)(a == 1) for each program instance's input value a.
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    bool b = (a == 1.);
+    RET[programIndex] = (int)b;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;  // presumably only aFOO[0] equals 1 in the test harness -- confirm
+}