Add support for int8/int16 types. Addresses issues #9 and #42.

2011-07-21 06:57:40 +01:00
parent 2d573acd17
commit bba7211654
64 changed files with 2317 additions and 885 deletions
--- a/2
+++ b/2
@@ -15,7 +15,7 @@ LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
 LLVM_VERSION_DEF=-DLLVM_$(shell llvm-config --version | sed s/\\./_/)
 BUILD_DATE=$(shell date +%Y%m%d)
-BUILD_VERSION=$(shell git log | head -1)
+BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)
 CXX=g++
 CPP=cpp
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -78,8 +78,14 @@ static const Type *
 lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
    if (t == LLVMTypes::VoidType)
        return AtomicType::Void;
    // uniform
    else if (t == LLVMTypes::BoolType)
        return AtomicType::UniformBool;
    else if (t == LLVMTypes::Int8Type)
        return intAsUnsigned ? AtomicType::UniformUInt8 : AtomicType::UniformInt8;
    else if (t == LLVMTypes::Int16Type)
        return intAsUnsigned ? AtomicType::UniformUInt16 : AtomicType::UniformInt16;
    else if (t == LLVMTypes::Int32Type)
        return intAsUnsigned ? AtomicType::UniformUInt32 : AtomicType::UniformInt32;
    else if (t == LLVMTypes::FloatType)
@@ -88,6 +94,12 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
        return AtomicType::UniformDouble;
    else if (t == LLVMTypes::Int64Type)
        return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64;
    // varying
    else if (t == LLVMTypes::Int8VectorType)
        return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8;
    else if (t == LLVMTypes::Int16VectorType)
        return intAsUnsigned ? AtomicType::VaryingUInt16 : AtomicType::VaryingInt16;
    else if (t == LLVMTypes::Int32VectorType)
        return intAsUnsigned ? AtomicType::VaryingUInt32 : AtomicType::VaryingInt32;
    else if (t == LLVMTypes::FloatVectorType)
@@ -96,6 +108,14 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
        return AtomicType::VaryingDouble;
    else if (t == LLVMTypes::Int64VectorType)
        return intAsUnsigned ? AtomicType::VaryingUInt64 : AtomicType::VaryingInt64;
    // pointers to uniform
    else if (t == LLVMTypes::Int8PointerType)
        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt8 :
                                                 AtomicType::UniformInt8, false);
    else if (t == LLVMTypes::Int16PointerType)
        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt16 :
                                                 AtomicType::UniformInt16, false);
    else if (t == LLVMTypes::Int32PointerType)
        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt32 :
                                                 AtomicType::UniformInt32, false);
@@ -106,6 +126,14 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
        return new ReferenceType(AtomicType::UniformFloat, false);
    else if (t == LLVMTypes::DoublePointerType)
        return new ReferenceType(AtomicType::UniformDouble, false);
    // pointers to varying
    else if (t == LLVMTypes::Int8VectorPointerType)
        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt8 :
                                                 AtomicType::VaryingInt8, false);
    else if (t == LLVMTypes::Int16VectorPointerType)
        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt16 :
                                                 AtomicType::VaryingInt16, false);
    else if (t == LLVMTypes::Int32VectorPointerType)
        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt32 :
                                                 AtomicType::VaryingInt32, false);
@@ -116,6 +144,8 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
        return new ReferenceType(AtomicType::VaryingFloat, false);
    else if (t == LLVMTypes::DoubleVectorPointerType)
        return new ReferenceType(AtomicType::VaryingDouble, false);
    // arrays
    else if (llvm::isa<const llvm::PointerType>(t)) {
        const llvm::PointerType *pt = llvm::dyn_cast<const llvm::PointerType>(t);
@@ -239,10 +269,49 @@ lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) {
    }
 }
 /** Declare a single '__pseudo_gather_*' placeholder function called
    'name' in 'module'.  The declared function takes two parameters, of
    types LLVMTypes::VoidPointerVectorType and LLVMTypes::MaskType, and
    returns a value of type 'vecType'.  It is created with external
    linkage and flagged as only reading memory and as not throwing.
    Only the declaration is added here; no body is emitted.
  */
 static void
 lDeclarePG(llvm::Module *module, LLVM_TYPE_CONST llvm::Type *vecType,
            const char *name) {
    // Parameter list: (pointers, mask).
    std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
    argTypes.push_back(LLVMTypes::VoidPointerVectorType);
    argTypes.push_back(LLVMTypes::MaskType);
    llvm::FunctionType *fType = llvm::FunctionType::get(vecType, argTypes, false);
    llvm::Function *func =
        llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
                               name, module);
    func->setOnlyReadsMemory(true);
    func->setDoesNotThrow(true);
 }
 /** Declare a single '__pseudo_gather_base_offsets_*' placeholder function
    called 'name' in 'module'.  Its parameters have the types
    LLVMTypes::VoidPointerType, LLVMTypes::Int32VectorType and
    LLVMTypes::MaskType, in that order, and it returns 'vecType'.  The
    declaration gets external linkage and is flagged as only reading
    memory and as not throwing.
  */
 static void
 lDeclarePGBO(llvm::Module *module, LLVM_TYPE_CONST llvm::Type *vecType,
              const char *name) {
    // Parameter list: (base pointer, int32 offsets, mask).
    std::vector<LLVM_TYPE_CONST llvm::Type *> paramTypes;
    paramTypes.push_back(LLVMTypes::VoidPointerType);
    paramTypes.push_back(LLVMTypes::Int32VectorType);
    paramTypes.push_back(LLVMTypes::MaskType);
    llvm::FunctionType *signature =
        llvm::FunctionType::get(vecType, paramTypes, false);
    llvm::Function *decl = llvm::Function::Create(
        signature, llvm::GlobalValue::ExternalLinkage, name, module);
    decl->setOnlyReadsMemory(true);
    decl->setDoesNotThrow(true);
 }
 /** Declare the 'pseudo-gather' functions.  When the ispc front-end needs
    to perform a gather, it generates a call to one of these functions,
    which have signatures:
    varying int8  __pseudo_gather(varying int8 *, mask)
    varying int16 __pseudo_gather(varying int16 *, mask)
    varying int32 __pseudo_gather(varying int32 *, mask)
    varying int64 __pseudo_gather(varying int64 *, mask)
@@ -253,6 +322,10 @@ lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) {
    front-end to be relatively simple in how it emits address calculation
    for gathers.
    varying int8  __pseudo_gather_base_offsets_8(uniform int8 *base, 
                                                 int32 offsets, mask)
    varying int16 __pseudo_gather_base_offsets_16(uniform int16 *base, 
                                                  int32 offsets, mask)
    varying int32 __pseudo_gather_base_offsets_32(uniform int32 *base, 
                                                  int32 offsets, mask)
    varying int64 __pseudo_gather_base_offsets_64(uniform int64 *base, 
@@ -264,49 +337,54 @@ lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) {
 */
 static void
 lDeclarePseudoGathers(llvm::Module *module) {
-    SourcePos noPos;
+    lDeclarePG(module, LLVMTypes::Int8VectorType, "__pseudo_gather_8");
-    noPos.name = "__stdlib";
+    lDeclarePG(module, LLVMTypes::Int16VectorType, "__pseudo_gather_16");
    lDeclarePG(module, LLVMTypes::Int32VectorType, "__pseudo_gather_32");
    lDeclarePG(module, LLVMTypes::Int64VectorType, "__pseudo_gather_64");
-    {
+    lDeclarePGBO(module, LLVMTypes::Int8VectorType,
-        std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
+                 "__pseudo_gather_base_offsets_8");
-        argTypes.push_back(LLVMTypes::VoidPointerVectorType);
+    lDeclarePGBO(module, LLVMTypes::Int16VectorType,
-        argTypes.push_back(LLVMTypes::MaskType);
+                 "__pseudo_gather_base_offsets_16");
    lDeclarePGBO(module, LLVMTypes::Int32VectorType,
                 "__pseudo_gather_base_offsets_32");
    lDeclarePGBO(module, LLVMTypes::Int64VectorType,
                 "__pseudo_gather_base_offsets_64");
 }
        llvm::FunctionType *fType = 
            llvm::FunctionType::get(LLVMTypes::Int32VectorType, argTypes, false);
        llvm::Function *func =
            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
                                   "__pseudo_gather_32", module);
        func->setOnlyReadsMemory(true);
        func->setDoesNotThrow(true);
-        fType = llvm::FunctionType::get(LLVMTypes::Int64VectorType, argTypes, false);
+static void
-        func = llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
+lDeclarePS(llvm::Module *module, LLVM_TYPE_CONST llvm::Type *vecType,
-                                      "__pseudo_gather_64", module);
+           const char *name) {
-        func->setOnlyReadsMemory(true);
+    std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
-        func->setDoesNotThrow(true);
+    argTypes.push_back(LLVMTypes::VoidPointerVectorType);
-    }
+    argTypes.push_back(vecType);
    argTypes.push_back(LLVMTypes::MaskType);
-    {
+    llvm::FunctionType *fType = 
-        std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
+        llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
-        argTypes.push_back(LLVMTypes::VoidPointerType);
+    llvm::Function *func =
-        argTypes.push_back(LLVMTypes::Int32VectorType);
+        llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-        argTypes.push_back(LLVMTypes::MaskType);
+                               name, module);
    func->setDoesNotThrow(true);
 }
        llvm::FunctionType *fType = 
            llvm::FunctionType::get(LLVMTypes::Int32VectorType, argTypes, false);
        llvm::Function *func =
            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
                                   "__pseudo_gather_base_offsets_32", module);
        func->setOnlyReadsMemory(true);
        func->setDoesNotThrow(true);
-        fType = llvm::FunctionType::get(LLVMTypes::Int64VectorType, argTypes, false);
+static void
-        func = llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
+lDeclarePSBO(llvm::Module *module, LLVM_TYPE_CONST llvm::Type *vecType, 
-                                      "__pseudo_gather_base_offsets_64", module);
+             const char *name) {
-        func->setOnlyReadsMemory(true);
+    std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
-        func->setDoesNotThrow(true);
+    argTypes.push_back(LLVMTypes::VoidPointerType);
-    }
+    argTypes.push_back(LLVMTypes::Int32VectorType);
    argTypes.push_back(vecType);
    argTypes.push_back(LLVMTypes::MaskType);
    llvm::FunctionType *fType = 
        llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
    llvm::Function *func =
        llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
                               name, module);
    func->setDoesNotThrow(true);
 }
@@ -314,16 +392,22 @@ lDeclarePseudoGathers(llvm::Module *module) {
    we also declare (but never define) pseudo-scatter instructions with
    signatures:
    void __pseudo_scatter_8 (varying int8 *, varying int8 values, mask)
    void __pseudo_scatter_16(varying int16 *, varying int16 values, mask)
    void __pseudo_scatter_32(varying int32 *, varying int32 values, mask)
    void __pseudo_scatter_64(varying int64 *, varying int64 values, mask)
    The GatherScatterFlattenOpt optimization pass also finds these and
    transforms them to scatters like:
    void __pseudo_scatter_base_offsets_8(uniform int8 *base, 
                    varying int32 offsets, varying int8 values, mask)
    void __pseudo_scatter_base_offsets_16(uniform int16 *base, 
                    varying int32 offsets, varying int16 values, mask)
    void __pseudo_scatter_base_offsets_32(uniform int32 *base, 
                    varying int32 offsets, varying int32 values, mask)
    void __pseudo_scatter_base_offsets_64(uniform int64 *base, 
-                    varying int62 offsets, varying int64 values, mask)
+                    varying int32 offsets, varying int64 values, mask)
    And the GSImprovementsPass in turn converts these to actual native
    scatters or masked stores.  
@@ -333,67 +417,49 @@ lDeclarePseudoScatters(llvm::Module *module) {
    SourcePos noPos;
    noPos.name = "__stdlib";
-    {
+    lDeclarePS(module, LLVMTypes::Int8VectorType, "__pseudo_scatter_8");
-        std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
+    lDeclarePS(module, LLVMTypes::Int16VectorType, "__pseudo_scatter_16");
-        argTypes.push_back(LLVMTypes::VoidPointerVectorType);
+    lDeclarePS(module, LLVMTypes::Int32VectorType, "__pseudo_scatter_32");
-        argTypes.push_back(LLVMTypes::Int32VectorType);
+    lDeclarePS(module, LLVMTypes::Int64VectorType, "__pseudo_scatter_64");
        argTypes.push_back(LLVMTypes::MaskType);
-        llvm::FunctionType *fType = 
+    lDeclarePSBO(module, LLVMTypes::Int8VectorType, 
-            llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
+                 "__pseudo_scatter_base_offsets_8");
-        llvm::Function *func =
+    lDeclarePSBO(module, LLVMTypes::Int16VectorType, 
-            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
+                 "__pseudo_scatter_base_offsets_16");
-                                   "__pseudo_scatter_32", module);
+    lDeclarePSBO(module, LLVMTypes::Int32VectorType, 
-        func->setDoesNotThrow(true);
+                 "__pseudo_scatter_base_offsets_32");
-    }
+    lDeclarePSBO(module, LLVMTypes::Int64VectorType, 
-    {
+                 "__pseudo_scatter_base_offsets_64");
-        std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
+}
        argTypes.push_back(LLVMTypes::VoidPointerVectorType);
        argTypes.push_back(LLVMTypes::Int64VectorType);
        argTypes.push_back(LLVMTypes::MaskType);
        llvm::FunctionType *fType = 
            llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
        llvm::Function *func =
            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
                                   "__pseudo_scatter_64", module);
        func->setDoesNotThrow(true);
    }
-    {
+static void
-        std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
+lDeclarePMS(llvm::Module *module, LLVM_TYPE_CONST llvm::Type *lvalueType, 
-        argTypes.push_back(LLVMTypes::VoidPointerType);
+            LLVM_TYPE_CONST llvm::Type *rvalueType, const char *name) {
-        argTypes.push_back(LLVMTypes::Int32VectorType);
+    SourcePos noPos;
-        argTypes.push_back(LLVMTypes::Int32VectorType);
+    noPos.name = "__stdlib";
        argTypes.push_back(LLVMTypes::MaskType);
-        llvm::FunctionType *fType = 
+    std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
-            llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
+    argTypes.push_back(lvalueType);
-        llvm::Function *func =
+    argTypes.push_back(rvalueType);
-            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
+    argTypes.push_back(LLVMTypes::MaskType);
                                   "__pseudo_scatter_base_offsets_32", module);
        func->setDoesNotThrow(true);
    }
    {
        std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
        argTypes.push_back(LLVMTypes::VoidPointerType);
        argTypes.push_back(LLVMTypes::Int32VectorType);
        argTypes.push_back(LLVMTypes::Int64VectorType);
        argTypes.push_back(LLVMTypes::MaskType);
-        llvm::FunctionType *fType = 
+    llvm::FunctionType *fType = 
-            llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
+        llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
-        llvm::Function *func =
+    llvm::Function *func = 
-            llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
+        llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
-                                   "__pseudo_scatter_base_offsets_64", module);
+                               name, module);
-        func->setDoesNotThrow(true);
+    func->setDoesNotThrow(true);
-    }
+    func->addFnAttr(llvm::Attribute::AlwaysInline);
    func->setDoesNotCapture(1, true);
 }
 /** This function declares placeholder masked store functions for the
    front-end to use.
    void __pseudo_masked_store_8 (uniform int8 *ptr, varying int8 values, mask)
    void __pseudo_masked_store_16(uniform int16 *ptr, varying int16 values, mask)
    void __pseudo_masked_store_32(uniform int32 *ptr, varying int32 values, mask)
    void __pseudo_masked_store_64(uniform int64 *ptr, varying int64 values, mask)
@@ -403,40 +469,14 @@ lDeclarePseudoScatters(llvm::Module *module) {
 */
 static void
 lDeclarePseudoMaskedStore(llvm::Module *module) {
-    SourcePos noPos;
+    lDeclarePMS(module, LLVMTypes::Int8VectorPointerType,
-    noPos.name = "__stdlib";
+                LLVMTypes::Int8VectorType, "__pseudo_masked_store_8");
-
+    lDeclarePMS(module, LLVMTypes::Int16VectorPointerType,
-    {
+                LLVMTypes::Int16VectorType, "__pseudo_masked_store_16");
-    std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
+    lDeclarePMS(module, LLVMTypes::Int32VectorPointerType, 
-    argTypes.push_back(LLVMTypes::Int32VectorPointerType);
+                LLVMTypes::Int32VectorType, "__pseudo_masked_store_32");
-    argTypes.push_back(LLVMTypes::Int32VectorType);
+    lDeclarePMS(module, LLVMTypes::Int64VectorPointerType, 
-    argTypes.push_back(LLVMTypes::MaskType);
+                LLVMTypes::Int64VectorType, "__pseudo_masked_store_64");
    llvm::FunctionType *fType = 
        llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
    llvm::Function *func = 
        llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
                               "__pseudo_masked_store_32", module);
    func->setDoesNotThrow(true);
    func->addFnAttr(llvm::Attribute::AlwaysInline);
    func->setDoesNotCapture(1, true);
    }
    {
    std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
    argTypes.push_back(LLVMTypes::Int64VectorPointerType);
    argTypes.push_back(LLVMTypes::Int64VectorType);
    argTypes.push_back(LLVMTypes::MaskType);
    llvm::FunctionType *fType = 
        llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
    llvm::Function *func = 
        llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
                               "__pseudo_masked_store_64", module);
    func->setDoesNotThrow(true);
    func->addFnAttr(llvm::Attribute::AlwaysInline);
    func->setDoesNotCapture(1, true);
    }
 }
@@ -609,8 +649,8 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
    // needed by the compiled program.
    { 
        std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
-        argTypes.push_back(llvm::PointerType::get(llvm::Type::getInt8Ty(*g->ctx), 0));
+        argTypes.push_back(LLVMTypes::VoidPointerType);
-        argTypes.push_back(llvm::PointerType::get(llvm::Type::getInt8Ty(*g->ctx), 0));
+        argTypes.push_back(LLVMTypes::VoidPointerType);
        argTypes.push_back(LLVMTypes::Int32Type);
        argTypes.push_back(LLVMTypes::Int32Type);
        llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType, 
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -1448,17 +1448,20 @@ FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type,
    llvm::Value *mask = GetMask();
    llvm::Function *gather = NULL;
    // Figure out which gather function to call based on the size of
-    // the elements; will need to generalize this for 8 and 16-bit
+    // the elements.
    // types.
    if (retType == LLVMTypes::DoubleVectorType || 
        retType == LLVMTypes::Int64VectorType)
        gather = m->module->getFunction("__pseudo_gather_64");
-    else {
+    else if (retType == LLVMTypes::FloatVectorType || 
-        assert(retType == LLVMTypes::FloatVectorType || 
+             retType == LLVMTypes::Int32VectorType)
               retType == LLVMTypes::Int32VectorType);
        gather = m->module->getFunction("__pseudo_gather_32");
    else if (retType == LLVMTypes::Int16VectorType)
        gather = m->module->getFunction("__pseudo_gather_16");
    else {
        assert(retType == LLVMTypes::Int8VectorType);
        gather = m->module->getFunction("__pseudo_gather_8");
    }
-    assert(gather);
+    assert(gather != NULL);
    llvm::Value *voidlvalue = BitCastInst(lvalue, LLVMTypes::VoidPointerType);
    llvm::Instruction *call = CallInst(gather, voidlvalue, mask, name);
@@ -1578,9 +1581,7 @@ FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
    rvalueType = rvalueType->GetAsNonConstType();
    llvm::Function *maskedStoreFunc = NULL;
-    // Figure out if we need a 32-bit or 64-bit masked store.  This
+    // Figure out if we need a 8, 16, 32 or 64-bit masked store.
    // will need to be generalized when/if 8 and 16-bit data types are
    // added.
    if (rvalueType == AtomicType::VaryingDouble || 
        rvalueType == AtomicType::VaryingInt64 ||
        rvalueType == AtomicType::VaryingUInt64) {
@@ -1590,13 +1591,11 @@ FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
        rvalue = BitCastInst(rvalue, LLVMTypes::Int64VectorType, 
                             "rvalue_to_int64");
    }
-    else {
+    else if (rvalueType == AtomicType::VaryingFloat ||
-        assert(rvalueType == AtomicType::VaryingFloat ||
+             rvalueType == AtomicType::VaryingBool ||
-               rvalueType == AtomicType::VaryingBool ||
+             rvalueType == AtomicType::VaryingInt32 ||
-               rvalueType == AtomicType::VaryingInt32 ||
+             rvalueType == AtomicType::VaryingUInt32 ||
-               rvalueType == AtomicType::VaryingUInt32 ||
+             dynamic_cast<const EnumType *>(rvalueType) != NULL) {
               dynamic_cast<const EnumType *>(rvalueType) != NULL);
        maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_32");
        lvalue = BitCastInst(lvalue, LLVMTypes::Int32VectorPointerType, 
                             "lvalue_to_int32vecptr");
@@ -1604,6 +1603,18 @@ FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
            rvalue = BitCastInst(rvalue, LLVMTypes::Int32VectorType, 
                                 "rvalue_to_int32");
    }
    else if (rvalueType == AtomicType::VaryingInt16 ||
             rvalueType == AtomicType::VaryingUInt16) {
        maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_16");
        lvalue = BitCastInst(lvalue, LLVMTypes::Int16VectorPointerType, 
                             "lvalue_to_int16vecptr");
    }
    else if (rvalueType == AtomicType::VaryingInt8 ||
             rvalueType == AtomicType::VaryingUInt8) {
        maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_8");
        lvalue = BitCastInst(lvalue, LLVMTypes::Int8VectorPointerType, 
                             "lvalue_to_int8vecptr");
    }
    std::vector<llvm::Value *> args;
    args.push_back(lvalue);
@@ -1668,14 +1679,15 @@ FunctionEmitContext::scatter(llvm::Value *rvalue, llvm::Value *lvalue,
        func = m->module->getFunction("__pseudo_scatter_64");
        rvalue = BitCastInst(rvalue, LLVMTypes::Int64VectorType, "rvalue2int");
    }
-    else {
+    else if (type == LLVMTypes::FloatVectorType || 
-        // FIXME: if this hits, presumably it's due to needing int8 and/or
+             type == LLVMTypes::Int32VectorType) {
        // int16 versions of scatter...
        assert(type == LLVMTypes::FloatVectorType || 
               type == LLVMTypes::Int32VectorType);
        func = m->module->getFunction("__pseudo_scatter_32");
        rvalue = BitCastInst(rvalue, LLVMTypes::Int32VectorType, "rvalue2int");
    }
    else if (type == LLVMTypes::Int16VectorType)
        func = m->module->getFunction("__pseudo_scatter_16");
    else if (type == LLVMTypes::Int8VectorType)
        func = m->module->getFunction("__pseudo_scatter_8");
    assert(func != NULL);
    AddInstrumentationPoint("scatter");
--- a/docs/ispc.txt
+++ b/docs/ispc.txt
@@ -427,7 +427,8 @@ The following identifiers are reserved as language keywords: ``bool``,
 ``char``, ``cif``, ``cwhile``, ``const``, ``continue``, ``creturn``,
 ``default``, ``do``, ``double``, ``else``, ``enum``, ``export``,
 ``extern``, ``false``, ``float``, ``for``, ``goto``, ``if``, ``inline``, ``int``,
-``int32``, ``int64``, ``launch``, ``print``, ``reference``, ``return``,
+``int8``, ``int16``, ``int32``, ``int64``, ``launch``, ``print``,
 ``reference``, ``return``,
 ``signed``, ``sizeof``, ``soa``, ``static``, ``struct``, ``switch``,
 ``sync``, ``task``, ``true``, ``typedef``, ``uniform``, ``union``,
 ``unsigned``, ``varying``, ``void``, ``volatile``, ``while``.
@@ -481,6 +482,10 @@ types.
 * ``void``: "empty" type representing no value.
 * ``bool``: boolean value; may be assigned ``true``, ``false``, or the
  value of a boolean expression.
 * ``int8``: 8-bit signed integer.
 * ``unsigned int8``: 8-bit unsigned integer.
 * ``int16``: 16-bit signed integer.
 * ``unsigned int16``: 16-bit unsigned integer.
 * ``int``: 32-bit signed integer; may also be specified as ``int32``.
 * ``unsigned int``: 32-bit unsigned integer; may also be specified as
  ``unsigned int32``.
@@ -497,7 +502,8 @@ general" of the two types, with the following precedence:
 ::
-  double > uint64 > int64 > float > uint32 > int32 > bool
+  double > uint64 > int64 > float > uint32 > int32 > 
      uint16 > int16 > uint8 > int8 > bool
 In other words, adding an ``int64`` to a ``double`` causes the ``int64`` to
 be converted to a ``double``, the addition to be performed, and a
@@ -1709,10 +1715,12 @@ the running program instances.
 ::
-    float broadcast(float value, uniform int index)
+    int8 broadcast(int8 value, uniform int index)
    int16 broadcast(int16 value, uniform int index)
    int32 broadcast(int32 value, uniform int index)
    double broadcast(double value, uniform int index)
    int64 broadcast(int64 value, uniform int index)
    float broadcast(float value, uniform int index)
    double broadcast(double value, uniform int index)
 The ``rotate()`` function allows each program instance to find the value
 that the given variable has in the neighbor ``offset`` steps away.  For
@@ -1725,10 +1733,12 @@ provided offset value can be positive or negative, and may be greater than
 ::
-    float rotate(float value, uniform int offset)
+    int8 rotate(int8 value, uniform int offset)
    int16 rotate(int16 value, uniform int offset)
    int32 rotate(int32 value, uniform int offset)
    double rotate(double value, uniform int offset)
    int64 rotate(int64 value, uniform int offset)
    float rotate(float value, uniform int offset)
    double rotate(double value, uniform int offset)
 Finally, the ``shuffle()`` functions allow two variants of fully general
@@ -1739,10 +1749,12 @@ from which to get the value of ``value``.  The provided values for
 ::
-    float shuffle(float value, int permutation)
+    int8 shuffle(int8 value, int permutation)
    int16 shuffle(int16 value, int permutation)
    int32 shuffle(int32 value, int permutation)
    double shuffle(double value, int permutation)
    int64 shuffle(int64 value, int permutation)
    float shuffle(float value, int permutation)
    double shuffle(double value, int permutation)
 The second variant of ``shuffle()`` permutes over the extended vector that
@@ -1753,10 +1765,12 @@ of ``value1``, etc.)
 ::
-    float shuffle(float value0, float value1, int permutation)
+    int8 shuffle(int8 value0, int8 value1, int permutation)
    int16 shuffle(int16 value0, int16 value1, int permutation)
    int32 shuffle(int32 value0, int32 value1, int permutation)
    double shuffle(double value0, double value1, int permutation)
    int64 shuffle(int64 value0, int64 value1, int permutation)
    float shuffle(float value0, float value1, int permutation)
    double shuffle(double value0, double value1, int permutation)
 The various variants of ``popcnt()`` return the population count--the
 number of bits set in the given value.
@@ -1861,10 +1875,19 @@ where the ``i`` th element of ``x`` has been replaced with the value ``v``
 ::
    uniform int8 extract(int8 x, uniform int i)
    uniform int16 extract(int16 x, uniform int i)
    uniform int32 extract(int32 x, uniform int i)
    uniform int64 extract(int64 x, uniform int i)
    uniform float extract(float x, uniform int i)
-    uniform int extract(int x, uniform int i)
+
 ::
    int8 insert(int8 x, uniform int i, uniform int8 v)
    int16 insert(int16 x, uniform int i, uniform int16 v)
    int32 insert(int32 x, uniform int i, uniform int32 v)
    int64 insert(int64 x, uniform int i, uniform int64 v)
    float insert(float x, uniform int i, uniform float v)
    int insert(int x, uniform int i, uniform int v)
 Atomic Operations and Memory Fences
@@ -1948,41 +1971,6 @@ value ``true`` (rather than just having the value one).  The
    int sign_extend(bool value) 
    uniform int sign_extend(uniform bool value) 
 ``ispc`` provides a number of bit/memory-level utility routines in its
 standard library as well.  It has routines that load from and store
 to 8-bit and 16-bit integer values stored in memory, converting to and from
 32-bit integers for use in computation in ``ispc`` code.  (These functions
 and this conversion step are necessary because ``ispc`` doesn't have native
 8-bit or 16-bit types in the language.)
 ::
    int load_from_int8(uniform int a[], uniform int offset)
    unsigned int load_from_int8(uniform unsigned int a[],
                                uniform int offset)
    void store_to_int8(uniform int a[], uniform int offset, 
                       int val)
    void store_to_int8(uniform unsigned int a[], uniform int offset, 
                       unsigned int val)
    unsigned int load_from_int16(uniform int a[],
                                 uniform int offset)
    unsigned int load_from_int16(uniform unsigned int a[],
                                 uniform int offset)
    void store_to_int16(uniform int a[], uniform int offset, 
                        int val)
    void store_to_int16(uniform unsigned int a[], uniform int offset, 
                        unsigned int val)
 There are three things to note in these functions.  First, note that these
 functions take either ``int`` or ``unsigned int`` arrays as parameters; you
 need to cast the ``int8_t`` and ``int16_t`` pointers from the C/C++ side
 to ``int`` or ``unsigned int`` when passing them to ``ispc`` code.  Second,
 although the arrays are passed as 32-bit integers, in the array indexing
 calculation, with the ``offset`` parameter, they are treated as if they
 were ``int8`` or ``int16`` types (i.e. the offset treated as being in terms
 of number of 8 or 16-bit elements).  Third, note that the value of
 ``programIndex`` is implicitly added to offset.
 The ``intbits()`` and ``floatbits()`` functions can be used to implement
 low-level floating-point bit twiddling.  For example, ``intbits()`` returns
 an ``unsigned int`` that is a bit-for-bit copy of the given ``float``
--- a/examples/rt/rt.cpp
+++ b/examples/rt/rt.cpp
@@ -190,7 +190,9 @@ int main(int argc, char *argv[]) {
        nodes[i].bounds[1].v[1] = b[4];
        nodes[i].bounds[1].v[2] = b[5];
        READ(nodes[i].offset, 1);
-        READ(nodes[i].primsAxis, 1);
+        READ(nodes[i].nPrimitives, 1);
        READ(nodes[i].splitAxis, 1);
        READ(nodes[i].pad, 1);
    }
    // And then read the triangles 
--- a/examples/rt/rt.ispc
+++ b/examples/rt/rt.ispc
@@ -50,21 +50,11 @@ struct Triangle {
 struct LinearBVHNode {
    uniform float3 bounds[2];
    uniform unsigned int offset;     // num primitives for leaf, second child for interior
-    uniform unsigned int primsAxis;  // 0:7 nPrimitives, 8:15 split axis, 16:31 padding
+    uniform unsigned int8 nPrimitives;
    uniform unsigned int8 splitAxis;
    uniform unsigned int16 pad;
 };
 // Returns the primitive count stored in the given BVH node.
 static inline uniform int nPrims(const reference LinearBVHNode node) {
    // The count is read from the dedicated 8-bit nPrimitives field of
    // LinearBVHNode instead of being masked out of a packed word.
    return node.nPrimitives;
 }
 // Returns the split axis stored in the given BVH node.
 static inline uniform int axis(const reference LinearBVHNode node) {
    // The axis is read from the dedicated 8-bit splitAxis field of
    // LinearBVHNode instead of being shifted out of a packed word.
    return node.splitAxis;
 }
 // A node is treated as interior exactly when nPrims() reports zero
 // primitives for it.
 static inline uniform bool isInterior(const reference LinearBVHNode node) {
    uniform int primitiveCount = nPrims(node);
    return primitiveCount == 0;
 }
 static inline float3 Cross(const float3 v1, const float3 v2) {
    float v1x = v1.x, v1y = v1.y, v1z = v1.z;
    float v2x = v2.x, v2y = v2.y, v2z = v2.z;
@@ -199,7 +189,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
        // Check ray against BVH node
        LinearBVHNode node = nodes[nodeNum];
        if (any(BBoxIntersect(node.bounds, ray))) {
-            uniform unsigned int nPrimitives = nPrims(node);
+            uniform unsigned int nPrimitives = node.nPrimitives;
            if (nPrimitives > 0) {
                // Intersect ray with primitives in leaf BVH node
                uniform unsigned int primitivesOffset = node.offset;
@@ -213,7 +203,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
            }
            else {
                // Put far BVH node on _todo_ stack, advance to near node
-                if (r.dirIsNeg[axis(node)]) {
+                if (r.dirIsNeg[node.splitAxis]) {
                   todo[todoOffset++] = nodeNum + 1;
                   nodeNum = node.offset;
                }
--- a/examples/rt/rt_serial.cpp
+++ b/examples/rt/rt_serial.cpp
@@ -75,30 +75,20 @@ struct Ray {
 namespace ispc {
    struct Triangle {
        float3 p[3];
-        int id;
+        int32_t id;
    };
    struct LinearBVHNode {
        float3 bounds[2];
-        unsigned int offset;     // primitives for leaf, second child for interior
+        int32_t offset;     // primitives for leaf, second child for interior
-        unsigned int primsAxis;  // 0:7 nPrimitives, 8:15 split axis, 16:31 padding
+        uint8_t nPrimitives;
        uint8_t splitAxis;
        uint16_t pad;
    };
 }
 using namespace ispc;
 inline int nPrims(const LinearBVHNode &node) {
    return (node.primsAxis & 0xff);
 }
 inline int axis(const LinearBVHNode &node) {
    return ((node.primsAxis >> 8) & 0xff);
 }
 // A node is treated as interior exactly when nPrims() reports zero
 // primitives for it.
 inline bool isInterior(const LinearBVHNode &node) {
    const int primCount = nPrims(node);
    return primCount == 0;
 }
 inline float3 Cross(const float3 &v1, const float3 &v2) {
    float v1x = v1.x, v1y = v1.y, v1z = v1.z;
    float v2x = v2.x, v2y = v2.y, v2z = v2.z;
@@ -230,7 +220,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
        // Check ray against BVH node
        const LinearBVHNode &node = nodes[nodeNum];
        if (BBoxIntersect(node.bounds, ray)) {
-            unsigned int nPrimitives = nPrims(node);
+            unsigned int nPrimitives = node.nPrimitives;
            if (nPrimitives > 0) {
                // Intersect ray with primitives in leaf BVH node
                unsigned int primitivesOffset = node.offset;
@@ -244,7 +234,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
            }
            else {
                // Put far BVH node on _todo_ stack, advance to near node
-                if (r.dirIsNeg[axis(node)]) {
+                if (r.dirIsNeg[node.splitAxis]) {
                   todo[todoOffset++] = nodeNum + 1;
                   nodeNum = node.offset;
                }
--- a/expr.cpp
+++ b/expr.cpp
@@ -93,6 +93,10 @@ lMaybeIssuePrecisionWarning(const AtomicType *toAtomicType,
                            SourcePos pos, const char *errorMsgBase) {
    switch (toAtomicType->basicType) {
    case AtomicType::TYPE_BOOL:
    case AtomicType::TYPE_INT8:
    case AtomicType::TYPE_UINT8:
    case AtomicType::TYPE_INT16:
    case AtomicType::TYPE_UINT16:
    case AtomicType::TYPE_INT32:
    case AtomicType::TYPE_UINT32:
    case AtomicType::TYPE_FLOAT:
@@ -101,6 +105,10 @@ lMaybeIssuePrecisionWarning(const AtomicType *toAtomicType,
    case AtomicType::TYPE_DOUBLE:
        if ((int)toAtomicType->basicType < (int)fromAtomicType->basicType &&
            toAtomicType->basicType != AtomicType::TYPE_BOOL &&
            !(toAtomicType->basicType == AtomicType::TYPE_INT8 && 
              fromAtomicType->basicType == AtomicType::TYPE_UINT8) &&
            !(toAtomicType->basicType == AtomicType::TYPE_INT16 && 
              fromAtomicType->basicType == AtomicType::TYPE_UINT16) &&
            !(toAtomicType->basicType == AtomicType::TYPE_INT32 && 
              fromAtomicType->basicType == AtomicType::TYPE_UINT32) &&
            !(toAtomicType->basicType == AtomicType::TYPE_INT64 && 
@@ -363,15 +371,33 @@ lLLVMConstantValue(const Type *type, llvm::LLVMContext *ctx, double value) {
                return (value != 0.) ? LLVMTrue : LLVMFalse;
            else
                return LLVMBoolVector(value != 0.);
-        case AtomicType::TYPE_UINT32: {
+        case AtomicType::TYPE_INT8: {
            int i = (int)value;
            assert((double)i == value);
            return isUniform ? LLVMInt8(i) : LLVMInt8Vector(i);
        }
        case AtomicType::TYPE_UINT8: {
            unsigned int i = (unsigned int)value;
-            return isUniform ? LLVMUInt32(i) : LLVMUInt32Vector(i);
+            return isUniform ? LLVMUInt8(i) : LLVMUInt8Vector(i);
        }
        case AtomicType::TYPE_INT16: {
            int i = (int)value;
            assert((double)i == value);
            return isUniform ? LLVMInt16(i) : LLVMInt16Vector(i);
        }
        case AtomicType::TYPE_UINT16: {
            unsigned int i = (unsigned int)value;
            return isUniform ? LLVMUInt16(i) : LLVMUInt16Vector(i);
        }
        case AtomicType::TYPE_INT32: {
            int i = (int)value;
            assert((double)i == value);
            return isUniform ? LLVMInt32(i) : LLVMInt32Vector(i);
        }
        case AtomicType::TYPE_UINT32: {
            unsigned int i = (unsigned int)value;
            return isUniform ? LLVMUInt32(i) : LLVMUInt32Vector(i);
        }
        case AtomicType::TYPE_FLOAT:
            return isUniform ? LLVMFloat((float)value) : 
                               LLVMFloatVector((float)value);
@@ -590,14 +616,13 @@ UnaryExpr::Optimize() {
    const Type *type = constExpr->GetType();
    bool isEnumType = dynamic_cast<const EnumType *>(type) != NULL;
-    if (type == AtomicType::UniformInt64 || 
+    const Type *baseType = type->GetAsNonConstType()->GetAsUniformType();
-        type == AtomicType::VaryingInt64 ||
+    if (baseType == AtomicType::UniformInt8 ||
-        type == AtomicType::UniformUInt64 || 
+        baseType == AtomicType::UniformUInt8 ||
-        type == AtomicType::VaryingUInt64 ||
+        baseType == AtomicType::UniformInt16 ||
-        type == AtomicType::UniformConstInt64 || 
+        baseType == AtomicType::UniformUInt16 ||
-        type == AtomicType::VaryingConstInt64 ||
+        baseType == AtomicType::UniformInt64 ||
-        type == AtomicType::UniformConstUInt64 || 
+        baseType == AtomicType::UniformUInt64)
        type == AtomicType::VaryingConstUInt64)
        // FIXME: should handle these at some point; for now we only do
        // constant folding for bool, int32 and float types...
        return this;
@@ -3058,6 +3083,86 @@ MemberExpr::getCandidateNearMatches() const {
 ///////////////////////////////////////////////////////////////////////////
 // ConstExpr
 /** Builds a constant expression from a single signed 8-bit value.  The
     const variant of the given type is recorded and must be
     AtomicType::UniformConstInt8; the value goes into int8Val[0]. */
 ConstExpr::ConstExpr(const Type *t, int8_t i, SourcePos p)
  : Expr(p) {
    type = t->GetAsConstType();
    assert(type == AtomicType::UniformConstInt8);
    int8Val[0] = i;
 }
 /** Builds a constant expression from an array of signed 8-bit values.
     The const variant of the given type is recorded and must be the
     uniform or varying const int8 type; Count() entries are copied from
     the array into int8Val. */
 ConstExpr::ConstExpr(const Type *t, int8_t *i, SourcePos p)
  : Expr(p) {
    type = t->GetAsConstType();
    assert(type == AtomicType::UniformConstInt8 || 
           type == AtomicType::VaryingConstInt8);
    for (int lane = 0; lane < Count(); ++lane) {
        int8Val[lane] = i[lane];
    }
 }
 /** Builds a constant expression from a single unsigned 8-bit value.
     The supplied type is converted to its const variant before being
     stored, so the sanity check has to compare against
     AtomicType::UniformConstUInt8 (matching what the signed int8
     constructor does); comparing against the non-const UniformUInt8
     would reject the const-qualified type that is actually stored.
     @param t  type of the constant; expected to be uniform unsigned int8
     @param u  value stored in uint8Val[0]
     @param p  source position forwarded to the Expr base class */
 ConstExpr::ConstExpr(const Type *t, uint8_t u, SourcePos p) 
  : Expr(p) {
    type = t;
    type = type->GetAsConstType();
    assert(type == AtomicType::UniformConstUInt8);
    uint8Val[0] = u;
 }
 /** Builds a constant expression from an array of unsigned 8-bit values.
     The const variant of the given type is recorded and must be the
     uniform or varying const unsigned int8 type; Count() entries are
     copied from the array into uint8Val. */
 ConstExpr::ConstExpr(const Type *t, uint8_t *u, SourcePos p)
  : Expr(p) {
    type = t->GetAsConstType();
    assert(type == AtomicType::UniformConstUInt8 || 
           type == AtomicType::VaryingConstUInt8);
    for (int lane = 0; lane < Count(); ++lane) {
        uint8Val[lane] = u[lane];
    }
 }
 /** Builds a constant expression from a single signed 16-bit value.  The
     const variant of the given type is recorded and must be
     AtomicType::UniformConstInt16; the value goes into int16Val[0]. */
 ConstExpr::ConstExpr(const Type *t, int16_t i, SourcePos p)
  : Expr(p) {
    type = t->GetAsConstType();
    assert(type == AtomicType::UniformConstInt16);
    int16Val[0] = i;
 }
 /** Builds a constant expression from an array of signed 16-bit values.
     The const variant of the given type is recorded and must be the
     uniform or varying const int16 type; Count() entries are copied from
     the array into int16Val. */
 ConstExpr::ConstExpr(const Type *t, int16_t *i, SourcePos p)
  : Expr(p) {
    type = t->GetAsConstType();
    assert(type == AtomicType::UniformConstInt16 || 
           type == AtomicType::VaryingConstInt16);
    for (int lane = 0; lane < Count(); ++lane) {
        int16Val[lane] = i[lane];
    }
 }
 /** Builds a constant expression from a single unsigned 16-bit value.
     The supplied type is converted to its const variant before being
     stored, so the sanity check has to compare against
     AtomicType::UniformConstUInt16 (matching what the signed int16
     constructor does); comparing against the non-const UniformUInt16
     would reject the const-qualified type that is actually stored.
     @param t  type of the constant; expected to be uniform unsigned int16
     @param u  value stored in uint16Val[0]
     @param p  source position forwarded to the Expr base class */
 ConstExpr::ConstExpr(const Type *t, uint16_t u, SourcePos p) 
  : Expr(p) {
    type = t;
    type = type->GetAsConstType();
    assert(type == AtomicType::UniformConstUInt16);
    uint16Val[0] = u;
 }
 /** Builds a constant expression from an array of unsigned 16-bit
     values.  The const variant of the given type is recorded and must be
     the uniform or varying const unsigned int16 type; Count() entries
     are copied from the array into uint16Val. */
 ConstExpr::ConstExpr(const Type *t, uint16_t *u, SourcePos p)
  : Expr(p) {
    type = t->GetAsConstType();
    assert(type == AtomicType::UniformConstUInt16 || 
           type == AtomicType::VaryingConstUInt16);
    for (int lane = 0; lane < Count(); ++lane) {
        uint16Val[lane] = u[lane];
    }
 }
 ConstExpr::ConstExpr(const Type *t, int32_t i, SourcePos p) 
  : Expr(p) {
    type = t;
@@ -3212,6 +3317,22 @@ ConstExpr::ConstExpr(ConstExpr *old, double *v)
        for (int i = 0; i < Count(); ++i)
            boolVal[i] = (v[i] != 0.);
        break;
    case AtomicType::TYPE_INT8:
        for (int i = 0; i < Count(); ++i)
            int8Val[i] = (int)v[i];
        break;
    case AtomicType::TYPE_UINT8:
        for (int i = 0; i < Count(); ++i)
            uint8Val[i] = (unsigned int)v[i];
        break;
    case AtomicType::TYPE_INT16:
        for (int i = 0; i < Count(); ++i)
            int16Val[i] = (int)v[i];
        break;
    case AtomicType::TYPE_UINT16:
        for (int i = 0; i < Count(); ++i)
            uint16Val[i] = (unsigned int)v[i];
        break;
    case AtomicType::TYPE_INT32:
        for (int i = 0; i < Count(); ++i)
            int32Val[i] = (int)v[i];
@@ -3270,6 +3391,18 @@ ConstExpr::GetValue(FunctionEmitContext *ctx) const {
            return LLVMBoolVector(boolVal);
        else
            return boolVal[0] ? LLVMTrue : LLVMFalse;
    case AtomicType::TYPE_INT8:
        return isVarying ? LLVMInt8Vector(int8Val) : 
                           LLVMInt8(int8Val[0]);
    case AtomicType::TYPE_UINT8:
        return isVarying ? LLVMUInt8Vector(uint8Val) : 
                           LLVMUInt8(uint8Val[0]);
    case AtomicType::TYPE_INT16:
        return isVarying ? LLVMInt16Vector(int16Val) : 
                           LLVMInt16(int16Val[0]);
    case AtomicType::TYPE_UINT16:
        return isVarying ? LLVMUInt16Vector(uint16Val) : 
                           LLVMUInt16(uint16Val[0]);
    case AtomicType::TYPE_INT32:
        return isVarying ? LLVMInt32Vector(int32Val) : 
                           LLVMInt32(int32Val[0]);
@@ -3351,6 +3484,10 @@ int
 ConstExpr::AsInt64(int64_t *ip, bool forceVarying) const {
    switch (getBasicType()) {
    case AtomicType::TYPE_BOOL:   lConvert(boolVal,   ip, Count(), forceVarying); break;
    case AtomicType::TYPE_INT8:   lConvert(int8Val,   ip, Count(), forceVarying); break;
    case AtomicType::TYPE_UINT8:  lConvert(uint8Val,  ip, Count(), forceVarying); break;
    case AtomicType::TYPE_INT16:  lConvert(int16Val,  ip, Count(), forceVarying); break;
    case AtomicType::TYPE_UINT16: lConvert(uint16Val, ip, Count(), forceVarying); break;
    case AtomicType::TYPE_INT32:  lConvert(int32Val,  ip, Count(), forceVarying); break;
    case AtomicType::TYPE_UINT32: lConvert(uint32Val, ip, Count(), forceVarying); break;
    case AtomicType::TYPE_FLOAT:  lConvert(floatVal,  ip, Count(), forceVarying); break;
@@ -3368,6 +3505,10 @@ int
 ConstExpr::AsUInt64(uint64_t *up, bool forceVarying) const {
    switch (getBasicType()) {
    case AtomicType::TYPE_BOOL:   lConvert(boolVal,   up, Count(), forceVarying); break;
    case AtomicType::TYPE_INT8:   lConvert(int8Val,   up, Count(), forceVarying); break;
    case AtomicType::TYPE_UINT8:  lConvert(uint8Val,  up, Count(), forceVarying); break;
    case AtomicType::TYPE_INT16:  lConvert(int16Val,  up, Count(), forceVarying); break;
    case AtomicType::TYPE_UINT16: lConvert(uint16Val, up, Count(), forceVarying); break;
    case AtomicType::TYPE_INT32:  lConvert(int32Val,  up, Count(), forceVarying); break;
    case AtomicType::TYPE_UINT32: lConvert(uint32Val, up, Count(), forceVarying); break;
    case AtomicType::TYPE_FLOAT:  lConvert(floatVal,  up, Count(), forceVarying); break;
@@ -3385,6 +3526,10 @@ int
 ConstExpr::AsDouble(double *d, bool forceVarying) const {
    switch (getBasicType()) {
    case AtomicType::TYPE_BOOL:   lConvert(boolVal,   d, Count(), forceVarying); break;
    case AtomicType::TYPE_INT8:   lConvert(int8Val,   d, Count(), forceVarying); break;
    case AtomicType::TYPE_UINT8:  lConvert(uint8Val,  d, Count(), forceVarying); break;
    case AtomicType::TYPE_INT16:  lConvert(int16Val,  d, Count(), forceVarying); break;
    case AtomicType::TYPE_UINT16: lConvert(uint16Val, d, Count(), forceVarying); break;
    case AtomicType::TYPE_INT32:  lConvert(int32Val,  d, Count(), forceVarying); break;
    case AtomicType::TYPE_UINT32: lConvert(uint32Val, d, Count(), forceVarying); break;
    case AtomicType::TYPE_FLOAT:  lConvert(floatVal,  d, Count(), forceVarying); break;
@@ -3402,6 +3547,10 @@ int
 ConstExpr::AsFloat(float *fp, bool forceVarying) const {
    switch (getBasicType()) {
    case AtomicType::TYPE_BOOL:   lConvert(boolVal,   fp, Count(), forceVarying); break;
    case AtomicType::TYPE_INT8:   lConvert(int8Val,   fp, Count(), forceVarying); break;
    case AtomicType::TYPE_UINT8:  lConvert(uint8Val,  fp, Count(), forceVarying); break;
    case AtomicType::TYPE_INT16:  lConvert(int16Val,  fp, Count(), forceVarying); break;
    case AtomicType::TYPE_UINT16: lConvert(uint16Val, fp, Count(), forceVarying); break;
    case AtomicType::TYPE_INT32:  lConvert(int32Val,  fp, Count(), forceVarying); break;
    case AtomicType::TYPE_UINT32: lConvert(uint32Val, fp, Count(), forceVarying); break;
    case AtomicType::TYPE_FLOAT:  lConvert(floatVal,  fp, Count(), forceVarying); break;
@@ -3419,6 +3568,10 @@ int
 ConstExpr::AsBool(bool *b, bool forceVarying) const {
    switch (getBasicType()) {
    case AtomicType::TYPE_BOOL:   lConvert(boolVal,   b, Count(), forceVarying); break;
    case AtomicType::TYPE_INT8:   lConvert(int8Val,   b, Count(), forceVarying); break;
    case AtomicType::TYPE_UINT8:  lConvert(uint8Val,  b, Count(), forceVarying); break;
    case AtomicType::TYPE_INT16:  lConvert(int16Val,  b, Count(), forceVarying); break;
    case AtomicType::TYPE_UINT16: lConvert(uint16Val, b, Count(), forceVarying); break;
    case AtomicType::TYPE_INT32:  lConvert(int32Val,  b, Count(), forceVarying); break;
    case AtomicType::TYPE_UINT32: lConvert(uint32Val, b, Count(), forceVarying); break;
    case AtomicType::TYPE_FLOAT:  lConvert(floatVal,  b, Count(), forceVarying); break;
@@ -3432,10 +3585,98 @@ ConstExpr::AsBool(bool *b, bool forceVarying) const {
 }
 /** Writes this constant's value(s) to ip as int8_t.  The value array
     matching the constant's basic type is handed to lConvert() together
     with Count() and forceVarying.  An unhandled basic type is reported
     through FATAL().  Returns Count(). */
 int
 ConstExpr::AsInt8(int8_t *ip, bool forceVarying) const {
    switch (getBasicType()) {
    case AtomicType::TYPE_BOOL:
        lConvert(boolVal, ip, Count(), forceVarying);
        break;
    case AtomicType::TYPE_INT8:
        lConvert(int8Val, ip, Count(), forceVarying);
        break;
    case AtomicType::TYPE_UINT8:
        lConvert(uint8Val, ip, Count(), forceVarying);
        break;
    case AtomicType::TYPE_INT16:
        lConvert(int16Val, ip, Count(), forceVarying);
        break;
    case AtomicType::TYPE_UINT16:
        lConvert(uint16Val, ip, Count(), forceVarying);
        break;
    case AtomicType::TYPE_INT32:
        lConvert(int32Val, ip, Count(), forceVarying);
        break;
    case AtomicType::TYPE_UINT32:
        lConvert(uint32Val, ip, Count(), forceVarying);
        break;
    case AtomicType::TYPE_FLOAT:
        lConvert(floatVal, ip, Count(), forceVarying);
        break;
    case AtomicType::TYPE_DOUBLE:
        lConvert(doubleVal, ip, Count(), forceVarying);
        break;
    case AtomicType::TYPE_INT64:
        lConvert(int64Val, ip, Count(), forceVarying);
        break;
    case AtomicType::TYPE_UINT64:
        lConvert(uint64Val, ip, Count(), forceVarying);
        break;
    default:
        FATAL("unimplemented const type");
    }
    return Count();
 }
 /** Writes this constant's value(s) to up as uint8_t.  The value array
     matching the constant's basic type is handed to lConvert() together
     with Count() and forceVarying.  An unhandled basic type is reported
     through FATAL().  Returns Count(). */
 int
 ConstExpr::AsUInt8(uint8_t *up, bool forceVarying) const {
    switch (getBasicType()) {
    case AtomicType::TYPE_BOOL:
        lConvert(boolVal, up, Count(), forceVarying);
        break;
    case AtomicType::TYPE_INT8:
        lConvert(int8Val, up, Count(), forceVarying);
        break;
    case AtomicType::TYPE_UINT8:
        lConvert(uint8Val, up, Count(), forceVarying);
        break;
    case AtomicType::TYPE_INT16:
        lConvert(int16Val, up, Count(), forceVarying);
        break;
    case AtomicType::TYPE_UINT16:
        lConvert(uint16Val, up, Count(), forceVarying);
        break;
    case AtomicType::TYPE_INT32:
        lConvert(int32Val, up, Count(), forceVarying);
        break;
    case AtomicType::TYPE_UINT32:
        lConvert(uint32Val, up, Count(), forceVarying);
        break;
    case AtomicType::TYPE_FLOAT:
        lConvert(floatVal, up, Count(), forceVarying);
        break;
    case AtomicType::TYPE_DOUBLE:
        lConvert(doubleVal, up, Count(), forceVarying);
        break;
    case AtomicType::TYPE_INT64:
        lConvert(int64Val, up, Count(), forceVarying);
        break;
    case AtomicType::TYPE_UINT64:
        lConvert(uint64Val, up, Count(), forceVarying);
        break;
    default:
        FATAL("unimplemented const type");
    }
    return Count();
 }
 /** Writes this constant's value(s) to ip as int16_t.  The value array
     matching the constant's basic type is handed to lConvert() together
     with Count() and forceVarying.  An unhandled basic type is reported
     through FATAL().  Returns Count(). */
 int
 ConstExpr::AsInt16(int16_t *ip, bool forceVarying) const {
    switch (getBasicType()) {
    case AtomicType::TYPE_BOOL:
        lConvert(boolVal, ip, Count(), forceVarying);
        break;
    case AtomicType::TYPE_INT8:
        lConvert(int8Val, ip, Count(), forceVarying);
        break;
    case AtomicType::TYPE_UINT8:
        lConvert(uint8Val, ip, Count(), forceVarying);
        break;
    case AtomicType::TYPE_INT16:
        lConvert(int16Val, ip, Count(), forceVarying);
        break;
    case AtomicType::TYPE_UINT16:
        lConvert(uint16Val, ip, Count(), forceVarying);
        break;
    case AtomicType::TYPE_INT32:
        lConvert(int32Val, ip, Count(), forceVarying);
        break;
    case AtomicType::TYPE_UINT32:
        lConvert(uint32Val, ip, Count(), forceVarying);
        break;
    case AtomicType::TYPE_FLOAT:
        lConvert(floatVal, ip, Count(), forceVarying);
        break;
    case AtomicType::TYPE_DOUBLE:
        lConvert(doubleVal, ip, Count(), forceVarying);
        break;
    case AtomicType::TYPE_INT64:
        lConvert(int64Val, ip, Count(), forceVarying);
        break;
    case AtomicType::TYPE_UINT64:
        lConvert(uint64Val, ip, Count(), forceVarying);
        break;
    default:
        FATAL("unimplemented const type");
    }
    return Count();
 }
 /** Writes this constant's value(s) to up as uint16_t.  The value array
     matching the constant's basic type is handed to lConvert() together
     with Count() and forceVarying.  An unhandled basic type is reported
     through FATAL().  Returns Count(). */
 int
 ConstExpr::AsUInt16(uint16_t *up, bool forceVarying) const {
    switch (getBasicType()) {
    case AtomicType::TYPE_BOOL:
        lConvert(boolVal, up, Count(), forceVarying);
        break;
    case AtomicType::TYPE_INT8:
        lConvert(int8Val, up, Count(), forceVarying);
        break;
    case AtomicType::TYPE_UINT8:
        lConvert(uint8Val, up, Count(), forceVarying);
        break;
    case AtomicType::TYPE_INT16:
        lConvert(int16Val, up, Count(), forceVarying);
        break;
    case AtomicType::TYPE_UINT16:
        lConvert(uint16Val, up, Count(), forceVarying);
        break;
    case AtomicType::TYPE_INT32:
        lConvert(int32Val, up, Count(), forceVarying);
        break;
    case AtomicType::TYPE_UINT32:
        lConvert(uint32Val, up, Count(), forceVarying);
        break;
    case AtomicType::TYPE_FLOAT:
        lConvert(floatVal, up, Count(), forceVarying);
        break;
    case AtomicType::TYPE_DOUBLE:
        lConvert(doubleVal, up, Count(), forceVarying);
        break;
    case AtomicType::TYPE_INT64:
        lConvert(int64Val, up, Count(), forceVarying);
        break;
    case AtomicType::TYPE_UINT64:
        lConvert(uint64Val, up, Count(), forceVarying);
        break;
    default:
        FATAL("unimplemented const type");
    }
    return Count();
 }
 int
 ConstExpr::AsInt32(int32_t *ip, bool forceVarying) const {
    switch (getBasicType()) {
    case AtomicType::TYPE_BOOL:   lConvert(boolVal,   ip, Count(), forceVarying); break;
    case AtomicType::TYPE_INT8:   lConvert(int8Val,   ip, Count(), forceVarying); break;
    case AtomicType::TYPE_UINT8:  lConvert(uint8Val,  ip, Count(), forceVarying); break;
    case AtomicType::TYPE_INT16:  lConvert(int16Val,  ip, Count(), forceVarying); break;
    case AtomicType::TYPE_UINT16: lConvert(uint16Val, ip, Count(), forceVarying); break;
    case AtomicType::TYPE_INT32:  lConvert(int32Val,  ip, Count(), forceVarying); break;
    case AtomicType::TYPE_UINT32: lConvert(uint32Val, ip, Count(), forceVarying); break;
    case AtomicType::TYPE_FLOAT:  lConvert(floatVal,  ip, Count(), forceVarying); break;
@@ -3453,6 +3694,10 @@ int
 ConstExpr::AsUInt32(uint32_t *up, bool forceVarying) const {
    switch (getBasicType()) {
    case AtomicType::TYPE_BOOL:   lConvert(boolVal,   up, Count(), forceVarying); break;
    case AtomicType::TYPE_INT8:   lConvert(int8Val,   up, Count(), forceVarying); break;
    case AtomicType::TYPE_UINT8:  lConvert(uint8Val,  up, Count(), forceVarying); break;
    case AtomicType::TYPE_INT16:  lConvert(int16Val,  up, Count(), forceVarying); break;
    case AtomicType::TYPE_UINT16: lConvert(uint16Val, up, Count(), forceVarying); break;
    case AtomicType::TYPE_INT32:  lConvert(int32Val,  up, Count(), forceVarying); break;
    case AtomicType::TYPE_UINT32: lConvert(uint32Val, up, Count(), forceVarying); break;
    case AtomicType::TYPE_FLOAT:  lConvert(floatVal,  up, Count(), forceVarying); break;
@@ -3488,6 +3733,40 @@ ConstExpr::GetConstant(const Type *type) const {
        else
            return LLVMBoolVector(bv);
    }
    else if (type == AtomicType::UniformInt8 || type == AtomicType::VaryingInt8) {
        int8_t iv[ISPC_MAX_NVEC];
        AsInt8(iv, type->IsVaryingType());
        if (type->IsUniformType())
            return LLVMInt8(iv[0]);
        else
            return LLVMInt8Vector(iv);
    }
    else if (type == AtomicType::UniformUInt8 || type == AtomicType::VaryingUInt8 ||
             dynamic_cast<const EnumType *>(type) != NULL) {
        uint8_t uiv[ISPC_MAX_NVEC];
        AsUInt8(uiv, type->IsVaryingType());
        if (type->IsUniformType())
            return LLVMUInt8(uiv[0]);
        else
            return LLVMUInt8Vector(uiv);
    }
    else if (type == AtomicType::UniformInt16 || type == AtomicType::VaryingInt16) {
        int16_t iv[ISPC_MAX_NVEC];
        AsInt16(iv, type->IsVaryingType());
        if (type->IsUniformType())
            return LLVMInt16(iv[0]);
        else
            return LLVMInt16Vector(iv);
    }
    else if (type == AtomicType::UniformUInt16 || type == AtomicType::VaryingUInt16 ||
             dynamic_cast<const EnumType *>(type) != NULL) {
        uint16_t uiv[ISPC_MAX_NVEC];
        AsUInt16(uiv, type->IsVaryingType());
        if (type->IsUniformType())
            return LLVMUInt16(uiv[0]);
        else
            return LLVMUInt16Vector(uiv);
    }
    else if (type == AtomicType::UniformInt32 || type == AtomicType::VaryingInt32) {
        int32_t iv[ISPC_MAX_NVEC];
        AsInt32(iv, type->IsVaryingType());
@@ -3564,6 +3843,18 @@ ConstExpr::Print() const {
        case AtomicType::TYPE_BOOL:
            printf("%s", boolVal[i] ? "true" : "false");
            break;
        case AtomicType::TYPE_INT8:
            printf("%d", (int)int8Val[i]);
            break;
        case AtomicType::TYPE_UINT8:
            printf("%u", (int)uint8Val[i]);
            break;
        case AtomicType::TYPE_INT16:
            printf("%d", (int)int16Val[i]);
            break;
        case AtomicType::TYPE_UINT16:
            printf("%u", (int)uint16Val[i]);
            break;
        case AtomicType::TYPE_INT32:
            printf("%d", int32Val[i]);
            break;
@@ -3637,11 +3928,15 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
            cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int
                                 exprVal, targetType, "bool2float");
            break;
        case AtomicType::TYPE_INT8:
        case AtomicType::TYPE_INT16:
        case AtomicType::TYPE_INT32:
        case AtomicType::TYPE_INT64:
            cast = ctx->CastInst(llvm::Instruction::SIToFP, // signed int to float
                                 exprVal, targetType, "int2float");
            break;
        case AtomicType::TYPE_UINT8:
        case AtomicType::TYPE_UINT16:
        case AtomicType::TYPE_UINT32:
        case AtomicType::TYPE_UINT64:
            if (fromType->IsVaryingType())
@@ -3675,11 +3970,15 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
            cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int to double
                                 exprVal, targetType, "bool2double");
            break;
        case AtomicType::TYPE_INT8:
        case AtomicType::TYPE_INT16:
        case AtomicType::TYPE_INT32:
        case AtomicType::TYPE_INT64:
            cast = ctx->CastInst(llvm::Instruction::SIToFP, // signed int
                                 exprVal, targetType, "int2double");
            break;
        case AtomicType::TYPE_UINT8:
        case AtomicType::TYPE_UINT16:
        case AtomicType::TYPE_UINT32:
        case AtomicType::TYPE_UINT64:
            if (fromType->IsVaryingType())
@@ -3699,6 +3998,170 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
        }
        break;
    }
    case AtomicType::TYPE_INT8: {
        LLVM_TYPE_CONST llvm::Type *targetType = 
            fromType->IsUniformType() ? LLVMTypes::Int8Type :
                                        LLVMTypes::Int8VectorType;
        switch (fromType->basicType) {
        case AtomicType::TYPE_BOOL:
            if (fromType->IsVaryingType() && 
                LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
            cast = ctx->ZExtInst(exprVal, targetType, "bool2int");
            break;
        case AtomicType::TYPE_INT8:
        case AtomicType::TYPE_UINT8:
            cast = exprVal;
            break;
        case AtomicType::TYPE_INT16:
        case AtomicType::TYPE_UINT16:
        case AtomicType::TYPE_INT32:
        case AtomicType::TYPE_UINT32:
        case AtomicType::TYPE_INT64:
        case AtomicType::TYPE_UINT64:
            cast = ctx->TruncInst(exprVal, targetType, "int64_to_int8");
            break;
        case AtomicType::TYPE_FLOAT:
            cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
                                 exprVal, targetType, "float2int");
            break;
        case AtomicType::TYPE_DOUBLE:
            cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
                                 exprVal, targetType, "double2int");
            break;
        default:
            FATAL("unimplemented");
        }
        break;
    }
    case AtomicType::TYPE_UINT8: {
        LLVM_TYPE_CONST llvm::Type *targetType = 
            fromType->IsUniformType() ? LLVMTypes::Int8Type :
                                        LLVMTypes::Int8VectorType;
        switch (fromType->basicType) {
        case AtomicType::TYPE_BOOL:
            if (fromType->IsVaryingType() && 
                LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
            cast = ctx->ZExtInst(exprVal, targetType, "bool2uint");
            break;
        case AtomicType::TYPE_INT8:
        case AtomicType::TYPE_UINT8:
            cast = exprVal;
            break;
        case AtomicType::TYPE_INT16:
        case AtomicType::TYPE_UINT16:
        case AtomicType::TYPE_INT32:
        case AtomicType::TYPE_UINT32:
        case AtomicType::TYPE_INT64:
        case AtomicType::TYPE_UINT64:
            cast = ctx->TruncInst(exprVal, targetType, "int64_to_uint8");
            break;
        case AtomicType::TYPE_FLOAT:
            if (fromType->IsVaryingType())
                PerformanceWarning(pos, "Conversion from float to unsigned int is slow. "
                                   "Use \"int\" if possible");
            cast = ctx->CastInst(llvm::Instruction::FPToUI, // unsigned int
                                 exprVal, targetType, "float2uint");
            break;
        case AtomicType::TYPE_DOUBLE:
            if (fromType->IsVaryingType())
                PerformanceWarning(pos, "Conversion from double to unsigned int is slow. "
                                   "Use \"int\" if possible");
            cast = ctx->CastInst(llvm::Instruction::FPToUI, // unsigned int
                                 exprVal, targetType, "double2uint");
            break;
        default:
            FATAL("unimplemented");
        }
        break;
    }
    case AtomicType::TYPE_INT16: {
        LLVM_TYPE_CONST llvm::Type *targetType = 
            fromType->IsUniformType() ? LLVMTypes::Int16Type :
                                        LLVMTypes::Int16VectorType;
        switch (fromType->basicType) {
        case AtomicType::TYPE_BOOL:
            if (fromType->IsVaryingType() && 
                LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
            cast = ctx->ZExtInst(exprVal, targetType, "bool2int");
            break;
        case AtomicType::TYPE_INT8:
            cast = ctx->SExtInst(exprVal, targetType, "int2int16");
            break;
        case AtomicType::TYPE_UINT8:
            cast = ctx->ZExtInst(exprVal, targetType, "uint2uint16");
            break;
        case AtomicType::TYPE_INT16:
        case AtomicType::TYPE_UINT16:
            cast = exprVal;
            break;
        case AtomicType::TYPE_FLOAT:
            cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
                                 exprVal, targetType, "float2int");
            break;
        case AtomicType::TYPE_INT32:
        case AtomicType::TYPE_UINT32:
        case AtomicType::TYPE_INT64:
        case AtomicType::TYPE_UINT64:
            cast = ctx->TruncInst(exprVal, targetType, "int64_to_int16");
            break;
        case AtomicType::TYPE_DOUBLE:
            cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
                                 exprVal, targetType, "double2int");
            break;
        default:
            FATAL("unimplemented");
        }
        break;
    }
    case AtomicType::TYPE_UINT16: {
        LLVM_TYPE_CONST llvm::Type *targetType = 
            fromType->IsUniformType() ? LLVMTypes::Int16Type :
                                        LLVMTypes::Int16VectorType;
        switch (fromType->basicType) {
        case AtomicType::TYPE_BOOL:
            if (fromType->IsVaryingType() && 
                LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
            cast = ctx->ZExtInst(exprVal, targetType, "bool2uint16");
            break;
        case AtomicType::TYPE_INT8:
            cast = ctx->SExtInst(exprVal, targetType, "uint2uint16");
            break;
        case AtomicType::TYPE_UINT8:
            cast = ctx->ZExtInst(exprVal, targetType, "uint2uint16");
            break;            
        case AtomicType::TYPE_INT16:
        case AtomicType::TYPE_UINT16:
            cast = exprVal;
            break;
        case AtomicType::TYPE_FLOAT:
            if (fromType->IsVaryingType())
                PerformanceWarning(pos, "Conversion from float to unsigned int is slow. "
                                   "Use \"int\" if possible");
            cast = ctx->CastInst(llvm::Instruction::FPToUI, // unsigned int
                                 exprVal, targetType, "float2uint");
            break;
        case AtomicType::TYPE_INT32:
        case AtomicType::TYPE_UINT32:
        case AtomicType::TYPE_INT64:
        case AtomicType::TYPE_UINT64:
            cast = ctx->TruncInst(exprVal, targetType, "int64_to_uint16");
            break;
        case AtomicType::TYPE_DOUBLE:
            if (fromType->IsVaryingType())
                PerformanceWarning(pos, "Conversion from double to unsigned int is slow. "
                                   "Use \"int\" if possible");
            cast = ctx->CastInst(llvm::Instruction::FPToUI, // unsigned int
                                 exprVal, targetType, "double2uint");
            break;
        default:
            FATAL("unimplemented");
        }
        break;
    }
    case AtomicType::TYPE_INT32: {
        LLVM_TYPE_CONST llvm::Type *targetType = 
            fromType->IsUniformType() ? LLVMTypes::Int32Type :
@@ -3710,6 +4173,14 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
            cast = ctx->ZExtInst(exprVal, targetType, "bool2int");
            break;
        case AtomicType::TYPE_INT8:
        case AtomicType::TYPE_INT16:
            cast = ctx->SExtInst(exprVal, targetType, "int2int32");
            break;
        case AtomicType::TYPE_UINT8:
        case AtomicType::TYPE_UINT16:
            cast = ctx->ZExtInst(exprVal, targetType, "uint2uint32");
            break;
        case AtomicType::TYPE_INT32:
        case AtomicType::TYPE_UINT32:
            cast = exprVal;
@@ -3742,6 +4213,14 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
            cast = ctx->ZExtInst(exprVal, targetType, "bool2uint");
            break;
        case AtomicType::TYPE_INT8:
        case AtomicType::TYPE_INT16:
            cast = ctx->SExtInst(exprVal, targetType, "uint2uint");
            break;
        case AtomicType::TYPE_UINT8:
        case AtomicType::TYPE_UINT16:
            cast = ctx->ZExtInst(exprVal, targetType, "uint2uint");
            break;            
        case AtomicType::TYPE_INT32:
        case AtomicType::TYPE_UINT32:
            cast = exprVal;
@@ -3780,11 +4259,15 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
            cast = ctx->ZExtInst(exprVal, targetType, "bool2int64");
            break;
        case AtomicType::TYPE_INT8:
        case AtomicType::TYPE_INT16:
        case AtomicType::TYPE_INT32:
-            cast = ctx->SExtInst(exprVal, targetType, "int32_to_int64");
+            cast = ctx->SExtInst(exprVal, targetType, "int_to_int64");
            break;
        case AtomicType::TYPE_UINT8:
        case AtomicType::TYPE_UINT16:
        case AtomicType::TYPE_UINT32:
-            cast = ctx->ZExtInst(exprVal, targetType, "uint32_to_int64");
+            cast = ctx->ZExtInst(exprVal, targetType, "uint_to_int64");
            break;
        case AtomicType::TYPE_FLOAT:
            cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
@@ -3796,7 +4279,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
            break;
        case AtomicType::TYPE_DOUBLE:
            cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
-                                 exprVal, targetType, "double2int");
+                                 exprVal, targetType, "double2int64");
            break;
        default:
            FATAL("unimplemented");
@@ -3814,11 +4297,15 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
            cast = ctx->ZExtInst(exprVal, targetType, "bool2uint");
            break;
        case AtomicType::TYPE_INT8:
        case AtomicType::TYPE_INT16:
        case AtomicType::TYPE_INT32:
-            cast = ctx->SExtInst(exprVal, targetType, "int32_to_uint64");
+            cast = ctx->SExtInst(exprVal, targetType, "int_to_uint64");
            break;
        case AtomicType::TYPE_UINT8:
        case AtomicType::TYPE_UINT16:
        case AtomicType::TYPE_UINT32:
-            cast = ctx->ZExtInst(exprVal, targetType, "uint32_to_uint64");
+            cast = ctx->ZExtInst(exprVal, targetType, "uint_to_uint64");
            break;
        case AtomicType::TYPE_FLOAT:
            if (fromType->IsVaryingType())
@@ -3848,6 +4335,22 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
        case AtomicType::TYPE_BOOL:
            cast = exprVal;
            break;
        case AtomicType::TYPE_INT8:
        case AtomicType::TYPE_UINT8: {
            llvm::Value *zero = fromType->IsUniformType() ? (llvm::Value *)LLVMInt8(0) : 
                (llvm::Value *)LLVMInt8Vector((int8_t)0);
            cast = ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE,
                                exprVal, zero, "cmpi0");
            break;
        }
        case AtomicType::TYPE_INT16:
        case AtomicType::TYPE_UINT16: {
            llvm::Value *zero = fromType->IsUniformType() ? (llvm::Value *)LLVMInt16(0) : 
                (llvm::Value *)LLVMInt16Vector((int16_t)0);
            cast = ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE,
                                exprVal, zero, "cmpi0");
            break;
        }
        case AtomicType::TYPE_INT32:
        case AtomicType::TYPE_UINT32: {
            llvm::Value *zero = fromType->IsUniformType() ? (llvm::Value *)LLVMInt32(0) : 
@@ -4195,6 +4698,26 @@ TypeCastExpr::Optimize() {
        constExpr->AsBool(bv, forceVarying);
        return new ConstExpr(toType, bv, pos);
    }
    case AtomicType::TYPE_INT8: {
        int8_t iv[ISPC_MAX_NVEC];
        constExpr->AsInt8(iv, forceVarying);
        return new ConstExpr(toType, iv, pos);
    }
    case AtomicType::TYPE_UINT8: {
        uint8_t uv[ISPC_MAX_NVEC];
        constExpr->AsUInt8(uv, forceVarying);
        return new ConstExpr(toType, uv, pos);
    }
    case AtomicType::TYPE_INT16: {
        int16_t iv[ISPC_MAX_NVEC];
        constExpr->AsInt16(iv, forceVarying);
        return new ConstExpr(toType, iv, pos);
    }
    case AtomicType::TYPE_UINT16: {
        uint16_t uv[ISPC_MAX_NVEC];
        constExpr->AsUInt16(uv, forceVarying);
        return new ConstExpr(toType, uv, pos);
    }
    case AtomicType::TYPE_INT32: {
        int32_t iv[ISPC_MAX_NVEC];
        constExpr->AsInt32(iv, forceVarying);
--- a/expr.h
+++ b/expr.h
@@ -325,6 +325,24 @@ private:
 */
 class ConstExpr : public Expr {
 public:
    /** Create a ConstExpr from a uniform int8 value */
    ConstExpr(const Type *t, int8_t i, SourcePos p);
    /** Create a ConstExpr from a varying int8 value */
    ConstExpr(const Type *t, int8_t *i, SourcePos p);
    /** Create a ConstExpr from a uniform uint8 value */
    ConstExpr(const Type *t, uint8_t u, SourcePos p);
    /** Create a ConstExpr from a varying uint8 value */
    ConstExpr(const Type *t, uint8_t *u, SourcePos p);
    /** Create a ConstExpr from a uniform int16 value */
    ConstExpr(const Type *t, int16_t i, SourcePos p);
    /** Create a ConstExpr from a varying int16 value */
    ConstExpr(const Type *t, int16_t *i, SourcePos p);
    /** Create a ConstExpr from a uniform uint16 value */
    ConstExpr(const Type *t, uint16_t u, SourcePos p);
    /** Create a ConstExpr from a varying uint16 value */
    ConstExpr(const Type *t, uint16_t *u, SourcePos p);
    /** Create a ConstExpr from a uniform int32 value */
    ConstExpr(const Type *t, int32_t i, SourcePos p);
    /** Create a ConstExpr from a varying int32 value */
@@ -333,14 +351,17 @@ public:
    ConstExpr(const Type *t, uint32_t u, SourcePos p);
    /** Create a ConstExpr from a varying uint32 value */
    ConstExpr(const Type *t, uint32_t *u, SourcePos p);
    /** Create a ConstExpr from a uniform float value */
    ConstExpr(const Type *t, float f, SourcePos p);
    /** Create a ConstExpr from a varying float value */
    ConstExpr(const Type *t, float *f, SourcePos p);
    /** Create a ConstExpr from a uniform double value */
    ConstExpr(const Type *t, double d, SourcePos p);
    /** Create a ConstExpr from a varying double value */
    ConstExpr(const Type *t, double *d, SourcePos p);
    /** Create a ConstExpr from a uniform int64 value */
    ConstExpr(const Type *t, int64_t i, SourcePos p);
    /** Create a ConstExpr from a varying int64 value */
@@ -349,10 +370,12 @@ public:
    ConstExpr(const Type *t, uint64_t i, SourcePos p);
    /** Create a ConstExpr from a varying uint64 value */
    ConstExpr(const Type *t, uint64_t *i, SourcePos p);
    /** Create a ConstExpr from a uniform bool value */
    ConstExpr(const Type *t, bool b, SourcePos p);
    /** Create a ConstExpr from a varying bool value */
    ConstExpr(const Type *t, bool *b, SourcePos p);
    /** Create a ConstExpr of the same type as the given old ConstExpr,
        with values given by the "values" parameter. */
    ConstExpr(ConstExpr *old, double *values);
@@ -371,6 +394,30 @@ public:
        equal to the target vector width into the given pointer. */
    int AsBool(bool *, bool forceVarying = false) const;
    /** Return the ConstExpr's values as int8s, doing type conversion
        from the actual type if needed.  If forceVarying is true, then type
        convert to 'varying' so as to always return a number of values
        equal to the target vector width into the given pointer. */
    int AsInt8(int8_t *, bool forceVarying = false) const;
    /** Return the ConstExpr's values as uint8s, doing type conversion
        from the actual type if needed.  If forceVarying is true, then type
        convert to 'varying' so as to always return a number of values
        equal to the target vector width into the given pointer. */
    int AsUInt8(uint8_t *, bool forceVarying = false) const;
    /** Return the ConstExpr's values as int16s, doing type conversion
        from the actual type if needed.  If forceVarying is true, then type
        convert to 'varying' so as to always return a number of values
        equal to the target vector width into the given pointer. */
    int AsInt16(int16_t *, bool forceVarying = false) const;
    /** Return the ConstExpr's values as uint16s, doing type conversion
        from the actual type if needed.  If forceVarying is true, then type
        convert to 'varying' so as to always return a number of values
        equal to the target vector width into the given pointer. */
    int AsUInt16(uint16_t *, bool forceVarying = false) const;
    /** Return the ConstExpr's values as int32s, doing type conversion
        from the actual type if needed.  If forceVarying is true, then type
        convert to 'varying' so as to always return a number of values
@@ -417,6 +464,10 @@ private:
    const Type *type;
    union {
        int8_t int8Val[ISPC_MAX_NVEC];
        uint8_t uint8Val[ISPC_MAX_NVEC];
        int16_t int16Val[ISPC_MAX_NVEC];
        uint16_t uint16Val[ISPC_MAX_NVEC];
        int32_t int32Val[ISPC_MAX_NVEC];
        uint32_t uint32Val[ISPC_MAX_NVEC];
        bool boolVal[ISPC_MAX_NVEC];
--- a/failing_tests/shuffle2-10.ispc
+++ b/failing_tests/shuffle2-10.ispc
@@ -0,0 +1,16 @@
 /* failing due to llvm bug http://llvm.org/bugs/show_bug.cgi?id=10421 */
 // Reports programCount to the caller.
 export uniform int width() { return programCount; }
 // Test body: each program instance converts its aFOO element to int8,
 // forms a second int8 value offset by programCount, and stores the
 // two-operand shuffle() of the pair into its slot of RET.
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    int8 aa = aFOO[programIndex]; 
    int8 bb = aa + programCount;
    // The shuffle index is computed from the lane number and the uniform b.
    int8 shuf = shuffle(aa, bb, 2*programIndex+(int)b-5);
 //CO    print("%\n%\n%\n%\n", aa, bb, 2*programIndex+(int)b-5, shuf);
    RET[programIndex] = shuf;
 }
 // Writes 1 + 2*programIndex into each program instance's slot of RET.
 // NOTE(review): presumably the expected output that the test harness
 // compares against f_fu's RET -- confirm against the test runner.
 export void result(uniform float RET[]) {
    RET[programIndex] = 1 + 2*programIndex;
 }
--- a/ispc_test.cpp
+++ b/ispc_test.cpp
@@ -158,38 +158,40 @@ static bool lRunTest(const char *fn) {
    }
    llvm::Function *func;
-    if ((func = module->getFunction("ISPCLaunch")) != NULL)
+#define DO_FUNC(FUNC ,FUNCNAME)                           \
-        ee->addGlobalMapping(func, (void *)ISPCLaunch);
+    if ((func = module->getFunction(FUNCNAME)) != NULL)   \
-    if ((func = module->getFunction("ISPCSync")) != NULL)
+        ee->addGlobalMapping(func, (void *)FUNC)
-        ee->addGlobalMapping(func, (void *)ISPCSync);
+    DO_FUNC(ISPCLaunch, "ISPCLaunch");
    DO_FUNC(ISPCSync, "ISPCSync");
 #ifdef ISPC_IS_WINDOWS
-    if ((func = module->getFunction("ISPCMalloc")) != NULL)
+    DO_FUNC(ISPCMalloc, "ISPCMalloc");
-        ee->addGlobalMapping(func, (void *)ISPCMalloc);
+    DO_FUNC(ISPCFree, "ISPCFree");
    if ((func = module->getFunction("ISPCFree")) != NULL)
        ee->addGlobalMapping(func, (void *)ISPCFree);
 #endif // ISPC_IS_WINDOWS
-    if ((func = module->getFunction("putchar")) != NULL)
+    DO_FUNC(putchar, "putchar");
-        ee->addGlobalMapping(func, (void *)putchar);
+    DO_FUNC(printf, "printf");
-    if ((func = module->getFunction("printf")) != NULL)
+    DO_FUNC(fflush, "fflush");
-        ee->addGlobalMapping(func, (void *)printf);
+    DO_FUNC(sinf, "sinf");
-    if ((func = module->getFunction("fflush")) != NULL)
+    DO_FUNC(cosf, "cosf");
-        ee->addGlobalMapping(func, (void *)fflush);
+    DO_FUNC(tanf, "tanf");
-    if ((func = module->getFunction("sinf")) != NULL)
+    DO_FUNC(atanf, "atanf");
-        ee->addGlobalMapping(func, (void *)sinf);
+    DO_FUNC(atan2f, "atan2f");
-    if ((func = module->getFunction("cosf")) != NULL)
+    DO_FUNC(powf, "powf");
-        ee->addGlobalMapping(func, (void *)cosf);
+    DO_FUNC(expf, "expf");
-    if ((func = module->getFunction("tanf")) != NULL)
+    DO_FUNC(logf, "logf");
-        ee->addGlobalMapping(func, (void *)tanf);
+    DO_FUNC(sin, "sin");
-    if ((func = module->getFunction("atanf")) != NULL)
+    DO_FUNC(cos, "cos");
-        ee->addGlobalMapping(func, (void *)atanf);
+    DO_FUNC(tan, "tan");
-    if ((func = module->getFunction("atan2f")) != NULL)
+    DO_FUNC(atan, "atan");
-        ee->addGlobalMapping(func, (void *)atan2f);
+    DO_FUNC(atan2, "atan2");
-    if ((func = module->getFunction("powf")) != NULL)
+    DO_FUNC(pow, "pow");
-        ee->addGlobalMapping(func, (void *)powf);
+    DO_FUNC(exp, "exp");
-    if ((func = module->getFunction("expf")) != NULL)
+    DO_FUNC(log, "log");
-        ee->addGlobalMapping(func, (void *)expf);
+    DO_FUNC(memset, "memset");
-    if ((func = module->getFunction("logf")) != NULL)
+#ifdef ISPC_IS_APPLE
-        ee->addGlobalMapping(func, (void *)logf);
+    DO_FUNC(memset_pattern4, "memset_pattern4");
    DO_FUNC(memset_pattern8, "memset_pattern8");
    DO_FUNC(memset_pattern16, "memset_pattern16");
 #endif
 #ifdef ISPC_HAVE_SVML
 #define DO_SVML(FUNC ,FUNCNAME)                           \
--- a/lex.ll
+++ b/lex.ll
@@ -104,6 +104,8 @@ goto { return TOKEN_GOTO; }
 if { return TOKEN_IF; }
 inline { return TOKEN_INLINE; }
 int { return TOKEN_INT; }
 int8 { return TOKEN_INT8; }
 int16 { return TOKEN_INT16; }
 int32 { return TOKEN_INT; }
 int64 { return TOKEN_INT64; }
 launch { return TOKEN_LAUNCH; }
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -41,28 +41,39 @@
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::VoidType = NULL;
 LLVM_TYPE_CONST llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::BoolType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8Type = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int16Type = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32Type = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32PointerType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64Type = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64PointerType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatPointerType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoubleType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8PointerType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int16PointerType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32PointerType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64PointerType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatPointerType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoublePointerType = NULL;
 LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::MaskType = NULL;
 LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::BoolVectorType = NULL;
 LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int1VectorType = NULL;
 LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int8VectorType = NULL;
 LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int16VectorType = NULL;
 LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int32VectorType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32VectorPointerType = NULL;
 LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int64VectorType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64VectorPointerType = NULL;
 LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::FloatVectorType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
 LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::DoubleVectorType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8VectorPointerType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int16VectorPointerType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32VectorPointerType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64VectorPointerType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoubleVectorPointerType = NULL;
 LLVM_TYPE_CONST llvm::ArrayType *LLVMTypes::VoidPointerVectorType = NULL;
 llvm::Constant *LLVMTrue = NULL;
@@ -75,16 +86,20 @@ void
 InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
    LLVMTypes::VoidType = llvm::Type::getVoidTy(*ctx);
    LLVMTypes::VoidPointerType = llvm::PointerType::get(llvm::Type::getInt8Ty(*ctx), 0);
    LLVMTypes::BoolType = llvm::Type::getInt1Ty(*ctx);
    LLVMTypes::Int8Type = llvm::Type::getInt8Ty(*ctx);
    LLVMTypes::Int16Type = llvm::Type::getInt16Ty(*ctx);
    LLVMTypes::Int32Type = llvm::Type::getInt32Ty(*ctx);
    LLVMTypes::Int32PointerType = llvm::PointerType::get(LLVMTypes::Int32Type, 0);
    LLVMTypes::Int64Type = llvm::Type::getInt64Ty(*ctx);
    LLVMTypes::Int64PointerType = llvm::PointerType::get(LLVMTypes::Int64Type, 0);
    LLVMTypes::FloatType = llvm::Type::getFloatTy(*ctx);
    LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0);
    LLVMTypes::DoubleType = llvm::Type::getDoubleTy(*ctx);
    LLVMTypes::Int8PointerType = llvm::PointerType::get(LLVMTypes::Int8Type, 0);
    LLVMTypes::Int16PointerType = llvm::PointerType::get(LLVMTypes::Int16Type, 0);
    LLVMTypes::Int32PointerType = llvm::PointerType::get(LLVMTypes::Int32Type, 0);
    LLVMTypes::Int64PointerType = llvm::PointerType::get(LLVMTypes::Int64Type, 0);
    LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0);
    LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0);
    // Note that both the mask and bool vectors are vector of int32s
@@ -95,18 +110,26 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
    LLVMTypes::Int1VectorType = 
        llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.vectorWidth);
    LLVMTypes::Int8VectorType = 
        llvm::VectorType::get(LLVMTypes::Int8Type, target.vectorWidth);
    LLVMTypes::Int16VectorType = 
        llvm::VectorType::get(LLVMTypes::Int16Type, target.vectorWidth);
    LLVMTypes::Int32VectorType = 
        llvm::VectorType::get(LLVMTypes::Int32Type, target.vectorWidth);
    LLVMTypes::Int32VectorPointerType = llvm::PointerType::get(LLVMTypes::Int32VectorType, 0);
    LLVMTypes::Int64VectorType = 
        llvm::VectorType::get(LLVMTypes::Int64Type, target.vectorWidth);
    LLVMTypes::Int64VectorPointerType = llvm::PointerType::get(LLVMTypes::Int64VectorType, 0);
    LLVMTypes::FloatVectorType = 
        llvm::VectorType::get(LLVMTypes::FloatType, target.vectorWidth);
    LLVMTypes::FloatVectorPointerType = llvm::PointerType::get(LLVMTypes::FloatVectorType, 0);
    LLVMTypes::DoubleVectorType = 
        llvm::VectorType::get(LLVMTypes::DoubleType, target.vectorWidth);
    LLVMTypes::Int8VectorPointerType = llvm::PointerType::get(LLVMTypes::Int8VectorType, 0);
    LLVMTypes::Int16VectorPointerType = llvm::PointerType::get(LLVMTypes::Int16VectorType, 0);
    LLVMTypes::Int32VectorPointerType = llvm::PointerType::get(LLVMTypes::Int32VectorType, 0);
    LLVMTypes::Int64VectorPointerType = llvm::PointerType::get(LLVMTypes::Int64VectorType, 0);
    LLVMTypes::FloatVectorPointerType = llvm::PointerType::get(LLVMTypes::FloatVectorType, 0);
    LLVMTypes::DoubleVectorPointerType = llvm::PointerType::get(LLVMTypes::DoubleVectorType, 0);
    LLVMTypes::VoidPointerVectorType = 
        llvm::ArrayType::get(LLVMTypes::VoidPointerType, target.vectorWidth);
@@ -133,7 +156,36 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
 }
-llvm::ConstantInt *LLVMInt32(int32_t ival) {
+llvm::ConstantInt *
 LLVMInt8(int8_t ival) {
    // Builds the i8 constant for ival using the integer type from the
    // global context g->ctx; the last argument flags ival as signed.
    return llvm::ConstantInt::get(llvm::Type::getInt8Ty(*g->ctx), ival,
                                  true /*signed*/);
 }
 /** Build the i8 LLVM constant for the unsigned 8-bit value `ival`,
     using the integer type obtained from the global context g->ctx. */
 llvm::ConstantInt *
 LLVMUInt8(uint8_t ival) {
    const bool treatAsSigned = false;
    return llvm::ConstantInt::get(llvm::Type::getInt8Ty(*g->ctx), ival,
                                  treatAsSigned);
 }
 /** Build the i16 LLVM constant for the signed 16-bit value `ival`,
     using the integer type obtained from the global context g->ctx. */
 llvm::ConstantInt *
 LLVMInt16(int16_t ival) {
    const bool treatAsSigned = true;
    return llvm::ConstantInt::get(llvm::Type::getInt16Ty(*g->ctx), ival,
                                  treatAsSigned);
 }
 /** Build the i16 LLVM constant for the unsigned 16-bit value `ival`,
     using the integer type obtained from the global context g->ctx. */
 llvm::ConstantInt *
 LLVMUInt16(uint16_t ival) {
    const bool treatAsSigned = false;
    return llvm::ConstantInt::get(llvm::Type::getInt16Ty(*g->ctx), ival,
                                  treatAsSigned);
 }
 /** Build the i32 LLVM constant for the signed 32-bit value `ival`,
     using the integer type obtained from the global context g->ctx. */
 llvm::ConstantInt *
 LLVMInt32(int32_t ival) {
    const bool treatAsSigned = true;
    return llvm::ConstantInt::get(llvm::Type::getInt32Ty(*g->ctx), ival,
                                  treatAsSigned);
 }
@@ -172,6 +224,82 @@ LLVMDouble(double dval) {
 }
 /** Build a constant vector whose g->target.vectorWidth lanes all hold
     the signed 8-bit value `ival`. */
 llvm::Constant *
 LLVMInt8Vector(int8_t ival) {
    // Create the scalar constant once and reuse it for every lane.
    llvm::Constant *splat = LLVMInt8(ival);
    const int width = g->target.vectorWidth;
    std::vector<llvm::Constant *> lanes;
    for (int lane = 0; lane < width; ++lane)
        lanes.push_back(splat);
    return llvm::ConstantVector::get(lanes);
 }
 /** Build a constant vector from the first g->target.vectorWidth signed
     8-bit values of `ivec`, one per lane. */
 llvm::Constant *
 LLVMInt8Vector(const int8_t *ivec) {
    const int width = g->target.vectorWidth;
    std::vector<llvm::Constant *> lanes;
    for (int lane = 0; lane < width; ++lane) {
        llvm::Constant *elt = LLVMInt8(ivec[lane]);
        lanes.push_back(elt);
    }
    return llvm::ConstantVector::get(lanes);
 }
 /** Build a constant vector whose g->target.vectorWidth lanes all hold
     the unsigned 8-bit value `ival`. */
 llvm::Constant *
 LLVMUInt8Vector(uint8_t ival) {
    // Create the scalar constant once and reuse it for every lane.
    llvm::Constant *splat = LLVMUInt8(ival);
    const int width = g->target.vectorWidth;
    std::vector<llvm::Constant *> lanes;
    for (int lane = 0; lane < width; ++lane)
        lanes.push_back(splat);
    return llvm::ConstantVector::get(lanes);
 }
 /** Build a constant vector from the first g->target.vectorWidth unsigned
     8-bit values of `ivec`, one per lane. */
 llvm::Constant *
 LLVMUInt8Vector(const uint8_t *ivec) {
    const int width = g->target.vectorWidth;
    std::vector<llvm::Constant *> lanes;
    for (int lane = 0; lane < width; ++lane) {
        llvm::Constant *elt = LLVMUInt8(ivec[lane]);
        lanes.push_back(elt);
    }
    return llvm::ConstantVector::get(lanes);
 }
 /** Build a constant vector whose g->target.vectorWidth lanes all hold
     the signed 16-bit value `ival`. */
 llvm::Constant *
 LLVMInt16Vector(int16_t ival) {
    // Create the scalar constant once and reuse it for every lane.
    llvm::Constant *splat = LLVMInt16(ival);
    const int width = g->target.vectorWidth;
    std::vector<llvm::Constant *> lanes;
    for (int lane = 0; lane < width; ++lane)
        lanes.push_back(splat);
    return llvm::ConstantVector::get(lanes);
 }
 /** Build a constant vector from the first g->target.vectorWidth signed
     16-bit values of `ivec`, one per lane. */
 llvm::Constant *
 LLVMInt16Vector(const int16_t *ivec) {
    const int width = g->target.vectorWidth;
    std::vector<llvm::Constant *> lanes;
    for (int lane = 0; lane < width; ++lane) {
        llvm::Constant *elt = LLVMInt16(ivec[lane]);
        lanes.push_back(elt);
    }
    return llvm::ConstantVector::get(lanes);
 }
 /** Build a constant vector whose g->target.vectorWidth lanes all hold
     the unsigned 16-bit value `ival`. */
 llvm::Constant *
 LLVMUInt16Vector(uint16_t ival) {
    // Create the scalar constant once and reuse it for every lane.
    llvm::Constant *splat = LLVMUInt16(ival);
    const int width = g->target.vectorWidth;
    std::vector<llvm::Constant *> lanes;
    for (int lane = 0; lane < width; ++lane)
        lanes.push_back(splat);
    return llvm::ConstantVector::get(lanes);
 }
 /** Build a constant vector from the first g->target.vectorWidth unsigned
     16-bit values of `ivec`, one per lane. */
 llvm::Constant *
 LLVMUInt16Vector(const uint16_t *ivec) {
    const int width = g->target.vectorWidth;
    std::vector<llvm::Constant *> lanes;
    for (int lane = 0; lane < width; ++lane) {
        llvm::Constant *elt = LLVMUInt16(ivec[lane]);
        lanes.push_back(elt);
    }
    return llvm::ConstantVector::get(lanes);
 }
 llvm::Constant *
 LLVMInt32Vector(int32_t ival) {
    llvm::Constant *v = LLVMInt32(ival);
--- a/llvmutil.h
+++ b/llvmutil.h
@@ -53,28 +53,39 @@ struct LLVMTypes {
    static LLVM_TYPE_CONST llvm::Type *VoidType;
    static LLVM_TYPE_CONST llvm::PointerType *VoidPointerType;
    static LLVM_TYPE_CONST llvm::Type *BoolType;
    static LLVM_TYPE_CONST llvm::Type *Int8Type;
    static LLVM_TYPE_CONST llvm::Type *Int16Type;
    static LLVM_TYPE_CONST llvm::Type *Int32Type;
    static LLVM_TYPE_CONST llvm::Type *Int32PointerType;
    static LLVM_TYPE_CONST llvm::Type *Int64Type;
    static LLVM_TYPE_CONST llvm::Type *Int64PointerType;
    static LLVM_TYPE_CONST llvm::Type *FloatType;
    static LLVM_TYPE_CONST llvm::Type *FloatPointerType;
    static LLVM_TYPE_CONST llvm::Type *DoubleType;
    static LLVM_TYPE_CONST llvm::Type *Int8PointerType;
    static LLVM_TYPE_CONST llvm::Type *Int16PointerType;
    static LLVM_TYPE_CONST llvm::Type *Int32PointerType;
    static LLVM_TYPE_CONST llvm::Type *Int64PointerType;
    static LLVM_TYPE_CONST llvm::Type *FloatPointerType;
    static LLVM_TYPE_CONST llvm::Type *DoublePointerType;
    static LLVM_TYPE_CONST llvm::VectorType *MaskType;
    static LLVM_TYPE_CONST llvm::VectorType *BoolVectorType;
    static LLVM_TYPE_CONST llvm::VectorType *Int1VectorType;
    static LLVM_TYPE_CONST llvm::VectorType *Int8VectorType;
    static LLVM_TYPE_CONST llvm::VectorType *Int16VectorType;
    static LLVM_TYPE_CONST llvm::VectorType *Int32VectorType;
    static LLVM_TYPE_CONST llvm::Type *Int32VectorPointerType;
    static LLVM_TYPE_CONST llvm::VectorType *Int64VectorType;
    static LLVM_TYPE_CONST llvm::Type *Int64VectorPointerType;
    static LLVM_TYPE_CONST llvm::VectorType *FloatVectorType;
    static LLVM_TYPE_CONST llvm::Type *FloatVectorPointerType;
    static LLVM_TYPE_CONST llvm::VectorType *DoubleVectorType;
    static LLVM_TYPE_CONST llvm::Type *Int8VectorPointerType;
    static LLVM_TYPE_CONST llvm::Type *Int16VectorPointerType;
    static LLVM_TYPE_CONST llvm::Type *Int32VectorPointerType;
    static LLVM_TYPE_CONST llvm::Type *Int64VectorPointerType;
    static LLVM_TYPE_CONST llvm::Type *FloatVectorPointerType;
    static LLVM_TYPE_CONST llvm::Type *DoubleVectorPointerType;
    static LLVM_TYPE_CONST llvm::ArrayType *VoidPointerVectorType;
 };
@@ -89,6 +100,14 @@ extern llvm::Constant *LLVMTrue, *LLVMFalse;
 */
 extern void InitLLVMUtil(llvm::LLVMContext *ctx, Target target);
 /** Returns an LLVM i8 constant holding the given signed 8-bit value */
 extern llvm::ConstantInt *LLVMInt8(int8_t i);
 /** Returns an LLVM i8 constant holding the given unsigned 8-bit value */
 extern llvm::ConstantInt *LLVMUInt8(uint8_t i);
 /** Returns an LLVM i16 constant holding the given signed 16-bit value */
 extern llvm::ConstantInt *LLVMInt16(int16_t i);
 /** Returns an LLVM i16 constant holding the given unsigned 16-bit value */
 extern llvm::ConstantInt *LLVMUInt16(uint16_t i);
 /** Returns an LLVM i32 constant holding the given signed 32-bit value */
 extern llvm::ConstantInt *LLVMInt32(int32_t i);
 /** Returns an LLVM i32 constant of the given value */
@@ -105,18 +124,35 @@ extern llvm::Constant *LLVMDouble(double f);
 /** Returns an LLVM boolean vector constant of the given value smeared
    across all elements */
 extern llvm::Constant *LLVMBoolVector(bool v);
 /** Returns an LLVM i8 vector constant of the given signed 8-bit value
    smeared across all elements */
 extern llvm::Constant *LLVMInt8Vector(int8_t i);
 /** Returns an LLVM i8 vector constant of the given unsigned 8-bit value
    smeared across all elements */
 extern llvm::Constant *LLVMUInt8Vector(uint8_t i);
 /** Returns an LLVM i16 vector constant of the given signed 16-bit value
    smeared across all elements */
 extern llvm::Constant *LLVMInt16Vector(int16_t i);
 /** Returns an LLVM i16 vector constant of the given unsigned 16-bit value
    smeared across all elements */
 extern llvm::Constant *LLVMUInt16Vector(uint16_t i);
 /** Returns an LLVM i32 vector constant of the given signed 32-bit value
    smeared across all elements */
 extern llvm::Constant *LLVMInt32Vector(int32_t i);
 /** Returns an LLVM i32 vector constant of the given unsigned 32-bit value
    smeared across all elements */
 extern llvm::Constant *LLVMUInt32Vector(uint32_t i);
 /** Returns an LLVM i64 vector constant of the given signed 64-bit value
    smeared across all elements */
 extern llvm::Constant *LLVMInt64Vector(int64_t i);
 /** Returns an LLVM i64 vector constant of the given unsigned 64-bit value
    smeared across all elements */
 extern llvm::Constant *LLVMUInt64Vector(uint64_t i);
 /** Returns an LLVM float vector constant of the given value smeared
    across all elements */
 extern llvm::Constant *LLVMFloatVector(float f);
@@ -127,18 +163,35 @@ extern llvm::Constant *LLVMDoubleVector(double f);
 /** Returns an LLVM boolean vector based on the given array of values.
    The array should have g->target.vectorWidth elements. */
 extern llvm::Constant *LLVMBoolVector(const bool *v);
 /** Returns an LLVM i8 vector based on the given array of signed 8-bit
    values.  The array should have g->target.vectorWidth elements. */
 extern llvm::Constant *LLVMInt8Vector(const int8_t *i);
 /** Returns an LLVM i8 vector based on the given array of unsigned 8-bit
    values.  The array should have g->target.vectorWidth elements. */
 extern llvm::Constant *LLVMUInt8Vector(const uint8_t *i);
 /** Returns an LLVM i16 vector based on the given array of signed 16-bit
    values.  The array should have g->target.vectorWidth elements. */
 extern llvm::Constant *LLVMInt16Vector(const int16_t *i);
 /** Returns an LLVM i16 vector based on the given array of unsigned 16-bit
    values.  The array should have g->target.vectorWidth elements. */
 extern llvm::Constant *LLVMUInt16Vector(const uint16_t *i);
 /** Returns an LLVM i32 vector based on the given array of signed 32-bit
    values.  The array should have g->target.vectorWidth elements. */
 extern llvm::Constant *LLVMInt32Vector(const int32_t *i);
 /** Returns an LLVM i32 vector based on the given array of unsigned 32-bit
    values.  The array should have g->target.vectorWidth elements. */
 extern llvm::Constant *LLVMUInt32Vector(const uint32_t *i);
 /** Returns an LLVM i64 vector based on the given array of signed 64-bit
    values.  The array should have g->target.vectorWidth elements. */
 extern llvm::Constant *LLVMInt64Vector(const int64_t *i);
 /** Returns an LLVM i64 vector based on the given array of unsigned 64-bit
    values.  The array should have g->target.vectorWidth elements. */
 extern llvm::Constant *LLVMUInt64Vector(const uint64_t *i);
 /** Returns an LLVM float vector based on the given array of values.
    The array should have g->target.vectorWidth elements. */
 extern llvm::Constant *LLVMFloatVector(const float *f);
--- a/opt.cpp
+++ b/opt.cpp
@@ -409,7 +409,6 @@ IntrinsicsOpt::IntrinsicsOpt()
        llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse_movmsk_ps);
    maskInstructions.push_back(sseMovmsk);
    maskInstructions.push_back(m->module->getFunction("llvm.x86.avx.movmsk.ps"));
    maskInstructions.push_back(m->module->getFunction("llvm.x86.mic.mask16.to.int"));
    maskInstructions.push_back(m->module->getFunction("__movmsk"));
    // And all of the blend instructions
@@ -418,8 +417,6 @@ IntrinsicsOpt::IntrinsicsOpt()
        0xf, 0, 1, 2));
    blendInstructions.push_back(BlendInstruction(
        m->module->getFunction("llvm.x86.avx.blendvps"), 0xff, 0, 1, 2));
    blendInstructions.push_back(BlendInstruction(
        m->module->getFunction("llvm.x86.mic.blend.ps"), 0xffff, 1, 2, 0));
 }
@@ -499,8 +496,8 @@ bool
 IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
    bool modifiedAny = false;
 restart:
-    for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
+    for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
-        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
+        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
        if (!callInst)
            continue;
@@ -512,7 +509,8 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
            // If the values are the same, then no need to blend..
            if (v[0] == v[1]) {
-                llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, v[0]);
+                llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), 
                                           iter, v[0]);
                modifiedAny = true;
                goto restart;
            }
@@ -524,12 +522,14 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
            // otherwise the result is undefined and any value is fine,
            // ergo the defined one is an acceptable result.)
            if (lIsUndef(v[0])) {
-                llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, v[1]);
+                llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), 
                                           iter, v[1]);
                modifiedAny = true;
                goto restart;
            }
            if (lIsUndef(v[1])) {
-                llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, v[0]);
+                llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), 
                                           iter, v[0]);
                modifiedAny = true;
                goto restart;
            }
@@ -544,7 +544,8 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
                value = v[1];
            if (value != NULL) {
-                llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, value);
+                llvm::ReplaceInstWithValue(iter->getParent()->getInstList(), 
                                           iter, value);
                modifiedAny = true;
                goto restart;
            }
@@ -557,7 +558,8 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
                // with the corresponding integer mask from its elements
                // high bits.
                llvm::Value *value = LLVMInt32(mask);
-                llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, value);
+                llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
                                           iter, value);
                modifiedAny = true;
                goto restart;
            }
@@ -653,10 +655,18 @@ lSizeOfIfKnown(const llvm::Type *type, uint64_t *size) {
        *size = 1;
        return true;
    }
    if (type == LLVMTypes::Int8VectorType) {
        *size = g->target.vectorWidth * 1;
        return true;
    }
    else if (type == LLVMTypes::Int16Type) {
        *size = 2;
        return true;
    }
    if (type == LLVMTypes::Int16VectorType) {
        *size = g->target.vectorWidth * 2;
        return true;
    }
    else if (type == LLVMTypes::FloatType || type == LLVMTypes::Int32Type) {
        *size = 4;
        return true;
@@ -978,33 +988,53 @@ lGetPtrAndOffsets(llvm::Value *ptrs, llvm::Value **basePtr,
 }
 struct GSInfo {
    GSInfo(const char *pgFuncName, const char *pgboFuncName, bool ig, int es) 
        : isGather(ig), elementSize(es) {
        func = m->module->getFunction(pgFuncName);
        baseOffsetsFunc = m->module->getFunction(pgboFuncName);
    }
    llvm::Function *func;
    llvm::Function *baseOffsetsFunc;
    const bool isGather;
    const int elementSize;
 };
 bool
 GatherScatterFlattenOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
-    llvm::Function *gather32Func = m->module->getFunction("__pseudo_gather_32");
+    GSInfo gsFuncs[] = {
-    llvm::Function *gather64Func = m->module->getFunction("__pseudo_gather_64");
+        GSInfo("__pseudo_gather_8",  "__pseudo_gather_base_offsets_8",  true, 1),
-    llvm::Function *scatter32Func = m->module->getFunction("__pseudo_scatter_32");
+        GSInfo("__pseudo_gather_16", "__pseudo_gather_base_offsets_16", true, 2),
-    llvm::Function *scatter64Func = m->module->getFunction("__pseudo_scatter_64");
+        GSInfo("__pseudo_gather_32", "__pseudo_gather_base_offsets_32", true, 4),
-    assert(gather32Func && gather64Func && scatter32Func && scatter64Func);
+        GSInfo("__pseudo_gather_64", "__pseudo_gather_base_offsets_64", true, 8),
        GSInfo("__pseudo_scatter_8",  "__pseudo_scatter_base_offsets_8",  false, 1),
        GSInfo("__pseudo_scatter_16", "__pseudo_scatter_base_offsets_16", false, 2),
        GSInfo("__pseudo_scatter_32", "__pseudo_scatter_base_offsets_32", false, 4),
        GSInfo("__pseudo_scatter_64", "__pseudo_scatter_base_offsets_64", false, 8),
    };
    int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
    for (int i = 0; i < numGSFuncs; ++i)
        assert(gsFuncs[i].func != NULL && gsFuncs[i].baseOffsetsFunc != NULL);
    bool modifiedAny = false;
 restart:
    // Iterate through all of the instructions in the basic block.
-    for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
+    for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
-        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
+        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
        // If we don't have a call to one of the
        // __pseudo_{gather,scatter}_* functions, then just go on to the
        // next instruction.
-        if (!callInst ||
+        if (callInst == NULL)
-            (callInst->getCalledFunction() != gather32Func &&
+            continue;
-             callInst->getCalledFunction() != gather64Func &&
+        GSInfo *info = NULL;
-             callInst->getCalledFunction() != scatter32Func &&
+        for (int i = 0; i < numGSFuncs; ++i)
-             callInst->getCalledFunction() != scatter64Func))
+            if (callInst->getCalledFunction() == gsFuncs[i].func) {
                info = &gsFuncs[i];
                break;
            }
        if (info == NULL)
            continue;
        bool isGather = (callInst->getCalledFunction() == gather32Func ||
                         callInst->getCalledFunction() == gather64Func);
        bool is32 = (callInst->getCalledFunction() == gather32Func ||
                     callInst->getCalledFunction() == scatter32Func);
        // Transform the array of pointers to a single base pointer and an
        // array of int32 offsets.  (All the hard work is done by
@@ -1012,19 +1042,15 @@ GatherScatterFlattenOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
        llvm::Value *ptrs = callInst->getArgOperand(0);
        llvm::Value *basePtr = NULL;
        llvm::Value *offsetVector = lGetPtrAndOffsets(ptrs, &basePtr, callInst, 
-                                                      is32 ? 4 : 8);
+                                                      info->elementSize);
        // Cast the base pointer to a void *, since that's what the
        // __pseudo_*_base_offsets_* functions want.
-        basePtr = new llvm::BitCastInst(basePtr, LLVMTypes::VoidPointerType, "base2void", 
+        basePtr = new llvm::BitCastInst(basePtr, LLVMTypes::VoidPointerType,
-                                        callInst);
+                                        "base2void", callInst);
        lCopyMetadata(basePtr, callInst);
-        if (isGather) {
+        if (info->isGather) {
            llvm::Value *mask = callInst->getArgOperand(1);
            llvm::Function *gFunc = 
                m->module->getFunction(is32 ? "__pseudo_gather_base_offsets_32" :
                                              "__pseudo_gather_base_offsets_64");
            assert(gFunc != NULL);
            // Generate a new function call to the next pseudo gather
            // base+offsets instruction.  Note that we're passing a NULL
@@ -1035,11 +1061,12 @@ GatherScatterFlattenOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
            llvm::ArrayRef<llvm::Value *> newArgArray(&newArgs[0], &newArgs[3]);
            llvm::Instruction *newCall = 
-                llvm::CallInst::Create(gFunc, newArgArray, "newgather", 
+                llvm::CallInst::Create(info->baseOffsetsFunc, newArgArray,
-                                       (llvm::Instruction *)NULL);
+                                       "newgather", (llvm::Instruction *)NULL);
 #else
            llvm::Instruction *newCall = 
-                llvm::CallInst::Create(gFunc, &newArgs[0], &newArgs[3], "newgather");
+                llvm::CallInst::Create(info->baseOffsetsFunc, &newArgs[0], &newArgs[3],
                                       "newgather");
 #endif
            lCopyMetadata(newCall, callInst);
            llvm::ReplaceInstWithInst(callInst, newCall);
@@ -1047,10 +1074,6 @@ GatherScatterFlattenOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
        else {
            llvm::Value *mask = callInst->getArgOperand(2);
            llvm::Value *rvalue = callInst->getArgOperand(1);
            llvm::Function *gFunc = 
                m->module->getFunction(is32 ? "__pseudo_scatter_base_offsets_32" :
                                              "__pseudo_scatter_base_offsets_64");
            assert(gFunc);
            // Generate a new function call to the next pseudo scatter
            // base+offsets instruction.  See above for why passing NULL
@@ -1059,11 +1082,12 @@ GatherScatterFlattenOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
            llvm::ArrayRef<llvm::Value *> newArgArray(&newArgs[0], &newArgs[4]);
            llvm::Instruction *newCall = 
-                llvm::CallInst::Create(gFunc, newArgArray, "", 
+                llvm::CallInst::Create(info->baseOffsetsFunc, newArgArray, "", 
                                       (llvm::Instruction *)NULL);
 #else
            llvm::Instruction *newCall = 
-                llvm::CallInst::Create(gFunc, &newArgs[0], &newArgs[4]);
+                llvm::CallInst::Create(info->baseOffsetsFunc, &newArgs[0], 
                                       &newArgs[4]);
 #endif
            lCopyMetadata(newCall, callInst);
            llvm::ReplaceInstWithInst(callInst, newCall);
@@ -1105,28 +1129,53 @@ char MaskedStoreOptPass::ID = 0;
 llvm::RegisterPass<MaskedStoreOptPass> mss("masked-store-scalarize",
                                           "Masked Store Scalarize Pass");
 struct MSInfo {
    MSInfo(const char *name, const int a) 
        : align(a) {
        func = m->module->getFunction(name);
        assert(func != NULL);
    }
    llvm::Function *func;
    const int align;
 };
 bool
 MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
-    llvm::Function *pms32Func = m->module->getFunction("__pseudo_masked_store_32");
+    MSInfo msInfo[] = {
-    llvm::Function *pms64Func = m->module->getFunction("__pseudo_masked_store_64");
+        MSInfo("__pseudo_masked_store_8",  1),
-    llvm::Function *msb32Func = m->module->getFunction("__masked_store_blend_32");
+        MSInfo("__pseudo_masked_store_16", 2),
-    llvm::Function *msb64Func = m->module->getFunction("__masked_store_blend_64");
+        MSInfo("__pseudo_masked_store_32", 4),
-    llvm::Function *ms32Func = m->module->getFunction("__masked_store_32");
+        MSInfo("__pseudo_masked_store_64", 8),
-    llvm::Function *ms64Func = m->module->getFunction("__masked_store_64");
+        MSInfo("__masked_store_blend_8",  1),
        MSInfo("__masked_store_blend_16", 2),
        MSInfo("__masked_store_blend_32", 4),
        MSInfo("__masked_store_blend_64", 8),
        MSInfo("__masked_store_8",  1),
        MSInfo("__masked_store_16", 2),
        MSInfo("__masked_store_32", 4),
        MSInfo("__masked_store_64", 8)
    };
    bool modifiedAny = false;
 restart:
    // Iterate over all of the instructions to look for one of the various
    // masked store functions
-    for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
+    for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
-        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
+        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
        if (!callInst)
            continue;
        llvm::Function *called = callInst->getCalledFunction();
-        if (called != pms32Func && called != pms64Func &&
+        int nMSFuncs = sizeof(msInfo) / sizeof(msInfo[0]);
-            called != msb32Func && called != msb64Func &&
+        MSInfo *info = NULL;
-            called != ms32Func  && called != ms64Func)
+        for (int i = 0; i < nMSFuncs; ++i) {
            if (called == msInfo[i].func) {
                info = &msInfo[i];
                break;
            }
        }
        if (info == NULL)
            continue;
        // Got one; grab the operands
@@ -1150,15 +1199,12 @@ MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
            LLVM_TYPE_CONST llvm::Type *rvalueType = rvalue->getType();
            LLVM_TYPE_CONST llvm::Type *ptrType = 
                llvm::PointerType::get(rvalueType, 0);
            // Need to update this when int8/int16 are added
            int align = (called == pms32Func || called == pms64Func ||
                         called == msb32Func) ? 4 : 8;
            lvalue = new llvm::BitCastInst(lvalue, ptrType, "lvalue_to_ptr_type", callInst);
            lCopyMetadata(lvalue, callInst);
            llvm::Instruction *store = 
                new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
-                                    align);
+                                    info->align);
            lCopyMetadata(store, callInst);
            llvm::ReplaceInstWithInst(callInst, store);
@@ -1180,9 +1226,9 @@ CreateMaskedStoreOptPass() {
 // LowerMaskedStorePass
 /** When the front-end needs to do a masked store, it emits a
-    __pseudo_masked_store_{32,64} call as a placeholder.  This pass lowers
+    __pseudo_masked_store_{8,16,32,64} call as a placeholder.  This pass
-    these calls to either __masked_store_{32,64} or
+    lowers these calls to either __masked_store_{8,16,32,64} or
-    __masked_store_blend_{32,64} calls.
+    __masked_store_blend_{8,16,32,64} calls.
  */
 class LowerMaskedStorePass : public llvm::BasicBlockPass {
 public:
@@ -1227,45 +1273,51 @@ lIsStackVariablePointer(llvm::Value *lvalue) {
 }
-/** Utilty routine to figure out which masked store function to use.  The
+struct LMSInfo {
-    blend parameter indicates if we want the blending version, is32
+    LMSInfo(const char *pname, const char *bname, const char *msname) {
-    indicates if the element size is 32 bits.
+        pseudoFunc = m->module->getFunction(pname);
- */
+        blendFunc = m->module->getFunction(bname);
-static const char *
+        maskedStoreFunc = m->module->getFunction(msname);
-lMaskedStoreName(bool blend, bool is32) {
+        assert(pseudoFunc != NULL && blendFunc != NULL && 
-    if (blend) {
+               maskedStoreFunc != NULL);
        if (is32)
            return "__masked_store_blend_32";
        else
            return "__masked_store_blend_64";
    }
-    else {
+    llvm::Function *pseudoFunc;
-        if (is32)
+    llvm::Function *blendFunc;
-            return "__masked_store_32";
+    llvm::Function *maskedStoreFunc;
-        else
+};
            return "__masked_store_64";
    }
 }
 bool
 LowerMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
-    llvm::Function *maskedStore32Func = m->module->getFunction("__pseudo_masked_store_32");
+    LMSInfo msInfo[] = {
-    llvm::Function *maskedStore64Func = m->module->getFunction("__pseudo_masked_store_64");
+        LMSInfo("__pseudo_masked_store_8", "__masked_store_blend_8", 
-    assert(maskedStore32Func && maskedStore64Func);
+                "__masked_store_8"),
        LMSInfo("__pseudo_masked_store_16", "__masked_store_blend_16", 
                "__masked_store_16"),
        LMSInfo("__pseudo_masked_store_32", "__masked_store_blend_32", 
                "__masked_store_32"),
        LMSInfo("__pseudo_masked_store_64", "__masked_store_blend_64", 
                "__masked_store_64")
    };
    bool modifiedAny = false;
 restart:
-    for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
+    for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
        // Iterate through all of the instructions and look for
        // __pseudo_masked_store_* calls.
-        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
+        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
-        if (!callInst ||
+        if (callInst == NULL)
-            (callInst->getCalledFunction() != maskedStore32Func &&
+            continue;
-             callInst->getCalledFunction() != maskedStore64Func))
+        LMSInfo *info = NULL;
        for (unsigned int i = 0; i < sizeof(msInfo) / sizeof(msInfo[0]); ++i) {
            if (callInst->getCalledFunction() == msInfo[i].pseudoFunc) {
                info = &msInfo[i];
                break;
            }
        }
        if (info == NULL)
            continue;
        bool is32 = (callInst->getCalledFunction() == maskedStore32Func);
        llvm::Value *lvalue = callInst->getArgOperand(0);
        llvm::Value *rvalue  = callInst->getArgOperand(1);
        llvm::Value *mask = callInst->getArgOperand(2);
@@ -1282,8 +1334,7 @@ LowerMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
        // Generate the call to the appropriate masked store function and
        // replace the __pseudo_* one with it.
-        llvm::Function *fms = m->module->getFunction(lMaskedStoreName(doBlend, is32));
+        llvm::Function *fms = doBlend ? info->blendFunc : info->maskedStoreFunc;
        assert(fms);
        llvm::Value *args[3] = { lvalue, rvalue, mask };
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
        llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[3]);
@@ -1872,37 +1923,94 @@ lVectorIsLinear(llvm::Value *v[ISPC_MAX_NVEC], int stride) {
 }
 struct GatherImpInfo {
    GatherImpInfo(const char *pName, const char *lbName, const char *lmName,
                  int a) 
        : align(a) {
        pseudoFunc = m->module->getFunction(pName);
        loadBroadcastFunc = m->module->getFunction(lbName);
        loadMaskedFunc = m->module->getFunction(lmName);
        assert(pseudoFunc != NULL && loadBroadcastFunc != NULL &&
               loadMaskedFunc != NULL);
    }
    llvm::Function *pseudoFunc;
    llvm::Function *loadBroadcastFunc;
    llvm::Function *loadMaskedFunc;
    const int align;
 };
 struct ScatterImpInfo {
    ScatterImpInfo(const char *pName, const char *msName, 
                   LLVM_TYPE_CONST llvm::Type *vpt, int a)
        : align(a) {
        pseudoFunc = m->module->getFunction(pName);
        maskedStoreFunc = m->module->getFunction(msName);
        vecPtrType = vpt;
        assert(pseudoFunc != NULL && maskedStoreFunc != NULL);
    }
    llvm::Function *pseudoFunc;
    llvm::Function *maskedStoreFunc;
    LLVM_TYPE_CONST llvm::Type *vecPtrType;
    const int align;
 };
 bool
 GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
-    llvm::Function *gather32Func = m->module->getFunction("__pseudo_gather_base_offsets_32");
+    GatherImpInfo gInfo[] = {
-    llvm::Function *gather64Func = m->module->getFunction("__pseudo_gather_base_offsets_64");
+        GatherImpInfo("__pseudo_gather_base_offsets_8", "__load_and_broadcast_8",
-    llvm::Function *scatter32Func = m->module->getFunction("__pseudo_scatter_base_offsets_32");
+                      "__load_masked_8", 1),
-    llvm::Function *scatter64Func = m->module->getFunction("__pseudo_scatter_base_offsets_64");
+        GatherImpInfo("__pseudo_gather_base_offsets_16", "__load_and_broadcast_16",
-    assert(gather32Func && gather64Func && scatter32Func && scatter64Func);
+                      "__load_masked_16", 2),
        GatherImpInfo("__pseudo_gather_base_offsets_32", "__load_and_broadcast_32",
                      "__load_masked_32", 4),
        GatherImpInfo("__pseudo_gather_base_offsets_64", "__load_and_broadcast_64",
                      "__load_masked_64", 8)
    };
    ScatterImpInfo sInfo[] = {
        ScatterImpInfo("__pseudo_scatter_base_offsets_8",  "__pseudo_masked_store_8", 
                       LLVMTypes::Int8VectorPointerType, 1),
        ScatterImpInfo("__pseudo_scatter_base_offsets_16", "__pseudo_masked_store_16",
                       LLVMTypes::Int16VectorPointerType, 2),
        ScatterImpInfo("__pseudo_scatter_base_offsets_32", "__pseudo_masked_store_32",
                       LLVMTypes::Int32VectorPointerType, 4),
        ScatterImpInfo("__pseudo_scatter_base_offsets_64", "__pseudo_masked_store_64",
                       LLVMTypes::Int64VectorPointerType, 8)
    };
    bool modifiedAny = false;
 restart:
-    for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
+    for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
        // Iterate over all of the instructions and look for calls to
        // __pseudo_*_base_offsets_* calls.
-        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
+        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
-        if (!callInst || 
+        if (callInst == NULL)
-            (callInst->getCalledFunction() != gather32Func &&
+            continue;
-             callInst->getCalledFunction() != gather64Func &&
+        llvm::Function *calledFunc = callInst->getCalledFunction();
-             callInst->getCalledFunction() != scatter32Func &&
+        GatherImpInfo *gatherInfo = NULL;
-             callInst->getCalledFunction() != scatter64Func))
+        ScatterImpInfo *scatterInfo = NULL;
        for (unsigned int i = 0; i < sizeof(gInfo) / sizeof(gInfo[0]); ++i) {
            if (calledFunc == gInfo[i].pseudoFunc) {
                gatherInfo = &gInfo[i];
                break;
            }
        }
        for (unsigned int i = 0; i < sizeof(sInfo) / sizeof(sInfo[0]); ++i) {
            if (calledFunc == sInfo[i].pseudoFunc) {
                scatterInfo = &sInfo[i];
                break;
            }
        }
        if (gatherInfo == NULL && scatterInfo == NULL)
            continue;
        SourcePos pos;
        bool ok = lGetSourcePosFromMetadata(callInst, &pos);
        assert(ok);     
        bool isGather = (callInst->getCalledFunction() == gather32Func ||
                         callInst->getCalledFunction() == gather64Func);
        bool is32 = (callInst->getCalledFunction() == gather32Func ||
                     callInst->getCalledFunction() == scatter32Func);
        // Get the actual base pointer; note that it comes into the gather
        // or scatter function bitcast to an i8 *, so we need to work back
        // to get the pointer as the original type.
@@ -1921,7 +2029,7 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
        if (!lScalarizeVector(callInst->getArgOperand(1), offsetElements))
            continue;
-        llvm::Value *mask = callInst->getArgOperand(isGather ? 2 : 3);
+        llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 2 : 3);
        if (lVectorValuesAllEqual(offsetElements)) {
            // If all the offsets are equal, then compute the single
@@ -1929,14 +2037,15 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
            // (arbitrarily).
            llvm::Value *indices[1] = { offsetElements[0] };
            llvm::Value *basei8 =
-                new llvm::BitCastInst(base, LLVMTypes::VoidPointerType, "base2i8", callInst);
+                new llvm::BitCastInst(base, LLVMTypes::VoidPointerType,
                                      "base2i8", callInst);
            lCopyMetadata(basei8, callInst);
            llvm::Value *ptr = 
                llvm::GetElementPtrInst::Create(basei8, &indices[0], &indices[1],
                                                "ptr", callInst);
            lCopyMetadata(ptr, callInst);
-            if (isGather) {
+            if (gatherInfo != NULL) {
                // A gather with everyone going to the same location is
                // handled as a scalar load and broadcast across the lanes.
                // Note that we do still have to pass the mask to the
@@ -1944,20 +2053,16 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
                // access memory if the mask is all off (the location may
                // be invalid in that case).
                Debug(pos, "Transformed gather to scalar load and broadcast!");
                llvm::Function *loadBroadcast = 
                    m->module->getFunction(is32 ? "__load_and_broadcast_32" :
                                                  "__load_and_broadcast_64");
                assert(loadBroadcast);
                llvm::Value *args[2] = { ptr, mask };
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
                llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[2]);
                llvm::Instruction *newCall = 
-                    llvm::CallInst::Create(loadBroadcast, newArgArray,
+                    llvm::CallInst::Create(gatherInfo->loadBroadcastFunc, newArgArray,
                                           "load_broadcast", (llvm::Instruction *)NULL);
 #else
                llvm::Instruction *newCall = 
-                    llvm::CallInst::Create(loadBroadcast, &args[0], &args[2],
+                    llvm::CallInst::Create(gatherInfo->loadBroadcastFunc, &args[0], 
-                                           "load_broadcast");
+                                           &args[2], "load_broadcast");
 #endif
                lCopyMetadata(newCall, callInst);
                llvm::ReplaceInstWithInst(callInst, newCall);
@@ -1977,8 +2082,8 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
                ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(first->getType(), 0),
                                            "ptr2rvalue_type", callInst);
                lCopyMetadata(ptr, callInst);
-                llvm::Instruction *sinst = 
+                llvm::Instruction *sinst = new llvm::StoreInst(first, ptr, false, 
-                    new llvm::StoreInst(first, ptr, false, is32 ? 4 : 8 /* align */);
+                                                               scatterInfo->align);
                lCopyMetadata(sinst, callInst);
                llvm::ReplaceInstWithInst(callInst, sinst);
            }
@@ -1987,7 +2092,8 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
            goto restart;
        }
-        if (lVectorIsLinear(offsetElements, is32 ? 4 : 8)) {
+        int step = gatherInfo ? gatherInfo->align : scatterInfo->align;
        if (lVectorIsLinear(offsetElements, step)) {
            // We have a linear sequence of memory locations being accessed
            // starting with the location given by the offset from
            // offsetElements[0], with stride of 4 or 8 bytes (for 32 bit
@@ -2003,53 +2109,38 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
                                                "ptr", callInst);
            lCopyMetadata(ptr, callInst);
-            if (isGather) {
+            if (gatherInfo != NULL) {
                Debug(pos, "Transformed gather to unaligned vector load!");
                // FIXME: make this an aligned load when possible..
                // FIXME: are there lurking potential bugs when e.g. the
                // last few entries of the mask are off and the load ends
                // up straddling a page boundary?
                llvm::Function *loadMasked = 
                    m->module->getFunction(is32 ? "__load_masked_32" : "__load_masked_64");
                assert(loadMasked);
                llvm::Value *args[2] = { ptr, mask };
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
                llvm::ArrayRef<llvm::Value *> argArray(&args[0], &args[2]);
                llvm::Instruction *newCall = 
-                    llvm::CallInst::Create(loadMasked, argArray, "load_masked",
+                    llvm::CallInst::Create(gatherInfo->loadMaskedFunc, argArray, 
-                                           (llvm::Instruction *)NULL);
+                                           "load_masked", (llvm::Instruction *)NULL);
 #else
                llvm::Instruction *newCall = 
-                    llvm::CallInst::Create(loadMasked, &args[0], &args[2], "load_masked");
+                    llvm::CallInst::Create(gatherInfo->loadMaskedFunc, &args[0],
                                           &args[2], "load_masked");
 #endif
                lCopyMetadata(newCall, callInst);
                llvm::ReplaceInstWithInst(callInst, newCall);
            }
            else {
                Debug(pos, "Transformed scatter to unaligned vector store!");
                // FIXME: make this an aligned store when possible.  Need
                // to work through the messiness of issuing a pseudo store
                // here.
                llvm::Value *rvalue = callInst->getArgOperand(2);
-
+                ptr = new llvm::BitCastInst(ptr, scatterInfo->vecPtrType, "ptrcast", 
-                llvm::Function *storeMasked = 
+                                            callInst);
                    m->module->getFunction(is32 ? "__pseudo_masked_store_32" :
                                                  "__pseudo_masked_store_64");
                assert(storeMasked);
                LLVM_TYPE_CONST llvm::Type *vecPtrType = is32 ?
                    LLVMTypes::Int32VectorPointerType : LLVMTypes::Int64VectorPointerType;
                ptr = new llvm::BitCastInst(ptr, vecPtrType, "ptrcast", callInst);
                llvm::Value *args[3] = { ptr, rvalue, mask };
 #if defined(LLVM_3_0) || defined(LLVM_3_0svn)
                llvm::ArrayRef<llvm::Value *> argArray(&args[0], &args[3]);
                llvm::Instruction *newCall = 
-                    llvm::CallInst::Create(storeMasked, argArray, "",
+                    llvm::CallInst::Create(scatterInfo->maskedStoreFunc, argArray,
-                                           (llvm::Instruction *)NULL);
+                                           "", (llvm::Instruction *)NULL);
 #else
                llvm::Instruction *newCall = 
-                    llvm::CallInst::Create(storeMasked, &args[0], &args[3], "");
+                    llvm::CallInst::Create(scatterInfo->maskedStoreFunc,
                                           &args[0], &args[3], "");
 #endif
                lCopyMetadata(newCall, callInst);
                llvm::ReplaceInstWithInst(callInst, newCall);
@@ -2097,31 +2188,50 @@ char LowerGSPass::ID = 0;
 llvm::RegisterPass<LowerGSPass> lgs("lower-gs",
                                    "Lower Gather/Scatter Pass");
 struct LowerGSInfo {
    LowerGSInfo(const char *pName, const char *aName, bool ig)
        : isGather(ig) {
        pseudoFunc = m->module->getFunction(pName);
        actualFunc = m->module->getFunction(aName);
        assert(pseudoFunc != NULL && actualFunc != NULL);
    }
    llvm::Function *pseudoFunc;
    llvm::Function *actualFunc;
    const bool isGather;
 };
 bool
 LowerGSPass::runOnBasicBlock(llvm::BasicBlock &bb) {
-    llvm::Function *gather32Func = m->module->getFunction("__pseudo_gather_base_offsets_32");
+    LowerGSInfo lgsInfo[] = {
+        // Pseudo base+offsets function -> actual implementation, one entry
+        // per element width (8/16/32/64 bits); gathers first, then scatters.
+        // The 64-bit rows must name the _64 pseudo functions and the _i64
+        // targets; otherwise 64-bit gathers/scatters are never lowered.
-    llvm::Function *gather64Func = m->module->getFunction("__pseudo_gather_base_offsets_64");
+        LowerGSInfo("__pseudo_gather_base_offsets_8",  "__gather_base_offsets_i8",  true),
-    llvm::Function *scatter32Func = m->module->getFunction("__pseudo_scatter_base_offsets_32");
+        LowerGSInfo("__pseudo_gather_base_offsets_16", "__gather_base_offsets_i16", true),
-    llvm::Function *scatter64Func = m->module->getFunction("__pseudo_scatter_base_offsets_64");
+        LowerGSInfo("__pseudo_gather_base_offsets_32", "__gather_base_offsets_i32", true),
-    assert(gather32Func && gather64Func && scatter32Func && scatter64Func);
+        LowerGSInfo("__pseudo_gather_base_offsets_64", "__gather_base_offsets_i64", true),
        LowerGSInfo("__pseudo_scatter_base_offsets_8",  "__scatter_base_offsets_i8",  false),
        LowerGSInfo("__pseudo_scatter_base_offsets_16", "__scatter_base_offsets_i16", false),
        LowerGSInfo("__pseudo_scatter_base_offsets_32", "__scatter_base_offsets_i32", false),
        LowerGSInfo("__pseudo_scatter_base_offsets_64", "__scatter_base_offsets_i64", false)
    };
    bool modifiedAny = false;
 restart:
-    for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
+    for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
        // Loop over the instructions and find calls to the
        // __pseudo_*_base_offsets_* functions.
-        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
+        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
-        if (!callInst || 
+        if (callInst == NULL)
-            (callInst->getCalledFunction() != gather32Func &&
+            continue;
-             callInst->getCalledFunction() != gather64Func &&
+        llvm::Function *calledFunc = callInst->getCalledFunction();
-             callInst->getCalledFunction() != scatter32Func &&
+        LowerGSInfo *info = NULL;
-             callInst->getCalledFunction() != scatter64Func))
+        for (unsigned int i = 0; i < sizeof(lgsInfo) / sizeof(lgsInfo[0]); ++i) {
            if (calledFunc == lgsInfo[i].pseudoFunc) {
                info = &lgsInfo[i];
                break;
            }
        }
        if (info == NULL)
            continue;
        bool isGather = (callInst->getCalledFunction() == gather32Func ||
                         callInst->getCalledFunction() == gather64Func);
        bool is32 = (callInst->getCalledFunction() == gather32Func ||
                     callInst->getCalledFunction() == scatter32Func);
        // Get the source position from the metadata attached to the call
        // instruction so that we can issue PerformanceWarning()s below.
@@ -2129,20 +2239,11 @@ LowerGSPass::runOnBasicBlock(llvm::BasicBlock &bb) {
        bool ok = lGetSourcePosFromMetadata(callInst, &pos);
        assert(ok);     
-        if (isGather) {
+        callInst->setCalledFunction(info->actualFunc);
-            llvm::Function *gFunc = m->module->getFunction(is32 ? "__gather_base_offsets_i32" :
+        if (info->isGather)
                                                                  "__gather_base_offsets_i64");
            assert(gFunc);
            callInst->setCalledFunction(gFunc);
            PerformanceWarning(pos, "Gather required to compute value in expression.");
-        }
+        else
        else {
            llvm::Function *sFunc = m->module->getFunction(is32 ? "__scatter_base_offsets_i32" :
                                                                  "__scatter_base_offsets_i64");
            assert(sFunc);
            callInst->setCalledFunction(sFunc);
            PerformanceWarning(pos, "Scatter required for storing value.");
        }
        modifiedAny = true;
        goto restart;
    }
@@ -2286,25 +2387,41 @@ char MakeInternalFuncsStaticPass::ID = 0;
 llvm::RegisterPass<MakeInternalFuncsStaticPass> 
  mifsp("make-internal-funcs-static", "Make Internal Funcs Static Pass");
 // Walks a fixed list of helper-function names and, for every one that is
 // found, switches its linkage to llvm::GlobalValue::PrivateLinkage.
 // Returns true only if at least one function was changed.
 // NOTE(review): the lookup goes through the global m->module rather than
 // the `module` argument -- presumably they are the same module; confirm.
 bool
 MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
    const char *names[] = {
-        "__do_print", "__gather_base_offsets_i32", "__gather_base_offsets_i64",
+        "__do_print",
-        "__gather_elt_32", "__gather_elt_64", "__load_and_broadcast_32", 
+        "__gather_base_offsets_i8", "__gather_base_offsets_i16",
-        "__load_and_broadcast_64", "__load_masked_32", "__load_masked_64",
+        "__gather_base_offsets_i32", "__gather_base_offsets_i64",
-        "__masked_store_32", "__masked_store_64", "__masked_store_blend_32",
+        "__gather_elt_8", "__gather_elt_16", 
-        "__masked_store_blend_64", "__packed_load_active", "__packed_store_active",
+        "__gather_elt_32", "__gather_elt_64", 
-        "__scatter_base_offsets_i32", "__scatter_base_offsets_i64", "__scatter_elt_32",
+        "__load_and_broadcast_8", "__load_and_broadcast_16",
-        "__scatter_elt_64", };
+        "__load_and_broadcast_32", "__load_and_broadcast_64",
        "__load_masked_8", "__load_masked_16",
        "__load_masked_32", "__load_masked_64",
        "__masked_store_8", "__masked_store_16",
        "__masked_store_32", "__masked_store_64",
        "__masked_store_blend_8", "__masked_store_blend_16",
        "__masked_store_blend_32", "__masked_store_blend_64",
        "__packed_load_active", "__packed_store_active",
        "__scatter_base_offsets_i8", "__scatter_base_offsets_i16",
        "__scatter_base_offsets_i32", "__scatter_base_offsets_i64",
        "__scatter_elt_8", "__scatter_elt_16", 
        "__scatter_elt_32", "__scatter_elt_64", 
    };
    bool modifiedAny = false;
    int count = sizeof(names) / sizeof(names[0]);
    // Names that are not present in the module (getFunction returns NULL)
    // are silently ignored.
    for (int i = 0; i < count; ++i) {
        llvm::Function *f = m->module->getFunction(names[i]);
-        if (f != NULL)
+        if (f != NULL) {
            f->setLinkage(llvm::GlobalValue::PrivateLinkage);
            modifiedAny = true;
        }
    }
-    return true;
+    return modifiedAny;
 }
--- a/parse.yy
+++ b/parse.yy
@@ -102,15 +102,16 @@ static const char *lBuiltinTokens[] = {
    "bool", "break", "case", "cbreak", "ccontinue", "cdo", "cfor", "char", 
    "cif", "cwhile", "const", "continue", "creturn", "default", "do", "double", 
    "else", "enum", "export", "extern", "false", "float", "for", "goto", "if",
-    "inline", "int", "int32", "int64", "launch", "print", "reference", "return",
+    "inline", "int", "int8", "int16", "int32", "int64", "launch", "print",
    "reference", "return",
    "static", "struct", "switch", "sync", "task", "true", "typedef", "uniform",
    "unsigned", "varying", "void", "while", NULL 
 };
 static const char *lParamListTokens[] = {
    "bool", "char", "const", "double", "enum", "false", "float", "int",
-    "int32", "int64", "reference", "struct", "true", "uniform", "unsigned",
+    "int8", "int16", "int32", "int64", "reference", "struct", "true",
-    "varying", "void", NULL 
+     "uniform", "unsigned", "varying", "void", NULL 
 };
 %}
@@ -154,7 +155,7 @@ static const char *lParamListTokens[] = {
 %token TOKEN_EXTERN TOKEN_EXPORT TOKEN_STATIC TOKEN_INLINE TOKEN_TASK 
 %token TOKEN_UNIFORM TOKEN_VARYING TOKEN_TYPEDEF TOKEN_SOA
 %token TOKEN_CHAR TOKEN_INT TOKEN_UNSIGNED TOKEN_FLOAT TOKEN_DOUBLE
-%token TOKEN_INT64 TOKEN_CONST TOKEN_VOID TOKEN_BOOL 
+%token TOKEN_INT8 TOKEN_INT16 TOKEN_INT64 TOKEN_CONST TOKEN_VOID TOKEN_BOOL 
 %token TOKEN_ENUM TOKEN_STRUCT TOKEN_TRUE TOKEN_FALSE TOKEN_REFERENCE
 %token TOKEN_CASE TOKEN_DEFAULT TOKEN_IF TOKEN_ELSE TOKEN_SWITCH
@@ -587,7 +588,8 @@ type_specifier
 atomic_var_type_specifier
    : TOKEN_VOID { $$ = AtomicType::Void; }
    | TOKEN_BOOL { $$ = AtomicType::VaryingBool; }
-/*  | TOKEN_CHAR { UNIMPLEMENTED; } */
+    | TOKEN_INT8 { $$ = AtomicType::VaryingInt8; }
    | TOKEN_INT16 { $$ = AtomicType::VaryingInt16; }
    | TOKEN_INT { $$ = AtomicType::VaryingInt32; }
    | TOKEN_FLOAT { $$ = AtomicType::VaryingFloat; }
    | TOKEN_DOUBLE { $$ = AtomicType::VaryingDouble; }
--- a/stdlib-avx.ll
+++ b/stdlib-avx.ll
@@ -41,7 +41,6 @@
 stdlib_core(8)
 packed_load_and_store(8)
 int8_16(8)
 int64minmax(8)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -539,55 +538,14 @@ define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinli
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
-define <8 x i32> @__load_and_broadcast_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
+load_and_broadcast(8, i8, 8)
-  %mm = call i32 @__movmsk(<8 x i32> %mask)
+load_and_broadcast(8, i16, 16)
-  %any_on = icmp ne i32 %mm, 0
+load_and_broadcast(8, i32, 32)
-  br i1 %any_on, label %load, label %skip
+load_and_broadcast(8, i64, 64)
 load:
  ; TODO: make sure this becomes a vbroadcast...
  %ptr = bitcast i8 * %0 to i32 *
  %val = load i32 * %ptr
  %ret0 = insertelement <8 x i32> undef, i32 %val, i32 0
  %ret1 = insertelement <8 x i32> %ret0, i32 %val, i32 1
  %ret2 = insertelement <8 x i32> %ret1, i32 %val, i32 2
  %ret3 = insertelement <8 x i32> %ret2, i32 %val, i32 3
  %ret4 = insertelement <8 x i32> %ret3, i32 %val, i32 4
  %ret5 = insertelement <8 x i32> %ret4, i32 %val, i32 5
  %ret6 = insertelement <8 x i32> %ret5, i32 %val, i32 6
  %ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7
  ret <8 x i32> %ret7
 skip:
  ret <8 x i32> undef
 }
 ; Loads a single i64 from the address in %0 and replicates it into all
 ; eight lanes of the result.  When the movmsk of %mask is zero the load is
 ; skipped and an undef vector is returned.
 define <8 x i64> @__load_and_broadcast_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
  %mm = call i32 @__movmsk(<8 x i32> %mask)
  %any_on = icmp ne i32 %mm, 0
  br i1 %any_on, label %load, label %skip
 load:
  ; TODO: make sure this becomes a vbroadcast...
  %ptr = bitcast i8 * %0 to i64 *
  %val = load i64 * %ptr
  %ret0 = insertelement <8 x i64> undef, i64 %val, i32 0
  %ret1 = insertelement <8 x i64> %ret0, i64 %val, i32 1
  %ret2 = insertelement <8 x i64> %ret1, i64 %val, i32 2
  %ret3 = insertelement <8 x i64> %ret2, i64 %val, i32 3
  %ret4 = insertelement <8 x i64> %ret3, i64 %val, i32 4
  %ret5 = insertelement <8 x i64> %ret4, i64 %val, i32 5
  %ret6 = insertelement <8 x i64> %ret5, i64 %val, i32 6
  %ret7 = insertelement <8 x i64> %ret6, i64 %val, i32 7
  ; Return the fully populated vector; returning %ret3 here would leave
  ; lanes 4 through 7 undefined.
  ret <8 x i64> %ret7
 skip:
  ret <8 x i64> undef
 }
 ; no masked load instruction for i8 and i16 types??
 load_masked(8, i8,  8,  1)
 load_masked(8, i16, 16, 2)
 declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
 declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
@@ -623,6 +581,12 @@ define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store
 ; FIXME: there is no AVX instruction for these, but we could be clever
 ; by packing the bits down and setting the last 3/4 or half, respectively,
 ; of the mask to zero...  Not sure if this would be a win in the end
 gen_masked_store(8, i8, 8)
 gen_masked_store(8, i16, 16)
 ; note that mask is the 2nd parameter, not the 3rd one!!
 declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
 declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
@@ -660,13 +624,14 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
  ret void
 }
 masked_store_blend_8_16_by_8()
 declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
                                                <8 x float>) nounwind readnone
 define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
-                                           <8 x i32>) nounwind alwaysinline {
+                                     <8 x i32>) nounwind alwaysinline {
  %mask_as_float = bitcast <8 x i32> %2 to <8 x float>
  %oldValue = load <8 x i32>* %0, align 4
  %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
--- a/stdlib-sse.ll
+++ b/stdlib-sse.ll
@@ -36,7 +36,6 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 int8_16(4)
 int64minmax(4)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -380,29 +379,23 @@ define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store
-define void @__masked_store_32(<4 x i32>* nocapture, <4 x i32>, <4 x i32>) nounwind alwaysinline {
+masked_store_blend_8_16_by_4()
  per_lane(4, <4 x i32> %2, `
      ; compute address for this one
      %ptr_ID = getelementptr <4 x i32> * %0, i32 0, i32 LANE
      %storeval_ID = extractelement <4 x i32> %1, i32 LANE
      store i32 %storeval_ID, i32 * %ptr_ID')
  ret void
 }
 ;; Masked store of a 4-wide vector of i64 values.
 ;; %0: pointer to the destination vector, %1: values to store, %2: i32 mask.
 ;; The quoted body addresses one element of the destination, extracts the
 ;; matching element of %1 and writes it with a scalar store.
 ;; NOTE(review): presumably the per-lane helper macro emits that body only
 ;; for lanes whose mask is on; it is defined outside this view -- confirm.
 define void @__masked_store_64(<4 x i64>* nocapture, <4 x i64>, <4 x i32>) nounwind alwaysinline {
  per_lane(4, <4 x i32> %2, `
      %ptr_ID = getelementptr <4 x i64> * %0, i32 0, i32 LANE
      %storeval_ID = extractelement <4 x i64> %1, i32 LANE
      store i64 %storeval_ID, i64 * %ptr_ID')
  ret void
 }
 gen_masked_store(4, i8, 8)
 gen_masked_store(4, i16, 16)
 gen_masked_store(4, i32, 32)
 gen_masked_store(4, i64, 64)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
 load_and_broadcast(4, i8, 8)
 load_and_broadcast(4, i16, 16)
 load_and_broadcast(4, i32, 32)
 load_and_broadcast(4, i64, 64)
 load_masked(4, i8,  8,  1)
 load_masked(4, i16, 16, 2)
 load_masked(4, i32, 32, 4)
 load_masked(4, i64, 64, 8)
@@ -411,7 +404,12 @@ load_masked(4, i64, 64, 8)
 ; define these with the macros from stdlib.m4
 gen_gather(4, i8)
 gen_gather(4, i16)
 gen_gather(4, i32)
 gen_gather(4, i64)
 gen_scatter(4, i8)
 gen_scatter(4, i16)
 gen_scatter(4, i32)
 gen_scatter(4, i64)
--- a/stdlib-sse4x2.ll
+++ b/stdlib-sse4x2.ll
@@ -38,7 +38,6 @@
 stdlib_core(8)
 packed_load_and_store(8)
 int8_16(8)
 int64minmax(8)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -435,44 +434,29 @@ define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
  reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store
 ;; Masked store of an 8-wide vector of i32 values.
 ;; %0: pointer to the destination vector, %1: values to store, %2: i32 mask.
 ;; The quoted body addresses one element of the destination, extracts the
 ;; matching element of %1 and writes it with a scalar store.
 ;; NOTE(review): presumably the per-lane helper macro emits that body only
 ;; for lanes whose mask is on; it is defined outside this view -- confirm.
 define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
                                <8 x i32>) nounwind alwaysinline {
  per_lane(8, <8 x i32> %2, `
      ; compute address for this one
      %ptr_ID = getelementptr <8 x i32> * %0, i32 0, i32 LANE
      %storeval_ID = extractelement <8 x i32> %1, i32 LANE
      store i32 %storeval_ID, i32 * %ptr_ID')
  ret void
 }
 ;; Masked store of an 8-wide vector of i64 values.
 ;; %0: pointer to the destination vector, %1: values to store, %2: i32 mask.
 ;; The quoted body addresses one element of the destination, extracts the
 ;; matching element of %1 and writes it with a scalar store.
 ;; NOTE(review): presumably the per-lane helper macro emits that body only
 ;; for lanes whose mask is on; it is defined outside this view -- confirm.
 define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
                                <8 x i32>) nounwind alwaysinline {
  per_lane(8, <8 x i32> %2, `
      ; compute address for this one
      %ptr_ID = getelementptr <8 x i64> * %0, i32 0, i32 LANE
      %storeval_ID = extractelement <8 x i64> %1, i32 LANE
      store i64 %storeval_ID, i64 * %ptr_ID')
  ret void
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
 load_and_broadcast(8, i8, 8)
 load_and_broadcast(8, i16, 16)
 load_and_broadcast(8, i32, 32)
 load_and_broadcast(8, i64, 64)
 load_masked(8, i8,  8,  1)
 load_masked(8, i16, 16, 2)
 load_masked(8, i32, 32, 4)
 load_masked(8, i64, 64, 8)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather/scatter
 gen_gather(8, i8)
 gen_gather(8, i16)
 gen_gather(8, i32)
 gen_gather(8, i64)
 gen_scatter(8, i8)
 gen_scatter(8, i16)
 gen_scatter(8, i32)
 gen_scatter(8, i64)
@@ -619,6 +603,13 @@ define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysi
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store
 gen_masked_store(8, i8, 8)
 gen_masked_store(8, i16, 16)
 gen_masked_store(8, i32, 32)
 gen_masked_store(8, i64, 64)
 masked_store_blend_8_16_by_8()
 declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
                                             <4 x float>) nounwind readnone
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -85,6 +85,14 @@ static inline float broadcast(float v, uniform int i) {
    return __broadcast_float(v, i);
 }
 // Thin wrapper: hands v and the uniform index i to the __broadcast_int8
 // builtin and returns whatever it produces.
 static inline int8 broadcast(int8 v, uniform int i) {
    int8 result = __broadcast_int8(v, i);
    return result;
 }
 // Thin wrapper: hands v and the uniform index i to the __broadcast_int16
 // builtin and returns whatever it produces.
 static inline int16 broadcast(int16 v, uniform int i) {
    int16 result = __broadcast_int16(v, i);
    return result;
 }
 // Thin wrapper: hands v and the uniform index i to the __broadcast_int32
 // builtin and returns whatever it produces.
 static inline int32 broadcast(int32 v, uniform int i) {
    int32 result = __broadcast_int32(v, i);
    return result;
 }
@@ -101,6 +109,14 @@ static inline float rotate(float v, uniform int i) {
    return __rotate_float(v, i);
 }
 // Thin wrapper: forwards v and the uniform amount i to the __rotate_int8
 // builtin and returns its result.
 static inline int8 rotate(int8 v, uniform int i) {
    int8 result = __rotate_int8(v, i);
    return result;
 }
 // Thin wrapper: forwards v and the uniform amount i to the __rotate_int16
 // builtin and returns its result.
 static inline int16 rotate(int16 v, uniform int i) {
    int16 result = __rotate_int16(v, i);
    return result;
 }
 // Thin wrapper: forwards v and the uniform amount i to the __rotate_int32
 // builtin and returns its result.
 static inline int32 rotate(int32 v, uniform int i) {
    int32 result = __rotate_int32(v, i);
    return result;
 }
@@ -117,6 +133,14 @@ static inline float shuffle(float v, int i) {
    return __shuffle_float(v, i);
 }
 // Single-source shuffle for int8: forwards v and the varying index i to
 // the __shuffle_int8 builtin and returns its result.
 static inline int8 shuffle(int8 v, int i) {
    int8 result = __shuffle_int8(v, i);
    return result;
 }
 // Single-source shuffle for int16: forwards v and the varying index i to
 // the __shuffle_int16 builtin and returns its result.
 static inline int16 shuffle(int16 v, int i) {
    int16 result = __shuffle_int16(v, i);
    return result;
 }
 // Single-source shuffle for int32: forwards v and the varying index i to
 // the __shuffle_int32 builtin and returns its result.
 static inline int32 shuffle(int32 v, int i) {
    int32 result = __shuffle_int32(v, i);
    return result;
 }
@@ -133,6 +157,14 @@ static inline float shuffle(float v0, float v1, int i) {
    return __shuffle2_float(v0, v1, i);
 }
 // Two-source shuffle for int8: forwards v0, v1 and the varying index i to
 // the __shuffle2_int8 builtin and returns its result.
 static inline int8 shuffle(int8 v0, int8 v1, int i) {
    int8 result = __shuffle2_int8(v0, v1, i);
    return result;
 }
 // Two-source shuffle for int16: forwards v0, v1 and the varying index i to
 // the __shuffle2_int16 builtin and returns its result.
 static inline int16 shuffle(int16 v0, int16 v1, int i) {
    int16 result = __shuffle2_int16(v0, v1, i);
    return result;
 }
 // Two-source shuffle for int32: forwards v0, v1 and the varying index i to
 // the __shuffle2_int32 builtin and returns its result.
 static inline int32 shuffle(int32 v0, int32 v1, int i) {
    int32 result = __shuffle2_int32(v0, v1, i);
    return result;
 }
@@ -150,11 +182,27 @@ static inline uniform float extract(float x, uniform int i) {
    return floatbits(__extract_int32((int)intbits(x), i));
 }
-static inline uniform int extract(int x, uniform int i) {
+static inline uniform int8 extract(int8 x, uniform int i) {
    return __extract_int8(x, i);
 }
 // Unsigned int8 extract: calls __extract_int8 with the index converted to
 // unsigned int and returns the uniform result.
 static inline uniform unsigned int8 extract(unsigned int8 x, uniform int i) {
    uniform unsigned int8 elt = __extract_int8(x, (unsigned int)i);
    return elt;
 }
 // Signed int16 extract: forwards x and the uniform index i to
 // __extract_int16 and returns the uniform result.
 static inline uniform int16 extract(int16 x, uniform int i) {
    uniform int16 elt = __extract_int16(x, i);
    return elt;
 }
 // Unsigned int16 extract: calls __extract_int16 with the index converted
 // to unsigned int and returns the uniform result.
 static inline uniform unsigned int16 extract(unsigned int16 x, uniform int i) {
    uniform unsigned int16 elt = __extract_int16(x, (unsigned int)i);
    return elt;
 }
 // Signed int32 extract: forwards x and the uniform index i to
 // __extract_int32 and returns the uniform result.
 static inline uniform int32 extract(int32 x, uniform int i) {
    uniform int32 elt = __extract_int32(x, i);
    return elt;
 }
-static inline uniform unsigned int extract(unsigned int x, uniform int i) {
+static inline uniform unsigned int32 extract(unsigned int32 x, uniform int i) {
    return __extract_int32(x, (unsigned int)i);
 }
@@ -175,12 +223,30 @@ static inline float insert(float x, uniform int i, uniform float v) {
    return floatbits(__insert_int32((int)intbits(x), i, (int)intbits(v)));
 }
-static inline int insert(int x, uniform int i, uniform int v) {
+static inline int8 insert(int8 x, uniform int i, uniform int8 v) {
    return __insert_int8(x, i, v);
 }
 // Unsigned int8 insert: calls __insert_int8 with the index converted to
 // unsigned int and returns the updated varying value.
 static inline unsigned int8 insert(unsigned int8 x, uniform int i,
                                    uniform unsigned int8 v) {
    unsigned int8 updated = __insert_int8(x, (unsigned int)i, v);
    return updated;
 }
 // Signed int16 insert: forwards x, the uniform index i and the uniform
 // value v to __insert_int16 and returns the updated varying value.
 static inline int16 insert(int16 x, uniform int i, uniform int16 v) {
    int16 updated = __insert_int16(x, i, v);
    return updated;
 }
 // Unsigned int16 insert: calls __insert_int16 with the index converted to
 // unsigned int and returns the updated varying value.
 static inline unsigned int16 insert(unsigned int16 x, uniform int i,
                                     uniform unsigned int16 v) {
    unsigned int16 updated = __insert_int16(x, (unsigned int)i, v);
    return updated;
 }
 // Signed int32 insert: forwards x, the uniform index i and the uniform
 // value v to __insert_int32 and returns the updated varying value.
 static inline int32 insert(int32 x, uniform int i, uniform int32 v) {
    int32 updated = __insert_int32(x, i, v);
    return updated;
 }
-static inline unsigned int insert(unsigned int x, uniform int i, 
+static inline unsigned int32 insert(unsigned int32 x, uniform int i, 
-                                  uniform unsigned int v) {
+                                    uniform unsigned int32 v) {
    return __insert_int32(x, (unsigned int)i, v);
 }
@@ -218,7 +284,7 @@ static inline uniform bool all(bool v) {
    return __movmsk(match) == (1 << programCount) - 1;
 }
-static inline uniform int popcnt(uniform int v) {
+static inline uniform int32 popcnt(uniform int32 v) {
    return __popcnt_int32(v);
 }
@@ -473,52 +539,7 @@ ATOMIC_DECL_CMPXCHG(unsigned int64, int64)
 ATOMIC_DECL_CMPXCHG(double, double)
 ///////////////////////////////////////////////////////////////////////////
-// Load/store from/to 8/16-bit types
+// Floating-Point Math
 // Passes the array, the offset and the __mask value to the __load_int8
 // builtin and returns the varying int it yields.
 static inline int load_from_int8(uniform int a[], uniform int offset) {
    int loaded = __load_int8(a, offset, __mask);
    return loaded;
 }
 // Passes the array, the offset and the __mask value to the __load_uint8
 // builtin and returns the varying unsigned int it yields.
 static inline unsigned int load_from_uint8(uniform unsigned int a[],
                                            uniform int offset) {
    unsigned int loaded = __load_uint8(a, offset, __mask);
    return loaded;
 }
 // Passes the array, the offset, val and the __mask value to the
 // __store_int8 builtin.
 // NOTE(review): val is declared unsigned int here although the array is
 // signed and store_to_int16 takes a plain int -- confirm this is intended.
 static inline void store_to_int8(uniform int a[], uniform int offset, 
                                  unsigned int val) {
    __store_int8(a, offset, val, __mask);
 }
 // Unsigned 8-bit store.  No separate unsigned builtin is needed:
 // __store_int8 truncates bits in either case, so it is reused here.
 static inline void store_to_uint8(uniform unsigned int a[],
                                   uniform int offset,
                                   unsigned int val) {
    __store_int8(a, offset, val, __mask);
    return;
 }
 // Passes the array, the offset and the __mask value to the __load_int16
 // builtin and returns the varying int it yields.
 static inline int load_from_int16(uniform int a[], uniform int offset) {
    int loaded = __load_int16(a, offset, __mask);
    return loaded;
 }
 // Unsigned overload: passes the array, the offset and the __mask value to
 // the __load_uint16 builtin and returns its result.
 // NOTE(review): the 8-bit counterpart is named load_from_uint8 and the
 // matching store is store_to_uint16, so load_from_uint16 looks like the
 // intended name; renaming would affect callers -- confirm before changing.
 static inline unsigned int load_from_int16(uniform unsigned int a[], 
                                            uniform int offset) {
    return __load_uint16(a, offset, __mask);
 }
 // Passes the array, the offset, val and the __mask value to the
 // __store_int16 builtin.
 static inline void store_to_int16(uniform int a[],
                                   uniform int offset,
                                   int val) {
    __store_int16(a, offset, val, __mask);
    return;
 }
 // Unsigned 16-bit store.  No separate unsigned builtin is needed:
 // __store_int16 truncates bits in either case, so it is reused here.
 static inline void store_to_uint16(uniform unsigned int a[],
                                    uniform int offset,
                                    unsigned int val) {
    __store_int16(a, offset, val, __mask);
    return;
 }
 ///////////////////////////////////////////////////////////////////////////
 // Math
 static inline float abs(float a) {
    // Floating-point hack: zeroing the high bit clears the sign
@@ -622,6 +643,11 @@ static inline uniform float rcp(uniform float v) {
    return __rcp_uniform_float(v);
 }
 ///////////////////////////////////////////////////////////////////////////
 // min/max
 // float
 // Varying float minimum, delegated to the __min_varying_float builtin.
 static inline float min(float a, float b) {
    float result = __min_varying_float(a, b);
    return result;
 }
@@ -630,14 +656,6 @@ static inline uniform float min(uniform float a, uniform float b) {
    return __min_uniform_float(a, b);
 }
 // Varying double minimum, delegated to the __min_varying_double builtin.
 static inline double min(double a, double b) {
    double result = __min_varying_double(a, b);
    return result;
 }
 // Uniform double minimum, delegated to the __min_uniform_double builtin.
 static inline uniform double min(uniform double a, uniform double b) {
    uniform double result = __min_uniform_double(a, b);
    return result;
 }
 // Varying float maximum, delegated to the __max_varying_float builtin.
 static inline float max(float a, float b) {
    float result = __max_varying_float(a, b);
    return result;
 }
@@ -646,6 +664,17 @@ static inline uniform float max(uniform float a, uniform float b) {
    return __max_uniform_float(a, b);
 }
 // double
 // Varying double minimum, delegated to the __min_varying_double builtin.
 static inline double min(double a, double b) {
    double result = __min_varying_double(a, b);
    return result;
 }
 // Uniform double minimum, delegated to the __min_uniform_double builtin.
 static inline uniform double min(uniform double a, uniform double b) {
    uniform double result = __min_uniform_double(a, b);
    return result;
 }
 // Varying double maximum, delegated to the __max_varying_double builtin.
 static inline double max(double a, double b) {
    double result = __max_varying_double(a, b);
    return result;
 }
@@ -654,6 +683,80 @@ static inline uniform double max(uniform double a, uniform double b) {
    return __max_uniform_double(a, b);
 }
 // int8
 // Uniform unsigned int8 minimum: yields a when a < b, otherwise b.
 static inline uniform unsigned int8 min(uniform unsigned int8 a,
                                         uniform unsigned int8 b) {
    uniform unsigned int8 lesser = (a < b) ? a : b;
    return lesser;
 }
 // Uniform unsigned int8 maximum: yields a when a > b, otherwise b.
 static inline uniform unsigned int8 max(uniform unsigned int8 a,
                                         uniform unsigned int8 b) {
    uniform unsigned int8 greater = (a > b) ? a : b;
    return greater;
 }
 // Uniform int8 minimum: yields a when a < b, otherwise b.
 static inline uniform int8 min(uniform int8 a, uniform int8 b) {
    uniform int8 lesser = (a < b) ? a : b;
    return lesser;
 }
 // Uniform int8 maximum: yields a when a > b, otherwise b.
 static inline uniform int8 max(uniform int8 a, uniform int8 b) {
    uniform int8 greater = (a > b) ? a : b;
    return greater;
 }
 // Varying unsigned int8 minimum: per lane, a when a < b, otherwise b.
 static inline unsigned int8 min(unsigned int8 a, unsigned int8 b) {
    unsigned int8 lesser = (a < b) ? a : b;
    return lesser;
 }
 // Varying unsigned int8 maximum: per lane, a when a > b, otherwise b.
 static inline unsigned int8 max(unsigned int8 a, unsigned int8 b) {
    unsigned int8 greater = (a > b) ? a : b;
    return greater;
 }
 // Varying int8 minimum: per lane, a when a < b, otherwise b.
 static inline int8 min(int8 a, int8 b) {
    int8 lesser = (a < b) ? a : b;
    return lesser;
 }
 // Varying int8 maximum: per lane, a when a > b, otherwise b.
 static inline int8 max(int8 a, int8 b) {
    int8 greater = (a > b) ? a : b;
    return greater;
 }
 // int16
 // Uniform unsigned int16 minimum: yields a when a < b, otherwise b.
 static inline uniform unsigned int16 min(uniform unsigned int16 a,
                                          uniform unsigned int16 b) {
    uniform unsigned int16 lesser = (a < b) ? a : b;
    return lesser;
 }
 // Uniform unsigned int16 maximum: yields a when a > b, otherwise b.
 static inline uniform unsigned int16 max(uniform unsigned int16 a,
                                          uniform unsigned int16 b) {
    uniform unsigned int16 greater = (a > b) ? a : b;
    return greater;
 }
 // Uniform int16 minimum: yields a when a < b, otherwise b.
 static inline uniform int16 min(uniform int16 a, uniform int16 b) {
    uniform int16 lesser = (a < b) ? a : b;
    return lesser;
 }
 // Uniform int16 maximum: yields a when a > b, otherwise b.
 static inline uniform int16 max(uniform int16 a, uniform int16 b) {
    uniform int16 greater = (a > b) ? a : b;
    return greater;
 }
 // Varying unsigned int16 minimum: per lane, a when a < b, otherwise b.
 static inline unsigned int16 min(unsigned int16 a, unsigned int16 b) {
    unsigned int16 lesser = (a < b) ? a : b;
    return lesser;
 }
 // Varying unsigned int16 maximum: per lane, a when a > b, otherwise b.
 static inline unsigned int16 max(unsigned int16 a, unsigned int16 b) {
    unsigned int16 greater = (a > b) ? a : b;
    return greater;
 }
 // Varying int16 minimum: per lane, a when a < b, otherwise b.
 static inline int16 min(int16 a, int16 b) {
    int16 lesser = (a < b) ? a : b;
    return lesser;
 }
 // Varying int16 maximum: per lane, a when a > b, otherwise b.
 static inline int16 max(int16 a, int16 b) {
    int16 greater = (a > b) ? a : b;
    return greater;
 }
 // int32
 // Varying unsigned 32-bit minimum, delegated to __min_varying_uint32.
 static inline unsigned int min(unsigned int a, unsigned int b) {
    unsigned int result = __min_varying_uint32(a, b);
    return result;
 }
@@ -686,6 +789,8 @@ static inline uniform int max(uniform int a, uniform int b) {
    return __max_uniform_int32(a, b);
 }
 // int64
 // Varying unsigned 64-bit minimum, delegated to __min_varying_uint64.
 static inline unsigned int64 min(unsigned int64 a, unsigned int64 b) {
    unsigned int64 result = __min_varying_uint64(a, b);
    return result;
 }
@@ -718,6 +823,11 @@ static inline uniform int64 max(uniform int64 a, uniform int64 b) {
    return __max_uniform_int64(a, b);
 }
 ///////////////////////////////////////////////////////////////////////////
 // clamps
 // float
 // Varying float clamp: max(v, low) is taken first, then min with high.
 static inline float clamp(float v, float low, float high) {
    float raised = max(v, low);
    return min(raised, high);
 }
@@ -726,6 +836,52 @@ static inline uniform float clamp(uniform float v, uniform float low, uniform fl
    return min(max(v, low), high);
 }
 // int8
 // Varying unsigned int8 clamp: max(v, low) first, then min with high.
 static inline unsigned int8 clamp(unsigned int8 v, unsigned int8 low,
                                   unsigned int8 high) {
    unsigned int8 raised = max(v, low);
    return min(raised, high);
 }
 // Uniform unsigned int8 clamp: max(v, low) first, then min with high.
 static inline uniform unsigned int8 clamp(uniform unsigned int8 v,
                                           uniform unsigned int8 low,
                                           uniform unsigned int8 high) {
    uniform unsigned int8 raised = max(v, low);
    return min(raised, high);
 }
 // Varying int8 clamp: max(v, low) first, then min with high.
 static inline int8 clamp(int8 v, int8 low, int8 high) {
    int8 raised = max(v, low);
    return min(raised, high);
 }
 // Uniform int8 clamp: max(v, low) first, then min with high.
 static inline uniform int8 clamp(uniform int8 v, uniform int8 low,
                                  uniform int8 high) {
    uniform int8 raised = max(v, low);
    return min(raised, high);
 }
 // int16
 // Varying unsigned int16 clamp: max(v, low) first, then min with high.
 static inline unsigned int16 clamp(unsigned int16 v, unsigned int16 low,
                                    unsigned int16 high) {
    unsigned int16 raised = max(v, low);
    return min(raised, high);
 }
 // Uniform unsigned int16 clamp: max(v, low) first, then min with high.
 static inline uniform unsigned int16 clamp(uniform unsigned int16 v,
                                            uniform unsigned int16 low,
                                            uniform unsigned int16 high) {
    uniform unsigned int16 raised = max(v, low);
    return min(raised, high);
 }
 // Varying int16 clamp: max(v, low) first, then min with high.
 static inline int16 clamp(int16 v, int16 low, int16 high) {
    int16 raised = max(v, low);
    return min(raised, high);
 }
 // Uniform int16 clamp: max(v, low) first, then min with high.
 static inline uniform int16 clamp(uniform int16 v, uniform int16 low,
                                   uniform int16 high) {
    uniform int16 raised = max(v, low);
    return min(raised, high);
 }
 // int32
 // Varying unsigned int clamp: max(v, low) first, then min with high.
 static inline unsigned int clamp(unsigned int v, unsigned int low, unsigned int high) {
    unsigned int raised = max(v, low);
    return min(raised, high);
 }
@@ -735,15 +891,6 @@ static inline uniform unsigned int clamp(uniform unsigned int v, uniform unsigne
    return min(max(v, low), high);
 }
 // Varying unsigned int64 clamp: max(v, low) first, then min with high.
 static inline unsigned int64 clamp(unsigned int64 v, unsigned int64 low, unsigned int64 high) {
    unsigned int64 raised = max(v, low);
    return min(raised, high);
 }
 // Uniform unsigned int64 clamp: max(v, low) first, then min with high.
 static inline uniform unsigned int64 clamp(uniform unsigned int64 v, uniform unsigned int64 low,
                                            uniform unsigned int64 high) {
    uniform unsigned int64 raised = max(v, low);
    return min(raised, high);
 }
 // Varying int clamp: max(v, low) first, then min with high.
 static inline int clamp(int v, int low, int high) {
    int raised = max(v, low);
    return min(raised, high);
 }
@@ -752,11 +899,25 @@ static inline uniform int clamp(uniform int v, uniform int low, uniform int high
    return min(max(v, low), high);
 }
 // int64
 // Varying unsigned int64 clamp: max(v, low) first, then min with high.
 static inline unsigned int64 clamp(unsigned int64 v, unsigned int64 low,
                                    unsigned int64 high) {
    unsigned int64 raised = max(v, low);
    return min(raised, high);
 }
 // Uniform unsigned int64 clamp: max(v, low) first, then min with high.
 static inline uniform unsigned int64 clamp(uniform unsigned int64 v,
                                            uniform unsigned int64 low,
                                            uniform unsigned int64 high) {
    uniform unsigned int64 raised = max(v, low);
    return min(raised, high);
 }
 // Varying int64 clamp: max(v, low) first, then min with high.
 static inline int64 clamp(int64 v, int64 low, int64 high) {
    int64 raised = max(v, low);
    return min(raised, high);
 }
-static inline uniform int64 clamp(uniform int64 v, uniform int64 low, uniform int64 high) {
+static inline uniform int64 clamp(uniform int64 v, uniform int64 low, 
                                  uniform int64 high) {
    return min(max(v, low), high);
 }
--- a/stdlib.m4
+++ b/stdlib.m4
@@ -566,6 +566,28 @@ declare i1 @__is_compile_time_constant_varying_int32(<$1 x i32>)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; vector ops
 ;; Returns the element of the i8 vector %0 found at position %1.
 define internal i8 @__extract_int8(<$1 x i8>, i32) nounwind readnone alwaysinline {
  %elt = extractelement <$1 x i8> %0, i32 %1
  ret i8 %elt
 }
 ;; Returns a copy of the i8 vector %0 with the element at position %1
 ;; replaced by the scalar %2.
 define internal <$1 x i8> @__insert_int8(<$1 x i8>, i32, 
                                            i8) nounwind readnone alwaysinline {
  %updated = insertelement <$1 x i8> %0, i8 %2, i32 %1
  ret <$1 x i8> %updated
 }
 ;; Returns the element of the i16 vector %0 found at position %1.
 define internal i16 @__extract_int16(<$1 x i16>, i32) nounwind readnone alwaysinline {
  %elt = extractelement <$1 x i16> %0, i32 %1
  ret i16 %elt
 }
 ;; Returns a copy of the i16 vector %0 with the element at position %1
 ;; replaced by the scalar %2.
 define internal <$1 x i16> @__insert_int16(<$1 x i16>, i32, 
                                            i16) nounwind readnone alwaysinline {
  %updated = insertelement <$1 x i16> %0, i16 %2, i32 %1
  ret <$1 x i16> %updated
 }
 define internal i32 @__extract_int32(<$1 x i32>, i32) nounwind readnone alwaysinline {
  %extract = extractelement <$1 x i32> %0, i32 %1
  ret i32 %extract
@@ -588,6 +610,8 @@ define internal <$1 x i64> @__insert_int64(<$1 x i64>, i32,
  ret <$1 x i64> %insert
 }
 shuffles($1, i8, int8, 1)
 shuffles($1, i16, int16, 2)
 shuffles($1, float, float, 4)
 shuffles($1, i32, int32, 4)
 shuffles($1, double, double, 8)
@@ -901,171 +925,6 @@ i64minmax($1,min,uint64,ult)
 i64minmax($1,max,uint64,ugt)
 ')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Definitions of 8 and 16-bit load and store functions
 ;;
 ;; The `int8_16' macro defines functions related to loading and storing 8 and
 ;; 16-bit values in memory, converting to and from i32.  (This is a workaround
 ;; to be able to use in-memory values of these types in ispc programs, since
 ;; the compiler doesn't yet support 8 and 16-bit datatypes.)
 ;;
 ;; Arguments to pass to `int8_16':
 ;; $1: vector width of the target
 define(`int8_16', `
 ;; Loads one vector of unsigned 8-bit values starting at byte %offset of
 ;; the array %0 and zero-extends each element to i32.  The memory access is
 ;; skipped, and undef returned, when the movmsk of %mask is zero.
 define internal <$1 x i32> @__load_uint8([0 x i32] *, i32 %offset,
                                          <$1 x i32> %mask) nounwind alwaysinline {
  %mm = call i32 @__movmsk(<$1 x i32> %mask)
  %any = icmp ne i32 %mm, 0
  br i1 %any, label %doload, label %skip
 doload:  
  %ptr8 = bitcast [0 x i32] *%0 to i8 *
  %ptr = getelementptr i8 * %ptr8, i32 %offset
  ;; the whole vector is fetched with a single wide integer load
  %ptr64 = bitcast i8 * %ptr to i`'eval(8*$1) *
  %val = load i`'eval(8*$1) * %ptr64, align 1
  %vval = bitcast i`'eval(8*$1) %val to <$1 x i8>
  ; unsigned, so zero-extend to i32... 
  %ret = zext <$1 x i8> %vval to <$1 x i32>
  ret <$1 x i32> %ret
 skip:
  ret <$1 x i32> undef
 }
 ;; Signed counterpart of the function above: identical addressing, but each
 ;; 8-bit element is sign-extended to i32.
 define internal <$1 x i32> @__load_int8([0 x i32] *, i32 %offset,
                                         <$1 x i32> %mask) nounwind alwaysinline {
  %mm = call i32 @__movmsk(<$1 x i32> %mask)
  %any = icmp ne i32 %mm, 0
  br i1 %any, label %doload, label %skip
 doload:  
  %ptr8 = bitcast [0 x i32] *%0 to i8 *
  %ptr = getelementptr i8 * %ptr8, i32 %offset
  %ptr64 = bitcast i8 * %ptr to i`'eval(8*$1) *
  %val = load i`'eval(8*$1) * %ptr64, align 1
  %vval = bitcast i`'eval(8*$1) %val to <$1 x i8>
  ; signed, so sign-extend to i32... 
  %ret = sext <$1 x i8> %vval to <$1 x i32>
  ret <$1 x i32> %ret
 skip:
  ret <$1 x i32> undef
 }
 ;; Loads one vector of unsigned 16-bit values; here %offset counts 16-bit
 ;; elements because the address is computed on an i16 pointer.  Each element
 ;; is zero-extended to i32.
 define internal <$1 x i32> @__load_uint16([0 x i32] *, i32 %offset,
                                           <$1 x i32> %mask) nounwind alwaysinline {
  %mm = call i32 @__movmsk(<$1 x i32> %mask)
  %any = icmp ne i32 %mm, 0
  br i1 %any, label %doload, label %skip
 doload:  
  %ptr16 = bitcast [0 x i32] *%0 to i16 *
  %ptr = getelementptr i16 * %ptr16, i32 %offset
  %ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) *
  %val = load i`'eval(16*$1) * %ptr64, align 2
  %vval = bitcast i`'eval(16*$1) %val to <$1 x i16>
  ; unsigned, so use zero-extend...
  %ret = zext <$1 x i16> %vval to <$1 x i32>
  ret <$1 x i32> %ret
 skip:
  ret <$1 x i32> undef
 }
 ;; Signed counterpart of the function above: each 16-bit element is
 ;; sign-extended to i32.
 define internal <$1 x i32> @__load_int16([0 x i32] *, i32 %offset,
                                          <$1 x i32> %mask) nounwind alwaysinline {
  %mm = call i32 @__movmsk(<$1 x i32> %mask)
  %any = icmp ne i32 %mm, 0
  br i1 %any, label %doload, label %skip
 doload:  
  %ptr16 = bitcast [0 x i32] *%0 to i16 *
  %ptr = getelementptr i16 * %ptr16, i32 %offset
  %ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) *
  %val = load i`'eval(16*$1) * %ptr64, align 2
  %vval = bitcast i`'eval(16*$1) %val to <$1 x i16>
  ; signed, so use sign-extend...
  %ret = sext <$1 x i16> %vval to <$1 x i32>
  ret <$1 x i32> %ret
 skip:
  ret <$1 x i32> undef
 }
 ;; Masked store of i32 values truncated to 8 bits.  The full vector-sized
 ;; span in memory is read, blended with the new values and written back, so
 ;; bytes of lanes whose mask is off are rewritten with their old contents.
 ;; NOTE(review): the and/or blend assumes every mask lane is either all
 ;; ones or all zeros -- confirm against the mask convention.
 define internal void @__store_int8([0 x i32] *, i32 %offset, <$1 x i32> %val32,
                                    <$1 x i32> %mask) nounwind alwaysinline {
  %mm = call i32 @__movmsk(<$1 x i32> %mask)
  %any = icmp ne i32 %mm, 0
  br i1 %any, label %dostore, label %skip
 dostore:  
  %val = trunc <$1 x i32> %val32 to <$1 x i8>
  %val64 = bitcast <$1 x i8> %val to i`'eval(8*$1)
  %mask8 = trunc <$1 x i32> %mask to <$1 x i8>
  %mask64 = bitcast <$1 x i8> %mask8 to i`'eval(8*$1)
  %notmask = xor i`'eval(8*$1) %mask64, -1
  %ptr8 = bitcast [0 x i32] *%0 to i8 *
  %ptr = getelementptr i8 * %ptr8, i32 %offset
  %ptr64 = bitcast i8 * %ptr to i`'eval(8*$1) *
  ;; load the old value, use logical ops to blend based on the mask, then
  ;; store the result back
  %old = load i`'eval(8*$1) * %ptr64, align 1
  %oldmasked = and i`'eval(8*$1) %old, %notmask
  %newmasked = and i`'eval(8*$1) %val64, %mask64
  %final = or i`'eval(8*$1) %oldmasked, %newmasked
  store i`'eval(8*$1) %final, i`'eval(8*$1) * %ptr64, align 1
  ret void
 skip:
  ret void
 }
 ;; 16-bit variant of the masked store above; %offset counts 16-bit elements.
 ;; Note that %mask8 holds 16-bit mask lanes here despite its name.
 define internal void @__store_int16([0 x i32] *, i32 %offset, <$1 x i32> %val32,
                                     <$1 x i32> %mask) nounwind alwaysinline {
  %mm = call i32 @__movmsk(<$1 x i32> %mask)
  %any = icmp ne i32 %mm, 0
  br i1 %any, label %dostore, label %skip
 dostore:
  %val = trunc <$1 x i32> %val32 to <$1 x i16>
  %val64 = bitcast <$1 x i16> %val to i`'eval(16*$1)
  %mask8 = trunc <$1 x i32> %mask to <$1 x i16>
  %mask64 = bitcast <$1 x i16> %mask8 to i`'eval(16*$1)
  %notmask = xor i`'eval(16*$1) %mask64, -1
  %ptr16 = bitcast [0 x i32] *%0 to i16 *
  %ptr = getelementptr i16 * %ptr16, i32 %offset
  %ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) *
  ;; as above, use mask to do blending with logical ops...
  %old = load i`'eval(16*$1) * %ptr64, align 2
  %oldmasked = and i`'eval(16*$1) %old, %notmask
  %newmasked = and i`'eval(16*$1) %val64, %mask64
  %final = or i`'eval(16*$1) %oldmasked, %newmasked
  store i`'eval(16*$1) %final, i`'eval(16*$1) * %ptr64, align 2
  ret void
 skip:
  ret void
 }
 '
 )
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Emit code to safely load a scalar value and broadcast it across the
 ;; elements of a vector.  Parameters:
@@ -1150,6 +1009,105 @@ return:
 }
 ')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store
 ;; emit code to do masked store as a set of per-lane scalar stores
 ;; parameters:
 ;; $1: target vector width
 ;; $2: llvm type of elements
 ;; $3: suffix for function name
 ;; Arguments of the generated function: %0 points at the destination vector,
 ;; %1 holds the values to store and %2 is the mask handed to the per-lane
 ;; helper macro.  That helper is defined elsewhere; presumably it runs the
 ;; quoted body only for lanes whose mask element is set -- confirm there.
 ;; Each emitted body addresses one element of the destination vector and
 ;; stores the matching element extracted from %1.
 define(`gen_masked_store', `
 define void @__masked_store_$3(<$1 x $2>* nocapture, <$1 x $2>, <$1 x i32>) nounwind alwaysinline {
  per_lane($1, <$1 x i32> %2, `
      %ptr_ID = getelementptr <$1 x $2> * %0, i32 0, i32 LANE
      %storeval_ID = extractelement <$1 x $2> %1, i32 LANE
      store $2 %storeval_ID, $2 * %ptr_ID')
  ret void
 }
 ')
 ;; Blend-based masked stores for 4-wide vectors of i8 and i16 elements.
 ;; Each function loads the old vector from %0, combines it with the new
 ;; values in %1 using bitwise and/or with the mask %2 truncated to the
 ;; element width, and stores the whole vector back.  All lanes of the
 ;; destination are therefore read and rewritten, including lanes whose mask
 ;; is off.  The result is a per-lane select only if every mask lane is
 ;; either all ones or all zeros.
 define(`masked_store_blend_8_16_by_4', `
 define void @__masked_store_blend_8(<4 x i8>* nocapture, <4 x i8>,
                                    <4 x i32>) nounwind alwaysinline {
  ;; four i8 lanes are handled as a single i32
  %old = load <4 x i8> * %0
  %old32 = bitcast <4 x i8> %old to i32
  %new32 = bitcast <4 x i8> %1 to i32
  %mask8 = trunc <4 x i32> %2 to <4 x i8>
  %mask32 = bitcast <4 x i8> %mask8 to i32
  %notmask32 = xor i32 %mask32, -1
  ;; result = (new and mask) or (old and not mask)
  %newmasked = and i32 %new32, %mask32
  %oldmasked = and i32 %old32, %notmask32
  %result = or i32 %newmasked, %oldmasked
  %resultvec = bitcast i32 %result to <4 x i8>
  store <4 x i8> %resultvec, <4 x i8> * %0
  ret void
 }
 define void @__masked_store_blend_16(<4 x i16>* nocapture, <4 x i16>,
                                     <4 x i32>) nounwind alwaysinline {
  ;; four i16 lanes are handled as a single i64
  %old = load <4 x i16> * %0
  %old64 = bitcast <4 x i16> %old to i64
  %new64 = bitcast <4 x i16> %1 to i64
  %mask16 = trunc <4 x i32> %2 to <4 x i16>
  %mask64 = bitcast <4 x i16> %mask16 to i64
  %notmask64 = xor i64 %mask64, -1
  ;; result = (new and mask) or (old and not mask)
  %newmasked = and i64 %new64, %mask64
  %oldmasked = and i64 %old64, %notmask64
  %result = or i64 %newmasked, %oldmasked
  %resultvec = bitcast i64 %result to <4 x i16>
  store <4 x i16> %resultvec, <4 x i16> * %0
  ret void
 }
 ')
 ;; Blend-based masked stores for 8-wide vectors of i8 and i16 elements.
 ;; Each function loads the old vector from %0, combines it with the new
 ;; values in %1 using bitwise and/or with the mask %2 truncated to the
 ;; element width, and stores the whole vector back.  All lanes of the
 ;; destination are therefore read and rewritten, including lanes whose mask
 ;; is off.  The result is a per-lane select only if every mask lane is
 ;; either all ones or all zeros.
 define(`masked_store_blend_8_16_by_8', `
 define void @__masked_store_blend_8(<8 x i8>* nocapture, <8 x i8>,
                                    <8 x i32>) nounwind alwaysinline {
  ;; eight i8 lanes are handled as a single i64
  %old = load <8 x i8> * %0
  %old64 = bitcast <8 x i8> %old to i64
  %new64 = bitcast <8 x i8> %1 to i64
  %mask8 = trunc <8 x i32> %2 to <8 x i8>
  %mask64 = bitcast <8 x i8> %mask8 to i64
  %notmask64 = xor i64 %mask64, -1
  ;; result = (new and mask) or (old and not mask)
  %newmasked = and i64 %new64, %mask64
  %oldmasked = and i64 %old64, %notmask64
  %result = or i64 %newmasked, %oldmasked
  %resultvec = bitcast i64 %result to <8 x i8>
  store <8 x i8> %resultvec, <8 x i8> * %0
  ret void
 }
 define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
                                     <8 x i32>) nounwind alwaysinline {
  ;; eight i16 lanes are handled as a single i128
  %old = load <8 x i16> * %0
  %old128 = bitcast <8 x i16> %old to i128
  %new128 = bitcast <8 x i16> %1 to i128
  %mask16 = trunc <8 x i32> %2 to <8 x i16>
  %mask128 = bitcast <8 x i16> %mask16 to i128
  %notmask128 = xor i128 %mask128, -1
  ;; result = (new and mask) or (old and not mask)
  %newmasked = and i128 %new128, %mask128
  %oldmasked = and i128 %old128, %notmask128
  %result = or i128 %newmasked, %oldmasked
  %resultvec = bitcast i128 %result to <8 x i16>
  store <8 x i16> %resultvec, <8 x i16> * %0
  ret void
 }
 ')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; packed load and store functions
 ;;
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -1405,6 +1405,18 @@ lProcessPrintArg(Expr *expr, FunctionEmitContext *ctx, std::string &argTypes) {
            return NULL;
    }
    // Just promote int8 and int16 types (signed and unsigned) to int32s...
    const Type *baseType = type->GetAsNonConstType()->GetAsUniformType();
    if (baseType == AtomicType::UniformInt8 ||
        baseType == AtomicType::UniformUInt8 ||
        baseType == AtomicType::UniformInt16 ||
        baseType == AtomicType::UniformUInt16) {
        expr = new TypeCastExpr(type->IsUniformType() ? AtomicType::UniformInt32 :
                                                        AtomicType::VaryingInt32, 
                                expr, expr->pos);
        type = expr->GetType();
    }
    char t = lEncodeType(type->GetAsNonConstType());
    if (t == '\0') {
        Error(expr->pos, "Only atomic types are allowed in print statements; "
--- a/tests/array-mixed-unif-vary-indexing-2.ispc
+++ b/tests/array-mixed-unif-vary-indexing-2.ispc
@@ -8,7 +8,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform float x[47][47];
    for (uniform int i = 0; i < 47; ++i)
        for (uniform int j = 0; j < 47; ++j)
-            x[i][j] = 2;
+            x[i][j] = 2+b-5;
    // all are 2 except (3,4) = 0, (1,4) = 1, (2,4) = 1, (4,4) = 1
    if (a == 3.)
--- a/tests/array-mixed-unif-vary-indexing-3.ispc
+++ b/tests/array-mixed-unif-vary-indexing-3.ispc
@@ -7,7 +7,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform float x[47][47];
    for (uniform int i = 0; i < 47; ++i)
        for (uniform int j = 0; j < 47; ++j)
-            x[i][j] = 2;
+            x[i][j] = 2+b-5;
    // all are 2 except (4,2) = 0, (4,...) = 1, (4,programCount-1)=2
    if (a == 3.)
--- a/tests/array-mixed-unif-vary-indexing.ispc
+++ b/tests/array-mixed-unif-vary-indexing.ispc
@@ -8,7 +8,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform float x[47][47];
    for (uniform int i = 0; i < 47; ++i)
        for (uniform int j = 0; j < 47; ++j)
-            x[i][j] = 2;
+            x[i][j] = 2+b-5;
    x[a][b-1] = 0;
    RET[programIndex] = x[2][a];
--- a/tests/broadcast-2.ispc
+++ b/tests/broadcast-2.ispc
@@ -0,0 +1,12 @@
 // Test: broadcast() on a varying int16.  Every lane should receive the value
 // held by lane 2; result() expects 3 everywhere, which implies the harness
 // fills aFOO[i] with i+1 (harness input is not shown here -- confirm).
 export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    int16 a = aFOO[programIndex]; 
    int16 b = broadcast(a, 2);
    RET[programIndex] = b;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 3;
 }
--- a/tests/broadcast-3.ispc
+++ b/tests/broadcast-3.ispc
@@ -0,0 +1,12 @@
 // Test: broadcast() on a varying int8 where the source lane is computed at
 // run time from the uniform argument b.  result() expects 4 everywhere; with
 // aFOO[i] == i+1 that is lane 3, i.e. b == 5 (harness inputs are not shown
 // here -- confirm).
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    int8 a = aFOO[programIndex]; 
    int8 br = broadcast(a, (uniform int)b-2);
    RET[programIndex] = br;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 4;
 }
--- a/tests/gather-int16-1.ispc
+++ b/tests/gather-int16-1.ispc
@@ -0,0 +1,19 @@
 // Test: read from a uniform int16 array through a varying index (a gather,
 // per the test file name) under control flow.  Only lanes with
 // programIndex < 2 do the indexed read; all other lanes get the constant 2.
 // The expected values 0 and 1 imply a == programIndex, i.e. aFOO[i] == i+1
 // (harness input is not shown here -- confirm).
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform int16 x[programCount];
    x[programIndex] = programIndex;
    int a = aFOO[programIndex]-1;
    unsigned int16 v;
    if (programIndex < 2)
        v = x[a];
    else
        v = 2;
    RET[programIndex] = v;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 2;
    RET[0] = 0;
    RET[1] = 1;
 }
--- a/tests/gather-int16.ispc
+++ b/tests/gather-int16.ispc
@@ -0,0 +1,13 @@
 // Test: unconditional read from a uniform int16 array through a varying
 // index (a gather, per the test file name).  Each lane expects to read back
 // programIndex, which implies a == programIndex, i.e. aFOO[i] == i+1
 // (harness input is not shown here -- confirm).
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform int16 x[programCount];
    x[programIndex] = programIndex;
    int a = aFOO[programIndex]-1;
    unsigned int16 v = x[a];
    RET[programIndex] = v;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = programIndex;
 }
--- a/tests/gather-int8-1.ispc
+++ b/tests/gather-int8-1.ispc
@@ -0,0 +1,19 @@
 // Test: read from a uniform int8 array through a varying index (a gather,
 // per the test file name) under control flow.  Only lanes with
 // programIndex < 2 do the indexed read; all other lanes get the constant 2.
 // The expected values 0 and 1 imply a == programIndex, i.e. aFOO[i] == i+1
 // (harness input is not shown here -- confirm).
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform int8 x[programCount];
    x[programIndex] = programIndex;
    int a = aFOO[programIndex]-1;
    unsigned int8 v;
    if (programIndex < 2)
        v = x[a];
    else
        v = 2;
    RET[programIndex] = v;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 2;
    RET[0] = 0;
    RET[1] = 1;
 }
--- a/tests/gather-int8.ispc
+++ b/tests/gather-int8.ispc
@@ -0,0 +1,13 @@
 // Test: unconditional read from a uniform int8 array through a varying
 // index (a gather, per the test file name).  Each lane expects to read back
 // programIndex, which implies a == programIndex, i.e. aFOO[i] == i+1
 // (harness input is not shown here -- confirm).
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform int8 x[programCount];
    x[programIndex] = programIndex;
    int a = aFOO[programIndex]-1;
    unsigned int8 v = x[a];
    RET[programIndex] = v;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = programIndex;
 }
--- a/tests/int16-wrap.ispc
+++ b/tests/int16-wrap.ispc
@@ -0,0 +1,12 @@
 // Test: unsigned int16 arithmetic is expected to wrap modulo 2^16.  The
 // product 4000*a plus b is compared against the same expression masked with
 // 0xffff after each step.  The expected values use programIndex+1 for a and
 // 5 for b (harness inputs are not shown here -- confirm).
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bb) {
    unsigned int16 a = aFOO[programIndex], b = bb;
    RET[programIndex] = ((unsigned int16)4000*a)+b; 
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = (((4000*(programIndex+1))&0xffff)+5)&0xffff;
 }
--- a/tests/int8-wrap.ispc
+++ b/tests/int8-wrap.ispc
@@ -0,0 +1,12 @@
 // Test: unsigned int8 arithmetic is expected to wrap modulo 2^8.  The
 // product 100*a plus b is compared against the same expression masked with
 // 0xff after each step.  The expected values use programIndex+1 for a and
 // 5 for b (harness inputs are not shown here -- confirm).
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bb) {
    unsigned int8 a = aFOO[programIndex], b = bb;
    RET[programIndex] = ((unsigned int8)100*a)+b; 
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = (((100*(programIndex+1))&0xff)+5)&0xff;
 }
--- a/tests/load-int16-1.ispc
+++ b/tests/load-int16-1.ispc
@@ -1,13 +1,17 @@
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    uniform int x[9] = { 0x00020001, 0x00040003, 0x00060005, 0x00080007,
+    uniform int16 x[programCount];
-                         0x000a0009, 0x000c000b, 0x000e000d, 0x0010000f,
+    x[programIndex] = aFOO[programIndex];
-                         0x00120011 };
+    unsigned int16 v = 0;
-    unsigned int v = load_from_int16(x, 1);
+    if (programIndex & 1)
        v = x[programIndex];
    RET[programIndex] = v;
 }
 export void result(uniform float RET[]) {
-    RET[programIndex] = 2+programIndex;
+    if (programIndex & 1)
        RET[programIndex] = 1+programIndex;
    else
        RET[programIndex] = 0;
 }
--- a/tests/load-int16.ispc
+++ b/tests/load-int16.ispc
@@ -1,9 +1,9 @@
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    uniform int x[8] = { 0x00020001, 0x00040003, 0x00060005, 0x00080007,
+    uniform int16 x[programCount];
-                         0x000a0009, 0x000c000b, 0x000e000d, 0x0010000f };
+    x[programIndex] = aFOO[programIndex];
-    unsigned int v = load_from_int16(x, 0);
+    unsigned int16 v = x[programIndex];
    RET[programIndex] = v;
 }
--- a/tests/load-int8-1.ispc
+++ b/tests/load-int8-1.ispc
@@ -1,12 +1,17 @@
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    uniform int x[5] = { 0x04030201, 0x08070605, 0x0c0b0a09, 0x100f0e0d,
+    uniform int8 x[programCount];
-                         0x14131211 };
+    x[programIndex] = aFOO[programIndex];
-    unsigned int v = load_from_int8(x, 2);
+    unsigned int8 v = 0;
    if (programIndex & 1)
        v = x[programIndex];
    RET[programIndex] = v;
 }
 export void result(uniform float RET[]) {
-    RET[programIndex] = 3+programIndex;
+    if (programIndex & 1)
        RET[programIndex] = 1+programIndex;
    else
        RET[programIndex] = 0;
 }
--- a/tests/load-int8.ispc
+++ b/tests/load-int8.ispc
@@ -1,8 +1,9 @@
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    uniform int x[4] = { 0x04030201, 0x08070605, 0x0c0b0a09, 0x100f0e0d };
+    uniform int8 x[programCount];
-    unsigned int v = load_from_int8(x, 0);
+    x[programIndex] = aFOO[programIndex];
    unsigned int8 v = x[programIndex];
    RET[programIndex] = v;
 }
--- a/tests/nested-structs-2.ispc
+++ b/tests/nested-structs-2.ispc
@@ -16,7 +16,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform Bar bar;
    for (uniform int i = 0; i < 6; ++i)
        for (uniform int j = 0; j < 18; ++j)
-            bar.foo[i].f[j] = 2.;
+            bar.foo[i].f[j] = 2.+b-5;
    bar.foo[5].f[a] = a;
    RET[programIndex] = bar.foo[b].f[a];
--- a/tests/nested-structs.ispc
+++ b/tests/nested-structs.ispc
@@ -1,8 +1,6 @@
 export uniform int width() { return programCount; }
 struct Foo {
    float f[6];
 };
@@ -16,7 +14,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform Bar bar;
    for (uniform int i = 0; i < 6; ++i)
        for (uniform int j = 0; j < 6; ++j)
-            bar.foo[i].f[j] = 2.;
+            bar.foo[i].f[j] = 2.+b-5;
    RET[programIndex] = bar.foo[b].f[b];
 }
--- a/tests/op-plus-equals-ensure-one-lhs-eval.ispc
+++ b/tests/op-plus-equals-ensure-one-lhs-eval.ispc
@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }
 export void f_fu(uniform float ret[], uniform float aa[], uniform float b) {
    uniform float foo[16];
    for (uniform int i = 0; i < 16; ++i)
-        foo[i] = 1;
+        foo[i] = i;
    uniform int i = 0;
    foo[i++] += 1;
--- a/tests/pass-varying-lvalue-to-ref.ispc
+++ b/tests/pass-varying-lvalue-to-ref.ispc
@@ -6,10 +6,10 @@ void inc(reference float v) { ++v; }
 export void f_fu(uniform float ret[], uniform float aa[], uniform float b) {
    uniform float foo[32];
    for (uniform int i = 0; i < 32; ++i)
-        foo[i] = 10;
+        foo[i] = 10+i;
    int a = (int)aa[programIndex];
    inc(foo[a]);
-    ret[programIndex] = foo[programIndex];
+    ret[programIndex] = foo[programIndex]-programIndex;
 }
 export void result(uniform float ret[]) {
--- a/tests/rotate-5.ispc
+++ b/tests/rotate-5.ispc
@@ -0,0 +1,12 @@
 // Test: rotate() on a varying int8 with offset 2.  Lane i expects the value
 // from lane (i + 2) % programCount; the leading "1 +" in result() implies
 // aFOO[i] == i+1 (harness input is not shown here -- confirm).
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    int8 a = aFOO[programIndex]; 
    int8 rot = rotate(a, 2);
    RET[programIndex] = rot;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 1 + (programIndex + 2) % programCount;
 }
--- a/tests/rotate-6.ispc
+++ b/tests/rotate-6.ispc
@@ -0,0 +1,12 @@
 // Test: rotate() on a varying int16 with a negative offset (-1).  Lane i
 // expects the value from lane (i + programCount - 1) % programCount; the
 // leading "1 +" in result() implies aFOO[i] == i+1 (harness input is not
 // shown here -- confirm).
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    int16 a = aFOO[programIndex]; 
    int16 rot = rotate(a, -1);
    RET[programIndex] = rot;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 1 + (programIndex + programCount - 1) % programCount;
 }
--- a/tests/scatter-int16-1.ispc
+++ b/tests/scatter-int16-1.ispc
@@ -0,0 +1,17 @@
 // Test: write to a uniform int16 array through a varying index (a scatter,
 // per the test file name) under control flow.  Only lanes 0..2 write their
 // programIndex; every other element is expected to keep its initial -1.
 // The expected values imply a == programIndex, i.e. aFOO[i] == i+1 (harness
 // input is not shown here -- confirm).
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform int16 x[programCount];
    x[programIndex] = -1;
    int a = aFOO[programIndex]-1;
    if (programIndex < 3)
        x[a] = programIndex;
    RET[programIndex] = x[programIndex];
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = -1;
    RET[0] = 0;
    RET[1] = 1;
    RET[2] = 2;
 }
--- a/tests/scatter-int16.ispc
+++ b/tests/scatter-int16.ispc
@@ -0,0 +1,13 @@
 // Test: unconditional write to a uniform int16 array through a varying
 // index (a scatter, per the test file name).  Each lane expects to read
 // back programIndex afterwards, which implies a == programIndex, i.e.
 // aFOO[i] == i+1 (harness input is not shown here -- confirm).
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform int16 x[programCount];
    x[programIndex] = 0;
    int a = aFOO[programIndex]-1;
    x[a] = programIndex;
    RET[programIndex] = x[programIndex];
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = programIndex;
 }
--- a/tests/scatter-int8-1.ispc
+++ b/tests/scatter-int8-1.ispc
@@ -0,0 +1,17 @@
 // Test: write to a uniform int8 array through a varying index (a scatter,
 // per the test file name) under control flow.  Only lanes 0..2 write their
 // programIndex; every other element is expected to keep its initial -1.
 // The expected values imply a == programIndex, i.e. aFOO[i] == i+1 (harness
 // input is not shown here -- confirm).
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform int8 x[programCount];
    x[programIndex] = -1;
    int a = aFOO[programIndex]-1;
    if (programIndex < 3)
        x[a] = programIndex;
    RET[programIndex] = x[programIndex];
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = -1;
    RET[0] = 0;
    RET[1] = 1;
    RET[2] = 2;
 }
--- a/tests/scatter-int8.ispc
+++ b/tests/scatter-int8.ispc
@@ -0,0 +1,13 @@
 // Test: unconditional write to a uniform int8 array through a varying
 // index (a scatter, per the test file name).  Each lane expects to read
 // back programIndex afterwards, which implies a == programIndex, i.e.
 // aFOO[i] == i+1 (harness input is not shown here -- confirm).
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform int8 x[programCount];
    x[programIndex] = 0;
    int a = aFOO[programIndex]-1;
    x[a] = programIndex;
    RET[programIndex] = x[programIndex];
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = programIndex;
 }
--- a/tests/shuffle-3.ispc
+++ b/tests/shuffle-3.ispc
@@ -0,0 +1,12 @@
 // Test: one-vector shuffle() on a varying int8 with the constant index 1.
 // Every lane expects 2, i.e. the value of lane 1 if aFOO[i] == i+1 (harness
 // input is not shown here -- confirm).
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    int8 a = aFOO[programIndex]; 
    int8 shuf = shuffle(a, 1);
    RET[programIndex] = shuf;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 2;
 }
--- a/tests/shuffle-4.ispc
+++ b/tests/shuffle-4.ispc
@@ -0,0 +1,13 @@
 // Test: one-vector shuffle() on a varying int16 with a varying index that
 // reverses the lane order.  Lane i expects programCount - i, i.e. the value
 // of lane programCount-1-i if aFOO[i] == i+1 (harness input is not shown
 // here -- confirm).
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    int16 a = aFOO[programIndex]; 
    int reverse = programCount - 1 - programIndex;
    int16 shuf = shuffle(a, reverse);
    RET[programIndex] = shuf;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = programCount - programIndex;
 }
--- a/tests/shuffle-5.ispc
+++ b/tests/shuffle-5.ispc
@@ -0,0 +1,13 @@
 // Test: one-vector shuffle() on a varying int8 with a lane-reversing index
 // that also adds (int)b - 5.  The expected values match a plain reversal,
 // so that term must be 0 (b == 5); presumably it is there to keep the index
 // from being a compile-time constant.  Assumes aFOO[i] == i+1 (harness
 // inputs are not shown here -- confirm).
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    int8 a = aFOO[programIndex]; 
    int reverse = programCount - 1 - programIndex + (int)b - 5;
    int8 shuf = shuffle(a, reverse);
    RET[programIndex] = shuf;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = programCount - programIndex;
 }
--- a/tests/shuffle2-11.ispc
+++ b/tests/shuffle2-11.ispc
@@ -0,0 +1,13 @@
 // Test: two-vector shuffle() on varying int16 values with index
 // 2*programIndex.  bb holds aa + programCount, and lane i expects
 // 1 + 2*i, which is consistent with the index selecting from aa followed by
 // bb when aFOO[i] == i+1 (harness input is not shown here -- confirm).
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    int16 aa = aFOO[programIndex]; 
    int16 bb = aa + programCount;
    int16 shuf = shuffle(aa, bb, 2*programIndex);
    RET[programIndex] = shuf;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 1 + 2*programIndex;
 }
--- a/tests/shuffle2-6.ispc
+++ b/tests/shuffle2-6.ispc
@@ -0,0 +1,13 @@
 // Test: two-vector shuffle() on varying int8 values with the constant
 // index 1.  Every lane expects 2, i.e. element 1 of aa if aFOO[i] == i+1
 // (harness input is not shown here -- confirm).
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    int8 aa = aFOO[programIndex]; 
    int8 bb = aa + programCount;
    int8 shuf = shuffle(aa, bb, 1);
    RET[programIndex] = shuf;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 2;
 }
--- a/tests/shuffle2-7.ispc
+++ b/tests/shuffle2-7.ispc
@@ -0,0 +1,13 @@
 // Test: two-vector shuffle() on varying int16 values with the uniform index
 // programCount + 1.  Every lane expects 2 + programCount, which is element
 // 1 of bb if aFOO[i] == i+1 (harness input is not shown here -- confirm).
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    int16 aa = aFOO[programIndex]; 
    int16 bb = aa + programCount;
    int16 shuf = shuffle(aa, bb, programCount + 1);
    RET[programIndex] = shuf;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 2 + programCount;
 }
--- a/tests/shuffle2-8.ispc
+++ b/tests/shuffle2-8.ispc
@@ -0,0 +1,13 @@
 // Test: two-vector shuffle() on varying int8 values with the varying index
 // programIndex + 2, so the upper lanes select from bb.  Lane i expects
 // 3 + i, consistent with aFOO[i] == i+1 (harness input is not shown here --
 // confirm).
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    int8 aa = aFOO[programIndex]; 
    int8 bb = aa + programCount;
    int8 shuf = shuffle(aa, bb, programIndex + 2);
    RET[programIndex] = shuf;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 3 + programIndex;
 }
--- a/tests/shuffle2-9.ispc
+++ b/tests/shuffle2-9.ispc
@@ -0,0 +1,13 @@
 // Test: two-vector shuffle() on varying int16 values with the varying index
 // programIndex + 2 plus (int)b - 5.  The expected values 3 + i match an
 // index of programIndex + 2, so the extra term must be 0 (b == 5).  Assumes
 // aFOO[i] == i+1 (harness inputs are not shown here -- confirm).
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    int16 aa = aFOO[programIndex]; 
    int16 bb = aa + programCount;
    int16 shuf = shuffle(aa, bb, programIndex + 2 + (int)b - 5);
    RET[programIndex] = shuf;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 3 + programIndex;
 }
--- a/tests/store-int16-1.ispc
+++ b/tests/store-int16-1.ispc
@@ -1,16 +1,15 @@
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    uniform int x[16];
+    uniform unsigned int16 x[2*programCount];
-    for (uniform int i = 0; i < 16; ++i)
+    for (uniform int i = 0; i < 2*programCount; ++i)
-        x[i] = 0xffffffff;
+        x[i] = 0xffff;
-    unsigned int val = aFOO[programIndex];
+    unsigned int16 val = aFOO[programIndex];
-    store_to_int16(x, 5, val);
+    x[2+programIndex] = val;
-    unsigned int v = load_from_int16(x, 6);
+    RET[programIndex] = x[1+programIndex];
    RET[programIndex] = v;
 }
 export void result(uniform float RET[]) {
-    RET[programIndex] = 2+programIndex;
+    RET[programIndex] = programIndex;
-    RET[programCount-1] = (unsigned int)0xffffffff;
+    RET[0] = 65535;
 }
--- a/tests/store-int16-2.ispc
+++ b/tests/store-int16-2.ispc
@@ -0,0 +1,19 @@
 // Test: store to a uniform unsigned int16 array under a mask.  The array is
 // pre-filled with 0xffff; only odd lanes write val to x[2+programIndex],
 // then every lane reads x[1+programIndex].  Odd lanes and lane 0 read an
 // element nobody wrote and expect 65535; the other even lanes read what the
 // preceding odd lane wrote and expect programIndex, which implies
 // aFOO[i] == i+1 (harness input is not shown here -- confirm).
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform unsigned int16 x[2*programCount];
    for (uniform int i = 0; i < 2*programCount; ++i)
        x[i] = 0xffff;
    unsigned int16 val = aFOO[programIndex];
    if (programIndex & 1)
        x[2+programIndex] = val;
    RET[programIndex] = x[1+programIndex];
 }
 export void result(uniform float RET[]) {
    if (programIndex & 1)
        RET[programIndex] = 65535;
    else
        RET[programIndex] = programIndex;
    RET[0] = 65535;
 }
--- a/tests/store-int16.ispc
+++ b/tests/store-int16.ispc
@@ -1,16 +1,15 @@
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    uniform int x[16];
+    uniform int16 x[2*programCount];
-    for (uniform int i = 0; i < 16; ++i)
+    for (uniform int i = 0; i < 2*programCount; ++i)
-        x[i] = 0xffffffff;
+        x[i] = 0xffff;
-    unsigned int val = aFOO[programIndex];
+    unsigned int8 val = aFOO[programIndex];
-    store_to_int16(x, 5, val);
+    x[2+programIndex] = val;
-    int v = load_from_int16(x, 6);
+    RET[programIndex] = x[1+programIndex];
    RET[programIndex] = v;
 }
 export void result(uniform float RET[]) {
-    RET[programIndex] = 2+programIndex;
+    RET[programIndex] = programIndex;
-    RET[programCount-1] = -1;
+    RET[0] = -1.;
 }
--- a/tests/store-int8-1.ispc
+++ b/tests/store-int8-1.ispc
@@ -1,16 +1,15 @@
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    uniform unsigned int x[8];
+    uniform unsigned int8 x[2*programCount];
-    for (uniform int i = 0; i < 8; ++i)
+    for (uniform int i = 0; i < 2*programCount; ++i)
-        x[i] = 0xffffffff;
+        x[i] = 0xff;
-    unsigned int val = aFOO[programIndex];
+    unsigned int8 val = aFOO[programIndex];
-    store_to_uint8(x, 2, val);
+    x[2+programIndex] = val;
-    unsigned int v = load_from_uint8(x, 1);
+    RET[programIndex] = x[1+programIndex];
    RET[programIndex] = v;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = programIndex;
-    RET[0] = (unsigned int)0xff;
+    RET[0] = 255;
 }
--- a/tests/store-int8-2.ispc
+++ b/tests/store-int8-2.ispc
@@ -0,0 +1,19 @@
 // Test: store to a uniform unsigned int8 array under a mask.  The array is
 // pre-filled with 0xff; only odd lanes write val to x[2+programIndex], then
 // every lane reads x[1+programIndex].  Odd lanes and lane 0 read an element
 // nobody wrote and expect 255; the other even lanes read what the preceding
 // odd lane wrote and expect programIndex, which implies aFOO[i] == i+1
 // (harness input is not shown here -- confirm).
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform unsigned int8 x[2*programCount];
    for (uniform int i = 0; i < 2*programCount; ++i)
        x[i] = 0xff;
    unsigned int8 val = aFOO[programIndex];
    if (programIndex & 1)
        x[2+programIndex] = val;
    RET[programIndex] = x[1+programIndex];
 }
 export void result(uniform float RET[]) {
    if (programIndex & 1)
        RET[programIndex] = 255;
    else
        RET[programIndex] = programIndex;
    RET[0] = 255;
 }
--- a/tests/store-int8.ispc
+++ b/tests/store-int8.ispc
@@ -1,13 +1,12 @@
 export uniform int width() { return programCount; }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    uniform int x[8];
+    uniform int8 x[2*programCount];
-    for (uniform int i = 0; i < 8; ++i)
+    for (uniform int i = 0; i < 2*programCount; ++i)
-        x[i] = 0xffffffff;
+        x[i] = 0xff;
-    unsigned int val = aFOO[programIndex];
+    unsigned int8 val = aFOO[programIndex];
-    store_to_int8(x, 2, val);
+    x[2+programIndex] = val;
-    int v = load_from_int8(x, 1);
+    RET[programIndex] = x[1+programIndex];
    RET[programIndex] = v;
 }
 export void result(uniform float RET[]) {
--- a/tests/write-same-loc.ispc
+++ b/tests/write-same-loc.ispc
@@ -4,12 +4,12 @@ export uniform int width() { return programCount; }
 export void f_fu(uniform float ret[], uniform float aa[], uniform float b) {
    uniform int foo[10];
    for (uniform int i = 0; i < 10; ++i)
-        foo[i] = 10;
+        foo[i] = 10+i;
    int bb = b;
    foo[bb] = 0;
    ret[programIndex] = foo[4] + foo[5];
 }
 export void result(uniform float ret[]) {
-    ret[programIndex] = 10;
+    ret[programIndex] = 14;
 }
--- a/type.cpp
+++ b/type.cpp
@@ -74,6 +74,14 @@ lShouldPrintName(const std::string &name) {
 const AtomicType *AtomicType::UniformBool = new AtomicType(TYPE_BOOL, true, false);
 const AtomicType *AtomicType::VaryingBool = new AtomicType(TYPE_BOOL, false, false);
 const AtomicType *AtomicType::UniformInt8 = new AtomicType(TYPE_INT8, true, false);
 const AtomicType *AtomicType::VaryingInt8 = new AtomicType(TYPE_INT8, false, false);
 const AtomicType *AtomicType::UniformUInt8 = new AtomicType(TYPE_UINT8, true, false);
 const AtomicType *AtomicType::VaryingUInt8 = new AtomicType(TYPE_UINT8, false, false);
 const AtomicType *AtomicType::UniformInt16 = new AtomicType(TYPE_INT16, true, false);
 const AtomicType *AtomicType::VaryingInt16 = new AtomicType(TYPE_INT16, false, false);
 const AtomicType *AtomicType::UniformUInt16 = new AtomicType(TYPE_UINT16, true, false);
 const AtomicType *AtomicType::VaryingUInt16 = new AtomicType(TYPE_UINT16, false, false);
 const AtomicType *AtomicType::UniformInt32 = new AtomicType(TYPE_INT32, true, false);
 const AtomicType *AtomicType::VaryingInt32 = new AtomicType(TYPE_INT32, false, false);
 const AtomicType *AtomicType::UniformUInt32 = new AtomicType(TYPE_UINT32, true, false);
@@ -89,6 +97,14 @@ const AtomicType *AtomicType::VaryingDouble = new AtomicType(TYPE_DOUBLE, false,
 const AtomicType *AtomicType::UniformConstBool = new AtomicType(TYPE_BOOL, true, true);
 const AtomicType *AtomicType::VaryingConstBool = new AtomicType(TYPE_BOOL, false, true);
 const AtomicType *AtomicType::UniformConstInt8 = new AtomicType(TYPE_INT8, true, true);
 const AtomicType *AtomicType::VaryingConstInt8 = new AtomicType(TYPE_INT8, false, true);
 const AtomicType *AtomicType::UniformConstUInt8 = new AtomicType(TYPE_UINT8, true, true);
 const AtomicType *AtomicType::VaryingConstUInt8 = new AtomicType(TYPE_UINT8, false, true);
 const AtomicType *AtomicType::UniformConstInt16 = new AtomicType(TYPE_INT16, true, true);
 const AtomicType *AtomicType::VaryingConstInt16 = new AtomicType(TYPE_INT16, false, true);
 const AtomicType *AtomicType::UniformConstUInt16 = new AtomicType(TYPE_UINT16, true, true);
 const AtomicType *AtomicType::VaryingConstUInt16 = new AtomicType(TYPE_UINT16, false, true);
 const AtomicType *AtomicType::UniformConstInt32 = new AtomicType(TYPE_INT32, true, true);
 const AtomicType *AtomicType::VaryingConstInt32 = new AtomicType(TYPE_INT32, false, true);
 const AtomicType *AtomicType::UniformConstUInt32 = new AtomicType(TYPE_UINT32, true, true);
@@ -101,6 +117,7 @@ const AtomicType *AtomicType::UniformConstUInt64 = new AtomicType(TYPE_UINT64, t
 const AtomicType *AtomicType::VaryingConstUInt64 = new AtomicType(TYPE_UINT64, false, true);
 const AtomicType *AtomicType::UniformConstDouble = new AtomicType(TYPE_DOUBLE, true, true);
 const AtomicType *AtomicType::VaryingConstDouble = new AtomicType(TYPE_DOUBLE, false, true);
 const AtomicType *AtomicType::Void = new AtomicType(TYPE_VOID, true, false);
@@ -123,14 +140,17 @@ AtomicType::IsFloatType() const {
 bool
 AtomicType::IsIntType() const {
-    return (basicType == TYPE_INT32 || basicType == TYPE_UINT32 ||
+    return (basicType == TYPE_INT8  || basicType == TYPE_UINT8  ||
            basicType == TYPE_INT16 || basicType == TYPE_UINT16 ||
            basicType == TYPE_INT32 || basicType == TYPE_UINT32 ||
            basicType == TYPE_INT64 || basicType == TYPE_UINT64);
 }
 bool
 AtomicType::IsUnsignedType() const {
-    return (basicType == TYPE_UINT32 || basicType == TYPE_UINT64);
+    return (basicType == TYPE_UINT8  || basicType == TYPE_UINT16 ||
            basicType == TYPE_UINT32 || basicType == TYPE_UINT64);
 }
@@ -151,10 +171,18 @@ AtomicType::GetAsUnsignedType() const {
    if (IsUnsignedType()) 
        return this;
-    if (this == AtomicType::UniformInt32)           return AtomicType::UniformUInt32;
+    if      (this == AtomicType::UniformInt8)       return AtomicType::UniformUInt8;
    else if (this == AtomicType::VaryingInt8)       return AtomicType::VaryingUInt8;
    else if (this == AtomicType::UniformInt16)      return AtomicType::UniformUInt16;
    else if (this == AtomicType::VaryingInt16)      return AtomicType::VaryingUInt16;
    else if (this == AtomicType::UniformInt32)      return AtomicType::UniformUInt32;
    else if (this == AtomicType::VaryingInt32)      return AtomicType::VaryingUInt32;
    else if (this == AtomicType::UniformInt64)      return AtomicType::UniformUInt64;
    else if (this == AtomicType::VaryingInt64)      return AtomicType::VaryingUInt64;
    else if (this == AtomicType::UniformConstInt8)  return AtomicType::UniformConstUInt8;
    else if (this == AtomicType::VaryingConstInt8)  return AtomicType::VaryingConstUInt8;
    else if (this == AtomicType::UniformConstInt16) return AtomicType::UniformConstUInt16;
    else if (this == AtomicType::VaryingConstInt16) return AtomicType::VaryingConstUInt16;
    else if (this == AtomicType::UniformConstInt32) return AtomicType::UniformConstUInt32;
    else if (this == AtomicType::VaryingConstInt32) return AtomicType::VaryingConstUInt32;
    else if (this == AtomicType::UniformConstInt64) return AtomicType::UniformConstUInt64;
@@ -170,6 +198,10 @@ AtomicType::GetAsConstType() const {
    switch (basicType) {
    case TYPE_BOOL:    return isUniform ? UniformConstBool   : VaryingConstBool;
    case TYPE_INT8:    return isUniform ? UniformConstInt8   : VaryingConstInt8;
    case TYPE_UINT8:   return isUniform ? UniformConstUInt8  : VaryingConstUInt8;
    case TYPE_INT16:   return isUniform ? UniformConstInt16  : VaryingConstInt16;
    case TYPE_UINT16:  return isUniform ? UniformConstUInt16 : VaryingConstUInt16;
    case TYPE_INT32:   return isUniform ? UniformConstInt32  : VaryingConstInt32;
    case TYPE_UINT32:  return isUniform ? UniformConstUInt32 : VaryingConstUInt32;
    case TYPE_FLOAT:   return isUniform ? UniformConstFloat  : VaryingConstFloat;
@@ -190,6 +222,10 @@ AtomicType::GetAsNonConstType() const {
    switch (basicType) {
    case TYPE_BOOL:    return isUniform ? UniformBool   : VaryingBool;
    case TYPE_INT8:    return isUniform ? UniformInt8   : VaryingInt8;
    case TYPE_UINT8:   return isUniform ? UniformUInt8  : VaryingUInt8;
    case TYPE_INT16:   return isUniform ? UniformInt16  : VaryingInt16;
    case TYPE_UINT16:  return isUniform ? UniformUInt16 : VaryingUInt16;
    case TYPE_INT32:   return isUniform ? UniformInt32  : VaryingInt32;
    case TYPE_UINT32:  return isUniform ? UniformUInt32 : VaryingUInt32;
    case TYPE_FLOAT:   return isUniform ? UniformFloat  : VaryingFloat;
@@ -216,13 +252,17 @@ AtomicType::GetAsVaryingType() const {
    switch (basicType) {
    case TYPE_VOID:   return this;
-    case TYPE_BOOL:   return isConst ? AtomicType::VaryingConstBool   : AtomicType::VaryingBool;
+    case TYPE_BOOL:   return isConst ? VaryingConstBool   : VaryingBool;
-    case TYPE_INT32:  return isConst ? AtomicType::VaryingConstInt32  : AtomicType::VaryingInt32;
+    case TYPE_INT8:   return isConst ? VaryingConstInt8   : VaryingInt8;
-    case TYPE_UINT32: return isConst ? AtomicType::VaryingConstUInt32 : AtomicType::VaryingUInt32;
+    case TYPE_UINT8:  return isConst ? VaryingConstUInt8  : VaryingUInt8;
-    case TYPE_FLOAT:  return isConst ? AtomicType::VaryingConstFloat  : AtomicType::VaryingFloat;
+    case TYPE_INT16:  return isConst ? VaryingConstInt16  : VaryingInt16;
-    case TYPE_INT64:  return isConst ? AtomicType::VaryingConstInt64  : AtomicType::VaryingInt64;
+    case TYPE_UINT16: return isConst ? VaryingConstUInt16 : VaryingUInt16;
-    case TYPE_UINT64: return isConst ? AtomicType::VaryingConstUInt64 : AtomicType::VaryingUInt64;
+    case TYPE_INT32:  return isConst ? VaryingConstInt32  : VaryingInt32;
-    case TYPE_DOUBLE: return isConst ? AtomicType::VaryingConstDouble : AtomicType::VaryingDouble;
+    case TYPE_UINT32: return isConst ? VaryingConstUInt32 : VaryingUInt32;
    case TYPE_FLOAT:  return isConst ? VaryingConstFloat  : VaryingFloat;
    case TYPE_INT64:  return isConst ? VaryingConstInt64  : VaryingInt64;
    case TYPE_UINT64: return isConst ? VaryingConstUInt64 : VaryingUInt64;
    case TYPE_DOUBLE: return isConst ? VaryingConstDouble : VaryingDouble;
    default:          FATAL("Logic error in AtomicType::GetAsVaryingType()");
    }
    return NULL;
@@ -236,13 +276,17 @@ AtomicType::GetAsUniformType() const {
    switch (basicType) {
    case TYPE_VOID:   return this;
-    case TYPE_BOOL:   return isConst ? AtomicType::UniformConstBool : AtomicType::UniformBool;
+    case TYPE_BOOL:   return isConst ? UniformConstBool   : UniformBool;
-    case TYPE_INT32:  return isConst ? AtomicType::UniformConstInt32 : AtomicType::UniformInt32;
+    case TYPE_INT8:   return isConst ? UniformConstInt8   : UniformInt8;
-    case TYPE_UINT32: return isConst ? AtomicType::UniformConstUInt32 : AtomicType::UniformUInt32;
+    case TYPE_UINT8:  return isConst ? UniformConstUInt8  : UniformUInt8;
-    case TYPE_FLOAT:  return isConst ? AtomicType::UniformConstFloat : AtomicType::UniformFloat;
+    case TYPE_INT16:  return isConst ? UniformConstInt16  : UniformInt16;
-    case TYPE_INT64:  return isConst ? AtomicType::UniformConstInt64 : AtomicType::UniformInt64;
+    case TYPE_UINT16: return isConst ? UniformConstUInt16 : UniformUInt16;
-    case TYPE_UINT64: return isConst ? AtomicType::UniformConstUInt64 : AtomicType::UniformUInt64;
+    case TYPE_INT32:  return isConst ? UniformConstInt32  : UniformInt32;
-    case TYPE_DOUBLE: return isConst ? AtomicType::UniformConstDouble : AtomicType::UniformDouble;
+    case TYPE_UINT32: return isConst ? UniformConstUInt32 : UniformUInt32;
    case TYPE_FLOAT:  return isConst ? UniformConstFloat  : UniformFloat;
    case TYPE_INT64:  return isConst ? UniformConstInt64  : UniformInt64;
    case TYPE_UINT64: return isConst ? UniformConstUInt64 : UniformUInt64;
    case TYPE_DOUBLE: return isConst ? UniformConstDouble : UniformDouble;
    default:          FATAL("Logic error in AtomicType::GetAsUniformType()");
    }
    return NULL;
@@ -267,6 +311,10 @@ AtomicType::GetString() const {
    switch (basicType) {
    case TYPE_VOID:   ret += "void";            break;
    case TYPE_BOOL:   ret += "bool";            break;
    case TYPE_INT8:   ret += "int8";            break;
    case TYPE_UINT8:  ret += "unsigned int8";   break;
    case TYPE_INT16:  ret += "int16";           break;
    case TYPE_UINT16: ret += "unsigned int16";  break;
    case TYPE_INT32:  ret += "int32";           break;
    case TYPE_UINT32: ret += "unsigned int32";  break;
    case TYPE_FLOAT:  ret += "float";           break;
@@ -288,6 +336,10 @@ AtomicType::Mangle() const {
    switch (basicType) {
    case TYPE_VOID:   ret += "v"; break;
    case TYPE_BOOL:   ret += "b"; break;
    case TYPE_INT8:   ret += "t"; break;
    case TYPE_UINT8:  ret += "T"; break;
    case TYPE_INT16:  ret += "s"; break;
    case TYPE_UINT16: ret += "S"; break;
    case TYPE_INT32:  ret += "i"; break;
    case TYPE_UINT32: ret += "u"; break;
    case TYPE_FLOAT:  ret += "f"; break;
@@ -309,12 +361,16 @@ AtomicType::GetCDeclaration(const std::string &name) const {
    switch (basicType) {
    case TYPE_VOID:   ret += "void";     break;
    case TYPE_BOOL:   ret += "bool";     break;
    case TYPE_INT8:   ret += "int8_t";   break;
    case TYPE_UINT8:  ret += "uint8_t";  break;
    case TYPE_INT16:  ret += "int16_t";  break;
    case TYPE_UINT16: ret += "uint16_t"; break;
    case TYPE_INT32:  ret += "int32_t";  break;
    case TYPE_UINT32: ret += "uint32_t"; break;
    case TYPE_FLOAT:  ret += "float";    break;
-    case TYPE_DOUBLE: ret += "double";   break;
    case TYPE_INT64:  ret += "int64_t";  break;
    case TYPE_UINT64: ret += "uint64_t"; break;
    case TYPE_DOUBLE: ret += "double";   break;
    default: FATAL("Logic error in AtomicType::GetCDeclaration()");
    }
@@ -333,6 +389,12 @@ AtomicType::LLVMType(llvm::LLVMContext *ctx) const {
        return llvm::Type::getVoidTy(*ctx);
    case TYPE_BOOL:
        return isUniform ? LLVMTypes::BoolType : LLVMTypes::BoolVectorType;
    case TYPE_INT8:
    case TYPE_UINT8:
        return isUniform ? LLVMTypes::Int8Type : LLVMTypes::Int8VectorType;
    case TYPE_INT16:
    case TYPE_UINT16:
        return isUniform ? LLVMTypes::Int16Type : LLVMTypes::Int16VectorType;
    case TYPE_INT32:
    case TYPE_UINT32:
        return isUniform ? LLVMTypes::Int32Type : LLVMTypes::Int32VectorType;
@@ -364,6 +426,22 @@ AtomicType::GetDIType(llvm::DIDescriptor scope) const {
            return m->diBuilder->createBasicType("bool", 32 /* size */, 32 /* align */,
                                                 llvm::dwarf::DW_ATE_unsigned);
            break;
        case TYPE_INT8:
            return m->diBuilder->createBasicType("int8", 8 /* size */, 8 /* align */,
                                                 llvm::dwarf::DW_ATE_signed);
            break;
        case TYPE_UINT8:
            return m->diBuilder->createBasicType("uint8", 8 /* size */, 8 /* align */,
                                                 llvm::dwarf::DW_ATE_unsigned);
            break;
        case TYPE_INT16:
            return m->diBuilder->createBasicType("int16", 16 /* size */, 16 /* align */,
                                                 llvm::dwarf::DW_ATE_signed);
            break;
        case TYPE_UINT16:
            return m->diBuilder->createBasicType("uint16", 16 /* size */, 16 /* align */,
                                                 llvm::dwarf::DW_ATE_unsigned);
            break;
        case TYPE_INT32:
            return m->diBuilder->createBasicType("int32", 32 /* size */, 32 /* align */,
                                                 llvm::dwarf::DW_ATE_signed);
--- a/type.h
+++ b/type.h
@@ -210,6 +210,10 @@ public:
    enum BasicType {
        TYPE_VOID,
        TYPE_BOOL,
        TYPE_INT8,
        TYPE_UINT8,
        TYPE_INT16,
        TYPE_UINT16,
        TYPE_INT32,
        TYPE_UINT32,
        TYPE_FLOAT,
@@ -221,14 +225,22 @@ public:
    const BasicType basicType;
    static const AtomicType *UniformBool, *VaryingBool;
    static const AtomicType *UniformInt8, *VaryingInt8;
    static const AtomicType *UniformInt16, *VaryingInt16;
    static const AtomicType *UniformInt32, *VaryingInt32;
    static const AtomicType *UniformUInt8, *VaryingUInt8;
    static const AtomicType *UniformUInt16, *VaryingUInt16;
    static const AtomicType *UniformUInt32, *VaryingUInt32;
    static const AtomicType *UniformFloat, *VaryingFloat;
    static const AtomicType *UniformInt64, *VaryingInt64;
    static const AtomicType *UniformUInt64, *VaryingUInt64;
    static const AtomicType *UniformDouble, *VaryingDouble;
    static const AtomicType *UniformConstBool, *VaryingConstBool;
    static const AtomicType *UniformConstInt8, *VaryingConstInt8;
    static const AtomicType *UniformConstInt16, *VaryingConstInt16;
    static const AtomicType *UniformConstInt32, *VaryingConstInt32;
    static const AtomicType *UniformConstUInt8, *VaryingConstUInt8;
    static const AtomicType *UniformConstUInt16, *VaryingConstUInt16;
    static const AtomicType *UniformConstUInt32, *VaryingConstUInt32;
    static const AtomicType *UniformConstFloat, *VaryingConstFloat;
    static const AtomicType *UniformConstInt64, *VaryingConstInt64;