Add support for int8/int16 types. Addresses issues #9 and #42.

Matt Pharr
2011-07-21 06:57:40 +01:00
parent 2d573acd17
commit bba7211654
64 changed files with 2317 additions and 885 deletions


@@ -15,7 +15,7 @@ LLVM_CXXFLAGS=$(shell llvm-config --cppflags)
LLVM_VERSION_DEF=-DLLVM_$(shell llvm-config --version | sed s/\\./_/)
BUILD_DATE=$(shell date +%Y%m%d)
BUILD_VERSION=$(shell git log --abbrev-commit --abbrev=16 | head -1)
CXX=g++
CPP=cpp


@@ -78,8 +78,14 @@ static const Type *
lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
if (t == LLVMTypes::VoidType)
return AtomicType::Void;
// uniform
else if (t == LLVMTypes::BoolType)
return AtomicType::UniformBool;
else if (t == LLVMTypes::Int8Type)
return intAsUnsigned ? AtomicType::UniformUInt8 : AtomicType::UniformInt8;
else if (t == LLVMTypes::Int16Type)
return intAsUnsigned ? AtomicType::UniformUInt16 : AtomicType::UniformInt16;
else if (t == LLVMTypes::Int32Type)
return intAsUnsigned ? AtomicType::UniformUInt32 : AtomicType::UniformInt32;
else if (t == LLVMTypes::FloatType)
@@ -88,6 +94,12 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
return AtomicType::UniformDouble;
else if (t == LLVMTypes::Int64Type)
return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64;
// varying
else if (t == LLVMTypes::Int8VectorType)
return intAsUnsigned ? AtomicType::VaryingUInt8 : AtomicType::VaryingInt8;
else if (t == LLVMTypes::Int16VectorType)
return intAsUnsigned ? AtomicType::VaryingUInt16 : AtomicType::VaryingInt16;
else if (t == LLVMTypes::Int32VectorType)
return intAsUnsigned ? AtomicType::VaryingUInt32 : AtomicType::VaryingInt32;
else if (t == LLVMTypes::FloatVectorType)
@@ -96,6 +108,14 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
return AtomicType::VaryingDouble;
else if (t == LLVMTypes::Int64VectorType)
return intAsUnsigned ? AtomicType::VaryingUInt64 : AtomicType::VaryingInt64;
// pointers to uniform
else if (t == LLVMTypes::Int8PointerType)
return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt8 :
AtomicType::UniformInt8, false);
else if (t == LLVMTypes::Int16PointerType)
return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt16 :
AtomicType::UniformInt16, false);
else if (t == LLVMTypes::Int32PointerType)
return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt32 :
AtomicType::UniformInt32, false);
@@ -106,6 +126,14 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
return new ReferenceType(AtomicType::UniformFloat, false);
else if (t == LLVMTypes::DoublePointerType)
return new ReferenceType(AtomicType::UniformDouble, false);
// pointers to varying
else if (t == LLVMTypes::Int8VectorPointerType)
return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt8 :
AtomicType::VaryingInt8, false);
else if (t == LLVMTypes::Int16VectorPointerType)
return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt16 :
AtomicType::VaryingInt16, false);
else if (t == LLVMTypes::Int32VectorPointerType)
return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt32 :
AtomicType::VaryingInt32, false);
@@ -116,6 +144,8 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
return new ReferenceType(AtomicType::VaryingFloat, false);
else if (t == LLVMTypes::DoubleVectorPointerType)
return new ReferenceType(AtomicType::VaryingDouble, false);
// arrays
else if (llvm::isa<const llvm::PointerType>(t)) {
const llvm::PointerType *pt = llvm::dyn_cast<const llvm::PointerType>(t);
@@ -239,10 +269,49 @@ lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) {
}
}
static void
lDeclarePG(llvm::Module *module, LLVM_TYPE_CONST llvm::Type *vecType,
const char *name) {
SourcePos noPos;
noPos.name = "__stdlib";
std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
argTypes.push_back(LLVMTypes::VoidPointerVectorType);
argTypes.push_back(LLVMTypes::MaskType);
llvm::FunctionType *fType = llvm::FunctionType::get(vecType, argTypes, false);
llvm::Function *func =
llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
name, module);
func->setOnlyReadsMemory(true);
func->setDoesNotThrow(true);
}
static void
lDeclarePGBO(llvm::Module *module, LLVM_TYPE_CONST llvm::Type *vecType,
const char *name) {
std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
argTypes.push_back(LLVMTypes::VoidPointerType);
argTypes.push_back(LLVMTypes::Int32VectorType);
argTypes.push_back(LLVMTypes::MaskType);
llvm::FunctionType *fType = llvm::FunctionType::get(vecType, argTypes, false);
llvm::Function *func =
llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
name, module);
func->setOnlyReadsMemory(true);
func->setDoesNotThrow(true);
}
/** Declare the 'pseudo-gather' functions. When the ispc front-end needs
to perform a gather, it generates a call to one of these functions,
which have signatures:
varying int8 __pseudo_gather(varying int8 *, mask)
varying int16 __pseudo_gather(varying int16 *, mask)
varying int32 __pseudo_gather(varying int32 *, mask)
varying int64 __pseudo_gather(varying int64 *, mask)
@@ -253,6 +322,10 @@ lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) {
front-end to be relatively simple in how it emits address calculation
for gathers.
varying int8 __pseudo_gather_base_offsets_8(uniform int8 *base,
int32 offsets, mask)
varying int16 __pseudo_gather_base_offsets_16(uniform int16 *base,
int32 offsets, mask)
varying int32 __pseudo_gather_base_offsets_32(uniform int32 *base,
int32 offsets, mask)
varying int64 __pseudo_gather_base_offsets_64(uniform int64 *base,
@@ -264,49 +337,54 @@ lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) {
*/
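// For illustration only (hypothetical names): ispc source that reads
// through a varying index is what causes the front-end to emit one of
// these calls, e.g.
//
//     uniform int16 a[64];
//     int i = ...;          // varying index, one value per program instance
//     int16 x = a[i];       // front-end emits __pseudo_gather_16(ptrs, mask)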
static void
lDeclarePseudoGathers(llvm::Module *module) {
SourcePos noPos;
noPos.name = "__stdlib";
lDeclarePG(module, LLVMTypes::Int8VectorType, "__pseudo_gather_8");
lDeclarePG(module, LLVMTypes::Int16VectorType, "__pseudo_gather_16");
lDeclarePG(module, LLVMTypes::Int32VectorType, "__pseudo_gather_32");
lDeclarePG(module, LLVMTypes::Int64VectorType, "__pseudo_gather_64");
lDeclarePGBO(module, LLVMTypes::Int8VectorType,
"__pseudo_gather_base_offsets_8");
lDeclarePGBO(module, LLVMTypes::Int16VectorType,
"__pseudo_gather_base_offsets_16");
lDeclarePGBO(module, LLVMTypes::Int32VectorType,
"__pseudo_gather_base_offsets_32");
lDeclarePGBO(module, LLVMTypes::Int64VectorType,
"__pseudo_gather_base_offsets_64");
}
static void
lDeclarePS(llvm::Module *module, LLVM_TYPE_CONST llvm::Type *vecType,
const char *name) {
std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
argTypes.push_back(LLVMTypes::VoidPointerVectorType);
argTypes.push_back(vecType);
argTypes.push_back(LLVMTypes::MaskType);
llvm::FunctionType *fType =
llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
llvm::Function *func =
llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
name, module);
func->setDoesNotThrow(true);
}
static void
lDeclarePSBO(llvm::Module *module, LLVM_TYPE_CONST llvm::Type *vecType,
const char *name) {
std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
argTypes.push_back(LLVMTypes::VoidPointerType);
argTypes.push_back(LLVMTypes::Int32VectorType);
argTypes.push_back(vecType);
argTypes.push_back(LLVMTypes::MaskType);
llvm::FunctionType *fType =
llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
llvm::Function *func =
llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
name, module);
func->setDoesNotThrow(true);
}
@@ -314,16 +392,22 @@ lDeclarePseudoGathers(llvm::Module *module) {
we also declare (but never define) pseudo-scatter instructions with
signatures:
void __pseudo_scatter_8 (varying int8 *, varying int8 values, mask)
void __pseudo_scatter_16(varying int16 *, varying int16 values, mask)
void __pseudo_scatter_32(varying int32 *, varying int32 values, mask)
void __pseudo_scatter_64(varying int64 *, varying int64 values, mask)
The GatherScatterFlattenOpt optimization pass also finds these and
transforms them to scatters like:
void __pseudo_scatter_base_offsets_8(uniform int8 *base,
varying int32 offsets, varying int8 values, mask)
void __pseudo_scatter_base_offsets_16(uniform int16 *base,
varying int32 offsets, varying int16 values, mask)
void __pseudo_scatter_base_offsets_32(uniform int32 *base,
varying int32 offsets, varying int32 values, mask)
void __pseudo_scatter_base_offsets_64(uniform int64 *base,
varying int32 offsets, varying int64 values, mask)
And the GSImprovementsPass in turn converts these to actual native
scatters or masked stores.
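For illustration (hypothetical names), a store through a varying index
is what triggers this path:

    uniform float a[64];
    int i = ...;   // varying index
    a[i] = x;      // emitted as __pseudo_scatter_32(ptrs, values, mask)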
@@ -333,67 +417,49 @@ lDeclarePseudoScatters(llvm::Module *module) {
SourcePos noPos;
noPos.name = "__stdlib";
lDeclarePS(module, LLVMTypes::Int8VectorType, "__pseudo_scatter_8");
lDeclarePS(module, LLVMTypes::Int16VectorType, "__pseudo_scatter_16");
lDeclarePS(module, LLVMTypes::Int32VectorType, "__pseudo_scatter_32");
lDeclarePS(module, LLVMTypes::Int64VectorType, "__pseudo_scatter_64");
lDeclarePSBO(module, LLVMTypes::Int8VectorType,
"__pseudo_scatter_base_offsets_8");
lDeclarePSBO(module, LLVMTypes::Int16VectorType,
"__pseudo_scatter_base_offsets_16");
lDeclarePSBO(module, LLVMTypes::Int32VectorType,
"__pseudo_scatter_base_offsets_32");
lDeclarePSBO(module, LLVMTypes::Int64VectorType,
"__pseudo_scatter_base_offsets_64");
}
static void
lDeclarePMS(llvm::Module *module, LLVM_TYPE_CONST llvm::Type *lvalueType,
LLVM_TYPE_CONST llvm::Type *rvalueType, const char *name) {
SourcePos noPos;
noPos.name = "__stdlib";
std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
argTypes.push_back(lvalueType);
argTypes.push_back(rvalueType);
argTypes.push_back(LLVMTypes::MaskType);
llvm::FunctionType *fType =
llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false);
llvm::Function *func =
llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage,
name, module);
func->setDoesNotThrow(true);
func->addFnAttr(llvm::Attribute::AlwaysInline);
func->setDoesNotCapture(1, true);
}
/** This function declares placeholder masked store functions for the
front-end to use.
void __pseudo_masked_store_8 (uniform int8 *ptr, varying int8 values, mask)
void __pseudo_masked_store_16(uniform int16 *ptr, varying int16 values, mask)
void __pseudo_masked_store_32(uniform int32 *ptr, varying int32 values, mask)
void __pseudo_masked_store_64(uniform int64 *ptr, varying int64 values, mask)
@@ -403,40 +469,14 @@ lDeclarePseudoScatters(llvm::Module *module) {
*/
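// For illustration only (hypothetical names): a masked store arises
// from a store executed under varying control flow, where only the
// active program instances may write, e.g.
//
//     int16 v = 0;
//     if (x > 0)     // varying condition
//         v = x;     // becomes __pseudo_masked_store_16(ptr, value, mask)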
static void
lDeclarePseudoMaskedStore(llvm::Module *module) {
SourcePos noPos;
noPos.name = "__stdlib";
lDeclarePMS(module, LLVMTypes::Int8VectorPointerType,
LLVMTypes::Int8VectorType, "__pseudo_masked_store_8");
lDeclarePMS(module, LLVMTypes::Int16VectorPointerType,
LLVMTypes::Int16VectorType, "__pseudo_masked_store_16");
lDeclarePMS(module, LLVMTypes::Int32VectorPointerType,
LLVMTypes::Int32VectorType, "__pseudo_masked_store_32");
lDeclarePMS(module, LLVMTypes::Int64VectorPointerType,
LLVMTypes::Int64VectorType, "__pseudo_masked_store_64");
}
@@ -609,8 +649,8 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
// needed by the compiled program.
{
std::vector<LLVM_TYPE_CONST llvm::Type *> argTypes;
argTypes.push_back(LLVMTypes::VoidPointerType);
argTypes.push_back(LLVMTypes::VoidPointerType);
argTypes.push_back(LLVMTypes::Int32Type);
argTypes.push_back(LLVMTypes::Int32Type);
llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType,

ctx.cpp

@@ -1448,17 +1448,20 @@ FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type,
llvm::Value *mask = GetMask();
llvm::Function *gather = NULL;
// Figure out which gather function to call based on the size of
// the elements.
if (retType == LLVMTypes::DoubleVectorType ||
retType == LLVMTypes::Int64VectorType)
gather = m->module->getFunction("__pseudo_gather_64");
else if (retType == LLVMTypes::FloatVectorType ||
retType == LLVMTypes::Int32VectorType)
gather = m->module->getFunction("__pseudo_gather_32");
else if (retType == LLVMTypes::Int16VectorType)
gather = m->module->getFunction("__pseudo_gather_16");
else {
assert(retType == LLVMTypes::Int8VectorType);
gather = m->module->getFunction("__pseudo_gather_8");
}
assert(gather != NULL);
llvm::Value *voidlvalue = BitCastInst(lvalue, LLVMTypes::VoidPointerType);
llvm::Instruction *call = CallInst(gather, voidlvalue, mask, name);
@@ -1578,9 +1581,7 @@ FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
rvalueType = rvalueType->GetAsNonConstType();
llvm::Function *maskedStoreFunc = NULL;
// Figure out if we need an 8, 16, 32 or 64-bit masked store.
if (rvalueType == AtomicType::VaryingDouble ||
rvalueType == AtomicType::VaryingInt64 ||
rvalueType == AtomicType::VaryingUInt64) {
@@ -1590,13 +1591,11 @@ FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
rvalue = BitCastInst(rvalue, LLVMTypes::Int64VectorType,
"rvalue_to_int64");
}
else if (rvalueType == AtomicType::VaryingFloat ||
rvalueType == AtomicType::VaryingBool ||
rvalueType == AtomicType::VaryingInt32 ||
rvalueType == AtomicType::VaryingUInt32 ||
dynamic_cast<const EnumType *>(rvalueType) != NULL) {
maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_32");
lvalue = BitCastInst(lvalue, LLVMTypes::Int32VectorPointerType,
"lvalue_to_int32vecptr");
@@ -1604,6 +1603,18 @@ FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
rvalue = BitCastInst(rvalue, LLVMTypes::Int32VectorType,
"rvalue_to_int32");
}
else if (rvalueType == AtomicType::VaryingInt16 ||
rvalueType == AtomicType::VaryingUInt16) {
maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_16");
lvalue = BitCastInst(lvalue, LLVMTypes::Int16VectorPointerType,
"lvalue_to_int16vecptr");
}
else if (rvalueType == AtomicType::VaryingInt8 ||
rvalueType == AtomicType::VaryingUInt8) {
maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_8");
lvalue = BitCastInst(lvalue, LLVMTypes::Int8VectorPointerType,
"lvalue_to_int8vecptr");
}
std::vector<llvm::Value *> args;
args.push_back(lvalue);
@@ -1668,14 +1679,15 @@ FunctionEmitContext::scatter(llvm::Value *rvalue, llvm::Value *lvalue,
func = m->module->getFunction("__pseudo_scatter_64");
rvalue = BitCastInst(rvalue, LLVMTypes::Int64VectorType, "rvalue2int");
}
else if (type == LLVMTypes::FloatVectorType ||
type == LLVMTypes::Int32VectorType) {
func = m->module->getFunction("__pseudo_scatter_32");
rvalue = BitCastInst(rvalue, LLVMTypes::Int32VectorType, "rvalue2int");
}
else if (type == LLVMTypes::Int16VectorType)
func = m->module->getFunction("__pseudo_scatter_16");
else if (type == LLVMTypes::Int8VectorType)
func = m->module->getFunction("__pseudo_scatter_8");
assert(func != NULL);
AddInstrumentationPoint("scatter");


@@ -427,7 +427,8 @@ The following identifiers are reserved as language keywords: ``bool``,
``char``, ``cif``, ``cwhile``, ``const``, ``continue``, ``creturn``,
``default``, ``do``, ``double``, ``else``, ``enum``, ``export``,
``extern``, ``false``, ``float``, ``for``, ``goto``, ``if``, ``inline``, ``int``,
``int8``, ``int16``, ``int32``, ``int64``, ``launch``, ``print``,
``reference``, ``return``,
``signed``, ``sizeof``, ``soa``, ``static``, ``struct``, ``switch``,
``sync``, ``task``, ``true``, ``typedef``, ``uniform``, ``union``,
``unsigned``, ``varying``, ``void``, ``volatile``, ``while``.
@@ -481,6 +482,10 @@ types.
* ``void``: "empty" type representing no value.
* ``bool``: boolean value; may be assigned ``true``, ``false``, or the
value of a boolean expression.
* ``int8``: 8-bit signed integer.
* ``unsigned int8``: 8-bit unsigned integer.
* ``int16``: 16-bit signed integer.
* ``unsigned int16``: 16-bit unsigned integer.
* ``int``: 32-bit signed integer; may also be specified as ``int32``.
* ``unsigned int``: 32-bit unsigned integer; may also be specified as
``unsigned int32``.
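For example, a short sketch using the new small integer types::

    int8 delta = -4;                       // varying 8-bit signed integer
    uniform unsigned int16 flags = 0xfff0; // uniform 16-bit unsigned integer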
@@ -497,7 +502,8 @@ general" of the two types, with the following precedence:
::
double > uint64 > int64 > float > uint32 > int32 >
uint16 > int16 > uint8 > int8 > bool
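For instance, combining an ``int8`` with an ``unsigned int16`` converts
the ``int8`` operand to ``unsigned int16``, the higher-precedence type,
before the operation is performed (sketch)::

    int8 a = ...;
    unsigned int16 b = ...;
    // in "a + b", a is first converted to unsigned int16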
In other words, adding an ``int64`` to a ``double`` causes the ``int64`` to
be converted to a ``double``, the addition to be performed, and a
@@ -1709,10 +1715,12 @@ the running program instances.
::
int8 broadcast(int8 value, uniform int index)
int16 broadcast(int16 value, uniform int index)
int32 broadcast(int32 value, uniform int index)
int64 broadcast(int64 value, uniform int index)
float broadcast(float value, uniform int index)
double broadcast(double value, uniform int index)
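For example, ``broadcast()`` can give every program instance the value
computed by instance 0 (sketch)::

    int16 v = ...;
    int16 v0 = broadcast(v, 0);   // all instances now hold instance 0's v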
The ``rotate()`` function allows each program instance to find the value of
the given value that their neighbor ``offset`` steps away has. For
@@ -1725,10 +1733,12 @@ provided offset value can be positive or negative, and may be greater than
::
int8 rotate(int8 value, uniform int offset)
int16 rotate(int16 value, uniform int offset)
int32 rotate(int32 value, uniform int offset)
int64 rotate(int64 value, uniform int offset)
float rotate(float value, uniform int offset)
double rotate(double value, uniform int offset)
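For example (sketch)::

    int8 v = ...;
    int8 prev = rotate(v, -1);   // value from the neighbor one step down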
Finally, the ``shuffle()`` functions allow two variants of fully general
@@ -1739,10 +1749,12 @@ from which to get the value of ``value``. The provided values for
::
int8 shuffle(int8 value, int permutation)
int16 shuffle(int16 value, int permutation)
int32 shuffle(int32 value, int permutation)
int64 shuffle(int64 value, int permutation)
float shuffle(float value, int permutation)
double shuffle(double value, int permutation)
The second variant of ``shuffle()`` permutes over the extended vector that
@@ -1753,10 +1765,12 @@ of ``value1``, etc.)
::
int8 shuffle(int8 value0, int8 value1, int permutation)
int16 shuffle(int16 value0, int16 value1, int permutation)
int32 shuffle(int32 value0, int32 value1, int permutation)
int64 shuffle(int64 value0, int64 value1, int permutation)
float shuffle(float value0, float value1, int permutation)
double shuffle(double value0, double value1, int permutation)
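For example, the one-vector variant can reverse the values across the
gang of program instances (sketch)::

    int16 v = ...;
    int16 rev = shuffle(v, programCount - 1 - programIndex);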
The various variants of ``popcnt()`` return the population count--the
number of bits set in the given value.
@@ -1861,10 +1875,19 @@ where the ``i`` th element of ``x`` has been replaced with the value ``v``
::
uniform int8 extract(int8 x, uniform int i)
uniform int16 extract(int16 x, uniform int i)
uniform int32 extract(int32 x, uniform int i)
uniform int64 extract(int64 x, uniform int i)
uniform float extract(float x, uniform int i)
::
int8 insert(int8 x, uniform int i, uniform int8 v)
int16 insert(int16 x, uniform int i, uniform int16 v)
int32 insert(int32 x, uniform int i, uniform int32 v)
int64 insert(int64 x, uniform int i, uniform int64 v)
float insert(float x, uniform int i, uniform float v)
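For example (sketch)::

    int8 x = ...;
    uniform int8 first = extract(x, 0);   // read the value in instance 0
    x = insert(x, 2, first);              // overwrite instance 2's value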
Atomic Operations and Memory Fences
@@ -1948,41 +1971,6 @@ value ``true`` (rather than just having the value one). The
int sign_extend(bool value)
uniform int sign_extend(uniform bool value)
The ``intbits()`` and ``floatbits()`` functions can be used to implement
low-level floating-point bit twiddling. For example, ``intbits()`` returns
an ``unsigned int`` that is a bit-for-bit copy of the given ``float``
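A classic use is a branch-free absolute value (sketch)::

    float f = ...;
    unsigned int b = intbits(f);
    b &= 0x7fffffff;             // clear the sign bit
    float abs_f = floatbits(b);  // |f|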


@@ -190,7 +190,9 @@ int main(int argc, char *argv[]) {
nodes[i].bounds[1].v[1] = b[4];
nodes[i].bounds[1].v[2] = b[5];
READ(nodes[i].offset, 1);
READ(nodes[i].nPrimitives, 1);
READ(nodes[i].splitAxis, 1);
READ(nodes[i].pad, 1);
}
// And then read the triangles


@@ -50,21 +50,11 @@ struct Triangle {
struct LinearBVHNode {
uniform float3 bounds[2];
uniform unsigned int offset; // num primitives for leaf, second child for interior
uniform unsigned int8 nPrimitives;
uniform unsigned int8 splitAxis;
uniform unsigned int16 pad;
};
static inline float3 Cross(const float3 v1, const float3 v2) {
float v1x = v1.x, v1y = v1.y, v1z = v1.z;
float v2x = v2.x, v2y = v2.y, v2z = v2.z;
@@ -199,7 +189,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
// Check ray against BVH node
LinearBVHNode node = nodes[nodeNum];
if (any(BBoxIntersect(node.bounds, ray))) {
uniform unsigned int nPrimitives = node.nPrimitives;
if (nPrimitives > 0) {
// Intersect ray with primitives in leaf BVH node
uniform unsigned int primitivesOffset = node.offset;
@@ -213,7 +203,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
}
else {
// Put far BVH node on _todo_ stack, advance to near node
if (r.dirIsNeg[node.splitAxis]) {
todo[todoOffset++] = nodeNum + 1;
nodeNum = node.offset;
}


@@ -75,30 +75,20 @@ struct Ray {
namespace ispc {
struct Triangle {
float3 p[3];
int32_t id;
};
struct LinearBVHNode {
float3 bounds[2];
int32_t offset; // primitives for leaf, second child for interior
uint8_t nPrimitives;
uint8_t splitAxis;
uint16_t pad;
};
}
using namespace ispc;
inline float3 Cross(const float3 &v1, const float3 &v2) {
float v1x = v1.x, v1y = v1.y, v1z = v1.z;
float v2x = v2.x, v2y = v2.y, v2z = v2.z;
@@ -230,7 +220,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
// Check ray against BVH node
const LinearBVHNode &node = nodes[nodeNum];
if (BBoxIntersect(node.bounds, ray)) {
unsigned int nPrimitives = node.nPrimitives;
if (nPrimitives > 0) {
// Intersect ray with primitives in leaf BVH node
unsigned int primitivesOffset = node.offset;
@@ -244,7 +234,7 @@ bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[],
}
else {
// Put far BVH node on _todo_ stack, advance to near node
if (r.dirIsNeg[node.splitAxis]) {
todo[todoOffset++] = nodeNum + 1;
nodeNum = node.offset;
}

expr.cpp

@@ -93,6 +93,10 @@ lMaybeIssuePrecisionWarning(const AtomicType *toAtomicType,
SourcePos pos, const char *errorMsgBase) {
switch (toAtomicType->basicType) {
case AtomicType::TYPE_BOOL:
case AtomicType::TYPE_INT8:
case AtomicType::TYPE_UINT8:
case AtomicType::TYPE_INT16:
case AtomicType::TYPE_UINT16:
case AtomicType::TYPE_INT32:
case AtomicType::TYPE_UINT32:
case AtomicType::TYPE_FLOAT:
@@ -101,6 +105,10 @@ lMaybeIssuePrecisionWarning(const AtomicType *toAtomicType,
case AtomicType::TYPE_DOUBLE:
if ((int)toAtomicType->basicType < (int)fromAtomicType->basicType &&
toAtomicType->basicType != AtomicType::TYPE_BOOL &&
!(toAtomicType->basicType == AtomicType::TYPE_INT8 &&
fromAtomicType->basicType == AtomicType::TYPE_UINT8) &&
!(toAtomicType->basicType == AtomicType::TYPE_INT16 &&
fromAtomicType->basicType == AtomicType::TYPE_UINT16) &&
!(toAtomicType->basicType == AtomicType::TYPE_INT32 &&
fromAtomicType->basicType == AtomicType::TYPE_UINT32) &&
!(toAtomicType->basicType == AtomicType::TYPE_INT64 &&
@@ -363,15 +371,33 @@ lLLVMConstantValue(const Type *type, llvm::LLVMContext *ctx, double value) {
return (value != 0.) ? LLVMTrue : LLVMFalse;
else
return LLVMBoolVector(value != 0.);
case AtomicType::TYPE_INT8: {
int i = (int)value;
assert((double)i == value);
return isUniform ? LLVMInt8(i) : LLVMInt8Vector(i);
}
case AtomicType::TYPE_UINT8: {
unsigned int i = (unsigned int)value;
return isUniform ? LLVMUInt8(i) : LLVMUInt8Vector(i);
}
case AtomicType::TYPE_INT16: {
int i = (int)value;
assert((double)i == value);
return isUniform ? LLVMInt16(i) : LLVMInt16Vector(i);
}
case AtomicType::TYPE_UINT16: {
unsigned int i = (unsigned int)value;
return isUniform ? LLVMUInt16(i) : LLVMUInt16Vector(i);
}
case AtomicType::TYPE_INT32: {
int i = (int)value;
assert((double)i == value);
return isUniform ? LLVMInt32(i) : LLVMInt32Vector(i);
}
case AtomicType::TYPE_UINT32: {
unsigned int i = (unsigned int)value;
return isUniform ? LLVMUInt32(i) : LLVMUInt32Vector(i);
}
case AtomicType::TYPE_FLOAT:
return isUniform ? LLVMFloat((float)value) :
LLVMFloatVector((float)value);
@@ -590,14 +616,13 @@ UnaryExpr::Optimize() {
const Type *type = constExpr->GetType();
bool isEnumType = dynamic_cast<const EnumType *>(type) != NULL;
const Type *baseType = type->GetAsNonConstType()->GetAsUniformType();
if (baseType == AtomicType::UniformInt8 ||
baseType == AtomicType::UniformUInt8 ||
baseType == AtomicType::UniformInt16 ||
baseType == AtomicType::UniformUInt16 ||
baseType == AtomicType::UniformInt64 ||
baseType == AtomicType::UniformUInt64)
// FIXME: should handle these at some point; for now we only do
// constant folding for bool, int32 and float types...
return this;
@@ -3058,6 +3083,86 @@ MemberExpr::getCandidateNearMatches() const {
///////////////////////////////////////////////////////////////////////////
// ConstExpr
ConstExpr::ConstExpr(const Type *t, int8_t i, SourcePos p)
: Expr(p) {
type = t;
type = type->GetAsConstType();
assert(type == AtomicType::UniformConstInt8);
int8Val[0] = i;
}
ConstExpr::ConstExpr(const Type *t, int8_t *i, SourcePos p)
: Expr(p) {
type = t;
type = type->GetAsConstType();
assert(type == AtomicType::UniformConstInt8 ||
type == AtomicType::VaryingConstInt8);
for (int j = 0; j < Count(); ++j)
int8Val[j] = i[j];
}
ConstExpr::ConstExpr(const Type *t, uint8_t u, SourcePos p)
: Expr(p) {
type = t;
type = type->GetAsConstType();
assert(type == AtomicType::UniformConstUInt8);
uint8Val[0] = u;
}
ConstExpr::ConstExpr(const Type *t, uint8_t *u, SourcePos p)
: Expr(p) {
type = t;
type = type->GetAsConstType();
assert(type == AtomicType::UniformConstUInt8 ||
type == AtomicType::VaryingConstUInt8);
for (int j = 0; j < Count(); ++j)
uint8Val[j] = u[j];
}
ConstExpr::ConstExpr(const Type *t, int16_t i, SourcePos p)
: Expr(p) {
type = t;
type = type->GetAsConstType();
assert(type == AtomicType::UniformConstInt16);
int16Val[0] = i;
}
ConstExpr::ConstExpr(const Type *t, int16_t *i, SourcePos p)
: Expr(p) {
type = t;
type = type->GetAsConstType();
assert(type == AtomicType::UniformConstInt16 ||
type == AtomicType::VaryingConstInt16);
for (int j = 0; j < Count(); ++j)
int16Val[j] = i[j];
}
ConstExpr::ConstExpr(const Type *t, uint16_t u, SourcePos p)
: Expr(p) {
type = t;
type = type->GetAsConstType();
assert(type == AtomicType::UniformConstUInt16);
uint16Val[0] = u;
}
ConstExpr::ConstExpr(const Type *t, uint16_t *u, SourcePos p)
: Expr(p) {
type = t;
type = type->GetAsConstType();
assert(type == AtomicType::UniformConstUInt16 ||
type == AtomicType::VaryingConstUInt16);
for (int j = 0; j < Count(); ++j)
uint16Val[j] = u[j];
}
ConstExpr::ConstExpr(const Type *t, int32_t i, SourcePos p)
: Expr(p) {
type = t;
@@ -3212,6 +3317,22 @@ ConstExpr::ConstExpr(ConstExpr *old, double *v)
for (int i = 0; i < Count(); ++i)
boolVal[i] = (v[i] != 0.);
break;
case AtomicType::TYPE_INT8:
for (int i = 0; i < Count(); ++i)
int8Val[i] = (int8_t)v[i];
break;
case AtomicType::TYPE_UINT8:
for (int i = 0; i < Count(); ++i)
uint8Val[i] = (uint8_t)v[i];
break;
case AtomicType::TYPE_INT16:
for (int i = 0; i < Count(); ++i)
int16Val[i] = (int16_t)v[i];
break;
case AtomicType::TYPE_UINT16:
for (int i = 0; i < Count(); ++i)
uint16Val[i] = (uint16_t)v[i];
break;
case AtomicType::TYPE_INT32:
for (int i = 0; i < Count(); ++i)
int32Val[i] = (int)v[i];
@@ -3270,6 +3391,18 @@ ConstExpr::GetValue(FunctionEmitContext *ctx) const {
return LLVMBoolVector(boolVal);
else
return boolVal[0] ? LLVMTrue : LLVMFalse;
case AtomicType::TYPE_INT8:
return isVarying ? LLVMInt8Vector(int8Val) :
LLVMInt8(int8Val[0]);
case AtomicType::TYPE_UINT8:
return isVarying ? LLVMUInt8Vector(uint8Val) :
LLVMUInt8(uint8Val[0]);
case AtomicType::TYPE_INT16:
return isVarying ? LLVMInt16Vector(int16Val) :
LLVMInt16(int16Val[0]);
case AtomicType::TYPE_UINT16:
return isVarying ? LLVMUInt16Vector(uint16Val) :
LLVMUInt16(uint16Val[0]);
case AtomicType::TYPE_INT32:
return isVarying ? LLVMInt32Vector(int32Val) :
LLVMInt32(int32Val[0]);
@@ -3351,6 +3484,10 @@ int
ConstExpr::AsInt64(int64_t *ip, bool forceVarying) const {
switch (getBasicType()) {
case AtomicType::TYPE_BOOL: lConvert(boolVal, ip, Count(), forceVarying); break;
case AtomicType::TYPE_INT8: lConvert(int8Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_UINT8: lConvert(uint8Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_INT16: lConvert(int16Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_UINT16: lConvert(uint16Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_INT32: lConvert(int32Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_UINT32: lConvert(uint32Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_FLOAT: lConvert(floatVal, ip, Count(), forceVarying); break;
@@ -3368,6 +3505,10 @@ int
ConstExpr::AsUInt64(uint64_t *up, bool forceVarying) const {
switch (getBasicType()) {
case AtomicType::TYPE_BOOL: lConvert(boolVal, up, Count(), forceVarying); break;
case AtomicType::TYPE_INT8: lConvert(int8Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_UINT8: lConvert(uint8Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_INT16: lConvert(int16Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_UINT16: lConvert(uint16Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_INT32: lConvert(int32Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_UINT32: lConvert(uint32Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_FLOAT: lConvert(floatVal, up, Count(), forceVarying); break;
@@ -3385,6 +3526,10 @@ int
ConstExpr::AsDouble(double *d, bool forceVarying) const {
switch (getBasicType()) {
case AtomicType::TYPE_BOOL: lConvert(boolVal, d, Count(), forceVarying); break;
case AtomicType::TYPE_INT8: lConvert(int8Val, d, Count(), forceVarying); break;
case AtomicType::TYPE_UINT8: lConvert(uint8Val, d, Count(), forceVarying); break;
case AtomicType::TYPE_INT16: lConvert(int16Val, d, Count(), forceVarying); break;
case AtomicType::TYPE_UINT16: lConvert(uint16Val, d, Count(), forceVarying); break;
case AtomicType::TYPE_INT32: lConvert(int32Val, d, Count(), forceVarying); break;
case AtomicType::TYPE_UINT32: lConvert(uint32Val, d, Count(), forceVarying); break;
case AtomicType::TYPE_FLOAT: lConvert(floatVal, d, Count(), forceVarying); break;
@@ -3402,6 +3547,10 @@ int
ConstExpr::AsFloat(float *fp, bool forceVarying) const {
switch (getBasicType()) {
case AtomicType::TYPE_BOOL: lConvert(boolVal, fp, Count(), forceVarying); break;
case AtomicType::TYPE_INT8: lConvert(int8Val, fp, Count(), forceVarying); break;
case AtomicType::TYPE_UINT8: lConvert(uint8Val, fp, Count(), forceVarying); break;
case AtomicType::TYPE_INT16: lConvert(int16Val, fp, Count(), forceVarying); break;
case AtomicType::TYPE_UINT16: lConvert(uint16Val, fp, Count(), forceVarying); break;
case AtomicType::TYPE_INT32: lConvert(int32Val, fp, Count(), forceVarying); break;
case AtomicType::TYPE_UINT32: lConvert(uint32Val, fp, Count(), forceVarying); break;
case AtomicType::TYPE_FLOAT: lConvert(floatVal, fp, Count(), forceVarying); break;
@@ -3419,6 +3568,10 @@ int
ConstExpr::AsBool(bool *b, bool forceVarying) const {
switch (getBasicType()) {
case AtomicType::TYPE_BOOL: lConvert(boolVal, b, Count(), forceVarying); break;
case AtomicType::TYPE_INT8: lConvert(int8Val, b, Count(), forceVarying); break;
case AtomicType::TYPE_UINT8: lConvert(uint8Val, b, Count(), forceVarying); break;
case AtomicType::TYPE_INT16: lConvert(int16Val, b, Count(), forceVarying); break;
case AtomicType::TYPE_UINT16: lConvert(uint16Val, b, Count(), forceVarying); break;
case AtomicType::TYPE_INT32: lConvert(int32Val, b, Count(), forceVarying); break;
case AtomicType::TYPE_UINT32: lConvert(uint32Val, b, Count(), forceVarying); break;
case AtomicType::TYPE_FLOAT: lConvert(floatVal, b, Count(), forceVarying); break;
@@ -3432,10 +3585,98 @@ ConstExpr::AsBool(bool *b, bool forceVarying) const {
}
int
ConstExpr::AsInt8(int8_t *ip, bool forceVarying) const {
switch (getBasicType()) {
case AtomicType::TYPE_BOOL: lConvert(boolVal, ip, Count(), forceVarying); break;
case AtomicType::TYPE_INT8: lConvert(int8Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_UINT8: lConvert(uint8Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_INT16: lConvert(int16Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_UINT16: lConvert(uint16Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_INT32: lConvert(int32Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_UINT32: lConvert(uint32Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_FLOAT: lConvert(floatVal, ip, Count(), forceVarying); break;
case AtomicType::TYPE_DOUBLE: lConvert(doubleVal, ip, Count(), forceVarying); break;
case AtomicType::TYPE_INT64: lConvert(int64Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_UINT64: lConvert(uint64Val, ip, Count(), forceVarying); break;
default:
FATAL("unimplemented const type");
}
return Count();
}
int
ConstExpr::AsUInt8(uint8_t *up, bool forceVarying) const {
switch (getBasicType()) {
case AtomicType::TYPE_BOOL: lConvert(boolVal, up, Count(), forceVarying); break;
case AtomicType::TYPE_INT8: lConvert(int8Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_UINT8: lConvert(uint8Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_INT16: lConvert(int16Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_UINT16: lConvert(uint16Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_INT32: lConvert(int32Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_UINT32: lConvert(uint32Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_FLOAT: lConvert(floatVal, up, Count(), forceVarying); break;
case AtomicType::TYPE_DOUBLE: lConvert(doubleVal, up, Count(), forceVarying); break;
case AtomicType::TYPE_INT64: lConvert(int64Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_UINT64: lConvert(uint64Val, up, Count(), forceVarying); break;
default:
FATAL("unimplemented const type");
}
return Count();
}
int
ConstExpr::AsInt16(int16_t *ip, bool forceVarying) const {
switch (getBasicType()) {
case AtomicType::TYPE_BOOL: lConvert(boolVal, ip, Count(), forceVarying); break;
case AtomicType::TYPE_INT8: lConvert(int8Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_UINT8: lConvert(uint8Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_INT16: lConvert(int16Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_UINT16: lConvert(uint16Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_INT32: lConvert(int32Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_UINT32: lConvert(uint32Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_FLOAT: lConvert(floatVal, ip, Count(), forceVarying); break;
case AtomicType::TYPE_DOUBLE: lConvert(doubleVal, ip, Count(), forceVarying); break;
case AtomicType::TYPE_INT64: lConvert(int64Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_UINT64: lConvert(uint64Val, ip, Count(), forceVarying); break;
default:
FATAL("unimplemented const type");
}
return Count();
}
int
ConstExpr::AsUInt16(uint16_t *up, bool forceVarying) const {
switch (getBasicType()) {
case AtomicType::TYPE_BOOL: lConvert(boolVal, up, Count(), forceVarying); break;
case AtomicType::TYPE_INT8: lConvert(int8Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_UINT8: lConvert(uint8Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_INT16: lConvert(int16Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_UINT16: lConvert(uint16Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_INT32: lConvert(int32Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_UINT32: lConvert(uint32Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_FLOAT: lConvert(floatVal, up, Count(), forceVarying); break;
case AtomicType::TYPE_DOUBLE: lConvert(doubleVal, up, Count(), forceVarying); break;
case AtomicType::TYPE_INT64: lConvert(int64Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_UINT64: lConvert(uint64Val, up, Count(), forceVarying); break;
default:
FATAL("unimplemented const type");
}
return Count();
}
int
ConstExpr::AsInt32(int32_t *ip, bool forceVarying) const {
switch (getBasicType()) {
case AtomicType::TYPE_BOOL: lConvert(boolVal, ip, Count(), forceVarying); break;
case AtomicType::TYPE_INT8: lConvert(int8Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_UINT8: lConvert(uint8Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_INT16: lConvert(int16Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_UINT16: lConvert(uint16Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_INT32: lConvert(int32Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_UINT32: lConvert(uint32Val, ip, Count(), forceVarying); break;
case AtomicType::TYPE_FLOAT: lConvert(floatVal, ip, Count(), forceVarying); break;
@@ -3453,6 +3694,10 @@ int
ConstExpr::AsUInt32(uint32_t *up, bool forceVarying) const {
switch (getBasicType()) {
case AtomicType::TYPE_BOOL: lConvert(boolVal, up, Count(), forceVarying); break;
case AtomicType::TYPE_INT8: lConvert(int8Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_UINT8: lConvert(uint8Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_INT16: lConvert(int16Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_UINT16: lConvert(uint16Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_INT32: lConvert(int32Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_UINT32: lConvert(uint32Val, up, Count(), forceVarying); break;
case AtomicType::TYPE_FLOAT: lConvert(floatVal, up, Count(), forceVarying); break;
@@ -3488,6 +3733,40 @@ ConstExpr::GetConstant(const Type *type) const {
else
return LLVMBoolVector(bv);
}
else if (type == AtomicType::UniformInt8 || type == AtomicType::VaryingInt8) {
int8_t iv[ISPC_MAX_NVEC];
AsInt8(iv, type->IsVaryingType());
if (type->IsUniformType())
return LLVMInt8(iv[0]);
else
return LLVMInt8Vector(iv);
}
else if (type == AtomicType::UniformUInt8 || type == AtomicType::VaryingUInt8) {
uint8_t uiv[ISPC_MAX_NVEC];
AsUInt8(uiv, type->IsVaryingType());
if (type->IsUniformType())
return LLVMUInt8(uiv[0]);
else
return LLVMUInt8Vector(uiv);
}
else if (type == AtomicType::UniformInt16 || type == AtomicType::VaryingInt16) {
int16_t iv[ISPC_MAX_NVEC];
AsInt16(iv, type->IsVaryingType());
if (type->IsUniformType())
return LLVMInt16(iv[0]);
else
return LLVMInt16Vector(iv);
}
else if (type == AtomicType::UniformUInt16 || type == AtomicType::VaryingUInt16) {
uint16_t uiv[ISPC_MAX_NVEC];
AsUInt16(uiv, type->IsVaryingType());
if (type->IsUniformType())
return LLVMUInt16(uiv[0]);
else
return LLVMUInt16Vector(uiv);
}
else if (type == AtomicType::UniformInt32 || type == AtomicType::VaryingInt32) {
int32_t iv[ISPC_MAX_NVEC];
AsInt32(iv, type->IsVaryingType());
@@ -3564,6 +3843,18 @@ ConstExpr::Print() const {
case AtomicType::TYPE_BOOL:
printf("%s", boolVal[i] ? "true" : "false");
break;
case AtomicType::TYPE_INT8:
printf("%d", (int)int8Val[i]);
break;
case AtomicType::TYPE_UINT8:
printf("%u", (int)uint8Val[i]);
break;
case AtomicType::TYPE_INT16:
printf("%d", (int)int16Val[i]);
break;
case AtomicType::TYPE_UINT16:
printf("%u", (int)uint16Val[i]);
break;
case AtomicType::TYPE_INT32:
printf("%d", int32Val[i]);
break;
@@ -3637,11 +3928,15 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int
exprVal, targetType, "bool2float");
break;
case AtomicType::TYPE_INT8:
case AtomicType::TYPE_INT16:
case AtomicType::TYPE_INT32:
case AtomicType::TYPE_INT64:
cast = ctx->CastInst(llvm::Instruction::SIToFP, // signed int to float
exprVal, targetType, "int2float");
break;
case AtomicType::TYPE_UINT8:
case AtomicType::TYPE_UINT16:
case AtomicType::TYPE_UINT32:
case AtomicType::TYPE_UINT64:
if (fromType->IsVaryingType())
@@ -3675,11 +3970,15 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int to double
exprVal, targetType, "bool2double");
break;
case AtomicType::TYPE_INT8:
case AtomicType::TYPE_INT16:
case AtomicType::TYPE_INT32:
case AtomicType::TYPE_INT64:
cast = ctx->CastInst(llvm::Instruction::SIToFP, // signed int
exprVal, targetType, "int2double");
break;
case AtomicType::TYPE_UINT8:
case AtomicType::TYPE_UINT16:
case AtomicType::TYPE_UINT32:
case AtomicType::TYPE_UINT64:
if (fromType->IsVaryingType())
@@ -3699,6 +3998,170 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
}
break;
}
case AtomicType::TYPE_INT8: {
LLVM_TYPE_CONST llvm::Type *targetType =
fromType->IsUniformType() ? LLVMTypes::Int8Type :
LLVMTypes::Int8VectorType;
switch (fromType->basicType) {
case AtomicType::TYPE_BOOL:
if (fromType->IsVaryingType() &&
LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
cast = ctx->ZExtInst(exprVal, targetType, "bool2int");
break;
case AtomicType::TYPE_INT8:
case AtomicType::TYPE_UINT8:
cast = exprVal;
break;
case AtomicType::TYPE_INT16:
case AtomicType::TYPE_UINT16:
case AtomicType::TYPE_INT32:
case AtomicType::TYPE_UINT32:
case AtomicType::TYPE_INT64:
case AtomicType::TYPE_UINT64:
cast = ctx->TruncInst(exprVal, targetType, "int64_to_int8");
break;
case AtomicType::TYPE_FLOAT:
cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
exprVal, targetType, "float2int");
break;
case AtomicType::TYPE_DOUBLE:
cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
exprVal, targetType, "double2int");
break;
default:
FATAL("unimplemented");
}
break;
}
case AtomicType::TYPE_UINT8: {
LLVM_TYPE_CONST llvm::Type *targetType =
fromType->IsUniformType() ? LLVMTypes::Int8Type :
LLVMTypes::Int8VectorType;
switch (fromType->basicType) {
case AtomicType::TYPE_BOOL:
if (fromType->IsVaryingType() &&
LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
cast = ctx->ZExtInst(exprVal, targetType, "bool2uint");
break;
case AtomicType::TYPE_INT8:
case AtomicType::TYPE_UINT8:
cast = exprVal;
break;
case AtomicType::TYPE_INT16:
case AtomicType::TYPE_UINT16:
case AtomicType::TYPE_INT32:
case AtomicType::TYPE_UINT32:
case AtomicType::TYPE_INT64:
case AtomicType::TYPE_UINT64:
cast = ctx->TruncInst(exprVal, targetType, "int64_to_uint8");
break;
case AtomicType::TYPE_FLOAT:
if (fromType->IsVaryingType())
PerformanceWarning(pos, "Conversion from float to unsigned int is slow. "
"Use \"int\" if possible");
cast = ctx->CastInst(llvm::Instruction::FPToUI, // unsigned int
exprVal, targetType, "float2uint");
break;
case AtomicType::TYPE_DOUBLE:
if (fromType->IsVaryingType())
PerformanceWarning(pos, "Conversion from double to unsigned int is slow. "
"Use \"int\" if possible");
cast = ctx->CastInst(llvm::Instruction::FPToUI, // unsigned int
exprVal, targetType, "double2uint");
break;
default:
FATAL("unimplemented");
}
break;
}
case AtomicType::TYPE_INT16: {
LLVM_TYPE_CONST llvm::Type *targetType =
fromType->IsUniformType() ? LLVMTypes::Int16Type :
LLVMTypes::Int16VectorType;
switch (fromType->basicType) {
case AtomicType::TYPE_BOOL:
if (fromType->IsVaryingType() &&
LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
cast = ctx->ZExtInst(exprVal, targetType, "bool2int");
break;
case AtomicType::TYPE_INT8:
cast = ctx->SExtInst(exprVal, targetType, "int2int16");
break;
case AtomicType::TYPE_UINT8:
cast = ctx->ZExtInst(exprVal, targetType, "uint2uint16");
break;
case AtomicType::TYPE_INT16:
case AtomicType::TYPE_UINT16:
cast = exprVal;
break;
case AtomicType::TYPE_FLOAT:
cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
exprVal, targetType, "float2int");
break;
case AtomicType::TYPE_INT32:
case AtomicType::TYPE_UINT32:
case AtomicType::TYPE_INT64:
case AtomicType::TYPE_UINT64:
cast = ctx->TruncInst(exprVal, targetType, "int64_to_int16");
break;
case AtomicType::TYPE_DOUBLE:
cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
exprVal, targetType, "double2int");
break;
default:
FATAL("unimplemented");
}
break;
}
case AtomicType::TYPE_UINT16: {
LLVM_TYPE_CONST llvm::Type *targetType =
fromType->IsUniformType() ? LLVMTypes::Int16Type :
LLVMTypes::Int16VectorType;
switch (fromType->basicType) {
case AtomicType::TYPE_BOOL:
if (fromType->IsVaryingType() &&
LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
cast = ctx->ZExtInst(exprVal, targetType, "bool2uint16");
break;
case AtomicType::TYPE_INT8:
cast = ctx->SExtInst(exprVal, targetType, "uint2uint16");
break;
case AtomicType::TYPE_UINT8:
cast = ctx->ZExtInst(exprVal, targetType, "uint2uint16");
break;
case AtomicType::TYPE_INT16:
case AtomicType::TYPE_UINT16:
cast = exprVal;
break;
case AtomicType::TYPE_FLOAT:
if (fromType->IsVaryingType())
PerformanceWarning(pos, "Conversion from float to unsigned int is slow. "
"Use \"int\" if possible");
cast = ctx->CastInst(llvm::Instruction::FPToUI, // unsigned int
exprVal, targetType, "float2uint");
break;
case AtomicType::TYPE_INT32:
case AtomicType::TYPE_UINT32:
case AtomicType::TYPE_INT64:
case AtomicType::TYPE_UINT64:
cast = ctx->TruncInst(exprVal, targetType, "int64_to_uint16");
break;
case AtomicType::TYPE_DOUBLE:
if (fromType->IsVaryingType())
PerformanceWarning(pos, "Conversion from double to unsigned int is slow. "
"Use \"int\" if possible");
cast = ctx->CastInst(llvm::Instruction::FPToUI, // unsigned int
exprVal, targetType, "double2uint");
break;
default:
FATAL("unimplemented");
}
break;
}
case AtomicType::TYPE_INT32: {
LLVM_TYPE_CONST llvm::Type *targetType =
fromType->IsUniformType() ? LLVMTypes::Int32Type :
@@ -3710,6 +4173,14 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
cast = ctx->ZExtInst(exprVal, targetType, "bool2int");
break;
case AtomicType::TYPE_INT8:
case AtomicType::TYPE_INT16:
cast = ctx->SExtInst(exprVal, targetType, "int2int32");
break;
case AtomicType::TYPE_UINT8:
case AtomicType::TYPE_UINT16:
cast = ctx->ZExtInst(exprVal, targetType, "uint2uint32");
break;
case AtomicType::TYPE_INT32:
case AtomicType::TYPE_UINT32:
cast = exprVal;
@@ -3742,6 +4213,14 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
cast = ctx->ZExtInst(exprVal, targetType, "bool2uint");
break;
case AtomicType::TYPE_INT8:
case AtomicType::TYPE_INT16:
cast = ctx->SExtInst(exprVal, targetType, "uint2uint");
break;
case AtomicType::TYPE_UINT8:
case AtomicType::TYPE_UINT16:
cast = ctx->ZExtInst(exprVal, targetType, "uint2uint");
break;
case AtomicType::TYPE_INT32:
case AtomicType::TYPE_UINT32:
cast = exprVal;
@@ -3780,11 +4259,15 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
cast = ctx->ZExtInst(exprVal, targetType, "bool2int64");
break;
case AtomicType::TYPE_INT8:
case AtomicType::TYPE_INT16:
case AtomicType::TYPE_INT32:
cast = ctx->SExtInst(exprVal, targetType, "int32_to_int64");
cast = ctx->SExtInst(exprVal, targetType, "int_to_int64");
break;
case AtomicType::TYPE_UINT8:
case AtomicType::TYPE_UINT16:
case AtomicType::TYPE_UINT32:
cast = ctx->ZExtInst(exprVal, targetType, "uint32_to_int64");
cast = ctx->ZExtInst(exprVal, targetType, "uint_to_int64");
break;
case AtomicType::TYPE_FLOAT:
cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
@@ -3796,7 +4279,7 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
break;
case AtomicType::TYPE_DOUBLE:
cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
exprVal, targetType, "double2int");
exprVal, targetType, "double2int64");
break;
default:
FATAL("unimplemented");
@@ -3814,11 +4297,15 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
cast = ctx->ZExtInst(exprVal, targetType, "bool2uint");
break;
case AtomicType::TYPE_INT8:
case AtomicType::TYPE_INT16:
case AtomicType::TYPE_INT32:
cast = ctx->SExtInst(exprVal, targetType, "int32_to_uint64");
cast = ctx->SExtInst(exprVal, targetType, "int_to_uint64");
break;
case AtomicType::TYPE_UINT8:
case AtomicType::TYPE_UINT16:
case AtomicType::TYPE_UINT32:
cast = ctx->ZExtInst(exprVal, targetType, "uint32_to_uint64");
cast = ctx->ZExtInst(exprVal, targetType, "uint_to_uint64");
break;
case AtomicType::TYPE_FLOAT:
if (fromType->IsVaryingType())
@@ -3848,6 +4335,22 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
case AtomicType::TYPE_BOOL:
cast = exprVal;
break;
case AtomicType::TYPE_INT8:
case AtomicType::TYPE_UINT8: {
llvm::Value *zero = fromType->IsUniformType() ? (llvm::Value *)LLVMInt8(0) :
(llvm::Value *)LLVMInt8Vector((int8_t)0);
cast = ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE,
exprVal, zero, "cmpi0");
break;
}
case AtomicType::TYPE_INT16:
case AtomicType::TYPE_UINT16: {
llvm::Value *zero = fromType->IsUniformType() ? (llvm::Value *)LLVMInt16(0) :
(llvm::Value *)LLVMInt16Vector((int16_t)0);
cast = ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE,
exprVal, zero, "cmpi0");
break;
}
case AtomicType::TYPE_INT32:
case AtomicType::TYPE_UINT32: {
llvm::Value *zero = fromType->IsUniformType() ? (llvm::Value *)LLVMInt32(0) :
@@ -4195,6 +4698,26 @@ TypeCastExpr::Optimize() {
constExpr->AsBool(bv, forceVarying);
return new ConstExpr(toType, bv, pos);
}
case AtomicType::TYPE_INT8: {
int8_t iv[ISPC_MAX_NVEC];
constExpr->AsInt8(iv, forceVarying);
return new ConstExpr(toType, iv, pos);
}
case AtomicType::TYPE_UINT8: {
uint8_t uv[ISPC_MAX_NVEC];
constExpr->AsUInt8(uv, forceVarying);
return new ConstExpr(toType, uv, pos);
}
case AtomicType::TYPE_INT16: {
int16_t iv[ISPC_MAX_NVEC];
constExpr->AsInt16(iv, forceVarying);
return new ConstExpr(toType, iv, pos);
}
case AtomicType::TYPE_UINT16: {
uint16_t uv[ISPC_MAX_NVEC];
constExpr->AsUInt16(uv, forceVarying);
return new ConstExpr(toType, uv, pos);
}
case AtomicType::TYPE_INT32: {
int32_t iv[ISPC_MAX_NVEC];
constExpr->AsInt32(iv, forceVarying);

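The SExt/ZExt split in the conversion cases above follows the usual widening rule: sign-extend signed sources and zero-extend unsigned ones, so the numeric value is preserved either way. A standalone C++ sketch of the difference (not compiler code):

#include <cstdint>
#include <cstdio>

int main() {
    int8_t  s = -1;    // bit pattern 0xff, signed
    uint8_t u = 0xff;  // same bit pattern, unsigned
    int32_t se = (int32_t)s;  // sign extension: -1 (0xffffffff)
    int32_t ze = (int32_t)u;  // zero extension: 255 (0x000000ff)
    printf("sext: %d  zext: %d\n", se, ze);
    return 0;
}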
expr.h

@@ -325,6 +325,24 @@ private:
*/
class ConstExpr : public Expr {
public:
/** Create a ConstExpr from a uniform int8 value */
ConstExpr(const Type *t, int8_t i, SourcePos p);
/** Create a ConstExpr from a varying int8 value */
ConstExpr(const Type *t, int8_t *i, SourcePos p);
/** Create a ConstExpr from a uniform uint8 value */
ConstExpr(const Type *t, uint8_t u, SourcePos p);
/** Create a ConstExpr from a varying uint8 value */
ConstExpr(const Type *t, uint8_t *u, SourcePos p);
/** Create a ConstExpr from a uniform int16 value */
ConstExpr(const Type *t, int16_t i, SourcePos p);
/** Create a ConstExpr from a varying int16 value */
ConstExpr(const Type *t, int16_t *i, SourcePos p);
/** Create a ConstExpr from a uniform uint16 value */
ConstExpr(const Type *t, uint16_t u, SourcePos p);
/** Create a ConstExpr from a varying uint16 value */
ConstExpr(const Type *t, uint16_t *u, SourcePos p);
/** Create a ConstExpr from a uniform int32 value */
ConstExpr(const Type *t, int32_t i, SourcePos p);
/** Create a ConstExpr from a varying int32 value */
@@ -333,14 +351,17 @@ public:
ConstExpr(const Type *t, uint32_t u, SourcePos p);
/** Create a ConstExpr from a varying uint32 value */
ConstExpr(const Type *t, uint32_t *u, SourcePos p);
/** Create a ConstExpr from a uniform float value */
ConstExpr(const Type *t, float f, SourcePos p);
/** Create a ConstExpr from a varying float value */
ConstExpr(const Type *t, float *f, SourcePos p);
/** Create a ConstExpr from a uniform double value */
ConstExpr(const Type *t, double d, SourcePos p);
/** Create a ConstExpr from a varying double value */
ConstExpr(const Type *t, double *d, SourcePos p);
/** Create a ConstExpr from a uniform int64 value */
ConstExpr(const Type *t, int64_t i, SourcePos p);
/** Create a ConstExpr from a varying int64 value */
@@ -349,10 +370,12 @@ public:
ConstExpr(const Type *t, uint64_t i, SourcePos p);
/** Create a ConstExpr from a varying uint64 value */
ConstExpr(const Type *t, uint64_t *i, SourcePos p);
/** Create a ConstExpr from a uniform bool value */
ConstExpr(const Type *t, bool b, SourcePos p);
/** Create a ConstExpr from a varying bool value */
ConstExpr(const Type *t, bool *b, SourcePos p);
/** Create a ConstExpr of the same type as the given old ConstExpr,
with values given by the "values" parameter. */
ConstExpr(ConstExpr *old, double *values);
@@ -371,6 +394,30 @@ public:
equal to the target vector width into the given pointer. */
int AsBool(bool *, bool forceVarying = false) const;
/** Return the ConstExpr's values as int8s, doing type conversion
from the actual type if needed. If forceVarying is true, then type
convert to 'varying' so as to always return a number of values
equal to the target vector width into the given pointer. */
int AsInt8(int8_t *, bool forceVarying = false) const;
/** Return the ConstExpr's values as uint8s, doing type conversion
from the actual type if needed. If forceVarying is true, then type
convert to 'varying' so as to always return a number of values
equal to the target vector width into the given pointer. */
int AsUInt8(uint8_t *, bool forceVarying = false) const;
/** Return the ConstExpr's values as int16s, doing type conversion
from the actual type if needed. If forceVarying is true, then type
convert to 'varying' so as to always return a number of values
equal to the target vector width into the given pointer. */
int AsInt16(int16_t *, bool forceVarying = false) const;
/** Return the ConstExpr's values as uint16s, doing type conversion
from the actual type if needed. If forceVarying is true, then type
convert to 'varying' so as to always return a number of values
equal to the target vector width into the given pointer. */
int AsUInt16(uint16_t *, bool forceVarying = false) const;
/** Return the ConstExpr's values as int32s, doing type conversion
from the actual type if needed. If forceVarying is true, then type
convert to 'varying' so as to always return a number of values
@@ -417,6 +464,10 @@ private:
const Type *type;
union {
int8_t int8Val[ISPC_MAX_NVEC];
uint8_t uint8Val[ISPC_MAX_NVEC];
int16_t int16Val[ISPC_MAX_NVEC];
uint16_t uint16Val[ISPC_MAX_NVEC];
int32_t int32Val[ISPC_MAX_NVEC];
uint32_t uint32Val[ISPC_MAX_NVEC];
bool boolVal[ISPC_MAX_NVEC];

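A minimal sketch of how a caller uses the new accessors (hypothetical code, not from this commit; per the comments above, the int return value is taken to be the number of values written: one for a uniform ConstExpr, the target vector width when the expression is varying or forceVarying is set):

int8_t vals[ISPC_MAX_NVEC];
// Convert whatever the ConstExpr holds (int32 values, say) to int8.
int count = constExpr->AsInt8(vals, /* forceVarying */ true);
for (int i = 0; i < count; ++i)
    printf("lane %d = %d\n", i, (int)vals[i]);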

@@ -0,0 +1,16 @@
/* failing due to llvm bug http://llvm.org/bugs/show_bug.cgi?id=10421 */
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int8 aa = aFOO[programIndex];
int8 bb = aa + programCount;
int8 shuf = shuffle(aa, bb, 2*programIndex+(int)b-5);
//CO print("%\n%\n%\n%\n", aa, bb, 2*programIndex+(int)b-5, shuf);
RET[programIndex] = shuf;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + 2*programIndex;
}
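Why result() expects 1 + 2*programIndex: assuming the usual harness inputs (aFOO[i] = i + 1 and b = 5; this is an assumption about the test driver, which is not shown in this diff), the index 2*programIndex + (int)b - 5 reduces to 2*programIndex, selecting every other lane of the concatenated aa/bb vector. A scalar C++ check of that arithmetic:

#include <cstdint>
#include <cstdio>

int main() {
    const int N = 8;  // programCount for an 8-wide target
    float aFOO[N], RET[N], b = 5;
    for (int i = 0; i < N; ++i)
        aFOO[i] = (float)(i + 1);
    int8_t aa[N], bb[N], cat[2 * N];
    for (int i = 0; i < N; ++i) {
        aa[i] = (int8_t)aFOO[i];      // i + 1
        bb[i] = (int8_t)(aa[i] + N);  // i + 1 + N
    }
    // shuffle(aa, bb, idx) selects from the 2N-lane concatenation of aa, bb
    for (int i = 0; i < N; ++i) {
        cat[i] = aa[i];
        cat[N + i] = bb[i];
    }
    for (int i = 0; i < N; ++i)
        RET[i] = cat[2 * i + (int)b - 5];  // lane i gets value 2i + 1
    for (int i = 0; i < N; ++i)
        printf("%g ", RET[i]);  // 1 3 5 7 9 11 13 15
    return 0;
}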

ispc_test.cpp

@@ -158,38 +158,40 @@ static bool lRunTest(const char *fn) {
}
llvm::Function *func;
if ((func = module->getFunction("ISPCLaunch")) != NULL)
ee->addGlobalMapping(func, (void *)ISPCLaunch);
if ((func = module->getFunction("ISPCSync")) != NULL)
ee->addGlobalMapping(func, (void *)ISPCSync);
#define DO_FUNC(FUNC, FUNCNAME) \
if ((func = module->getFunction(FUNCNAME)) != NULL) \
ee->addGlobalMapping(func, (void *)FUNC)
DO_FUNC(ISPCLaunch, "ISPCLaunch");
DO_FUNC(ISPCSync, "ISPCSync");
#ifdef ISPC_IS_WINDOWS
if ((func = module->getFunction("ISPCMalloc")) != NULL)
ee->addGlobalMapping(func, (void *)ISPCMalloc);
if ((func = module->getFunction("ISPCFree")) != NULL)
ee->addGlobalMapping(func, (void *)ISPCFree);
DO_FUNC(ISPCMalloc, "ISPCMalloc");
DO_FUNC(ISPCFree, "ISPCFree");
#endif // ISPC_IS_WINDOWS
if ((func = module->getFunction("putchar")) != NULL)
ee->addGlobalMapping(func, (void *)putchar);
if ((func = module->getFunction("printf")) != NULL)
ee->addGlobalMapping(func, (void *)printf);
if ((func = module->getFunction("fflush")) != NULL)
ee->addGlobalMapping(func, (void *)fflush);
if ((func = module->getFunction("sinf")) != NULL)
ee->addGlobalMapping(func, (void *)sinf);
if ((func = module->getFunction("cosf")) != NULL)
ee->addGlobalMapping(func, (void *)cosf);
if ((func = module->getFunction("tanf")) != NULL)
ee->addGlobalMapping(func, (void *)tanf);
if ((func = module->getFunction("atanf")) != NULL)
ee->addGlobalMapping(func, (void *)atanf);
if ((func = module->getFunction("atan2f")) != NULL)
ee->addGlobalMapping(func, (void *)atan2f);
if ((func = module->getFunction("powf")) != NULL)
ee->addGlobalMapping(func, (void *)powf);
if ((func = module->getFunction("expf")) != NULL)
ee->addGlobalMapping(func, (void *)expf);
if ((func = module->getFunction("logf")) != NULL)
ee->addGlobalMapping(func, (void *)logf);
DO_FUNC(putchar, "putchar");
DO_FUNC(printf, "printf");
DO_FUNC(fflush, "fflush");
DO_FUNC(sinf, "sinf");
DO_FUNC(cosf, "cosf");
DO_FUNC(tanf, "tanf");
DO_FUNC(atanf, "atanf");
DO_FUNC(atan2f, "atan2f");
DO_FUNC(powf, "powf");
DO_FUNC(expf, "expf");
DO_FUNC(logf, "logf");
DO_FUNC(sin, "sin");
DO_FUNC(cos, "cos");
DO_FUNC(tan, "tan");
DO_FUNC(atan, "atan");
DO_FUNC(atan2, "atan2");
DO_FUNC(pow, "pow");
DO_FUNC(exp, "exp");
DO_FUNC(log, "log");
DO_FUNC(memset, "memset");
#ifdef ISPC_IS_APPLE
DO_FUNC(memset_pattern4, "memset_pattern4");
DO_FUNC(memset_pattern8, "memset_pattern8");
DO_FUNC(memset_pattern16, "memset_pattern16");
#endif
#ifdef ISPC_HAVE_SVML
#define DO_SVML(FUNC, FUNCNAME) \
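The DO_FUNC macro collapses the repeated lookup-and-map pattern into one line per function; DO_FUNC(sinf, "sinf"); expands to exactly what the old code spelled out by hand:

if ((func = module->getFunction("sinf")) != NULL)
    ee->addGlobalMapping(func, (void *)sinf);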

lex.ll

@@ -104,6 +104,8 @@ goto { return TOKEN_GOTO; }
if { return TOKEN_IF; }
inline { return TOKEN_INLINE; }
int { return TOKEN_INT; }
int8 { return TOKEN_INT8; }
int16 { return TOKEN_INT16; }
int32 { return TOKEN_INT; }
int64 { return TOKEN_INT64; }
launch { return TOKEN_LAUNCH; }

llvmutil.cpp

@@ -41,28 +41,39 @@
LLVM_TYPE_CONST llvm::Type *LLVMTypes::VoidType = NULL;
LLVM_TYPE_CONST llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::BoolType = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8Type = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int16Type = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32Type = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32PointerType = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64Type = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64PointerType = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatType = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatPointerType = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoubleType = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8PointerType = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int16PointerType = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32PointerType = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64PointerType = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatPointerType = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoublePointerType = NULL;
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::MaskType = NULL;
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::BoolVectorType = NULL;
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int1VectorType = NULL;
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int8VectorType = NULL;
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int16VectorType = NULL;
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int32VectorType = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32VectorPointerType = NULL;
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::Int64VectorType = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64VectorPointerType = NULL;
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::FloatVectorType = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::DoubleVectorType = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8VectorPointerType = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int16VectorPointerType = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int32VectorPointerType = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64VectorPointerType = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoubleVectorPointerType = NULL;
LLVM_TYPE_CONST llvm::ArrayType *LLVMTypes::VoidPointerVectorType = NULL;
llvm::Constant *LLVMTrue = NULL;
@@ -75,16 +86,20 @@ void
InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
LLVMTypes::VoidType = llvm::Type::getVoidTy(*ctx);
LLVMTypes::VoidPointerType = llvm::PointerType::get(llvm::Type::getInt8Ty(*ctx), 0);
LLVMTypes::BoolType = llvm::Type::getInt1Ty(*ctx);
LLVMTypes::Int8Type = llvm::Type::getInt8Ty(*ctx);
LLVMTypes::Int16Type = llvm::Type::getInt16Ty(*ctx);
LLVMTypes::Int32Type = llvm::Type::getInt32Ty(*ctx);
LLVMTypes::Int32PointerType = llvm::PointerType::get(LLVMTypes::Int32Type, 0);
LLVMTypes::Int64Type = llvm::Type::getInt64Ty(*ctx);
LLVMTypes::Int64PointerType = llvm::PointerType::get(LLVMTypes::Int64Type, 0);
LLVMTypes::FloatType = llvm::Type::getFloatTy(*ctx);
LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0);
LLVMTypes::DoubleType = llvm::Type::getDoubleTy(*ctx);
LLVMTypes::Int8PointerType = llvm::PointerType::get(LLVMTypes::Int8Type, 0);
LLVMTypes::Int16PointerType = llvm::PointerType::get(LLVMTypes::Int16Type, 0);
LLVMTypes::Int32PointerType = llvm::PointerType::get(LLVMTypes::Int32Type, 0);
LLVMTypes::Int64PointerType = llvm::PointerType::get(LLVMTypes::Int64Type, 0);
LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0);
LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0);
// Note that both the mask and bool vectors are vectors of int32s
@@ -95,18 +110,26 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
LLVMTypes::Int1VectorType =
llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.vectorWidth);
LLVMTypes::Int8VectorType =
llvm::VectorType::get(LLVMTypes::Int8Type, target.vectorWidth);
LLVMTypes::Int16VectorType =
llvm::VectorType::get(LLVMTypes::Int16Type, target.vectorWidth);
LLVMTypes::Int32VectorType =
llvm::VectorType::get(LLVMTypes::Int32Type, target.vectorWidth);
LLVMTypes::Int32VectorPointerType = llvm::PointerType::get(LLVMTypes::Int32VectorType, 0);
LLVMTypes::Int64VectorType =
llvm::VectorType::get(LLVMTypes::Int64Type, target.vectorWidth);
LLVMTypes::Int64VectorPointerType = llvm::PointerType::get(LLVMTypes::Int64VectorType, 0);
LLVMTypes::FloatVectorType =
llvm::VectorType::get(LLVMTypes::FloatType, target.vectorWidth);
LLVMTypes::FloatVectorPointerType = llvm::PointerType::get(LLVMTypes::FloatVectorType, 0);
LLVMTypes::DoubleVectorType =
llvm::VectorType::get(LLVMTypes::DoubleType, target.vectorWidth);
LLVMTypes::Int8VectorPointerType = llvm::PointerType::get(LLVMTypes::Int8VectorType, 0);
LLVMTypes::Int16VectorPointerType = llvm::PointerType::get(LLVMTypes::Int16VectorType, 0);
LLVMTypes::Int32VectorPointerType = llvm::PointerType::get(LLVMTypes::Int32VectorType, 0);
LLVMTypes::Int64VectorPointerType = llvm::PointerType::get(LLVMTypes::Int64VectorType, 0);
LLVMTypes::FloatVectorPointerType = llvm::PointerType::get(LLVMTypes::FloatVectorType, 0);
LLVMTypes::DoubleVectorPointerType = llvm::PointerType::get(LLVMTypes::DoubleVectorType, 0);
LLVMTypes::VoidPointerVectorType =
llvm::ArrayType::get(LLVMTypes::VoidPointerType, target.vectorWidth);
@@ -133,7 +156,36 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
}
llvm::ConstantInt *LLVMInt32(int32_t ival) {
llvm::ConstantInt *
LLVMInt8(int8_t ival) {
return llvm::ConstantInt::get(llvm::Type::getInt8Ty(*g->ctx), ival,
true /*signed*/);
}
llvm::ConstantInt *
LLVMUInt8(uint8_t ival) {
return llvm::ConstantInt::get(llvm::Type::getInt8Ty(*g->ctx), ival,
false /*unsigned*/);
}
llvm::ConstantInt *
LLVMInt16(int16_t ival) {
return llvm::ConstantInt::get(llvm::Type::getInt16Ty(*g->ctx), ival,
true /*signed*/);
}
llvm::ConstantInt *
LLVMUInt16(uint16_t ival) {
return llvm::ConstantInt::get(llvm::Type::getInt16Ty(*g->ctx), ival,
false /*unsigned*/);
}
llvm::ConstantInt *
LLVMInt32(int32_t ival) {
return llvm::ConstantInt::get(llvm::Type::getInt32Ty(*g->ctx), ival,
true /*signed*/);
}
@@ -172,6 +224,82 @@ LLVMDouble(double dval) {
}
llvm::Constant *
LLVMInt8Vector(int8_t ival) {
llvm::Constant *v = LLVMInt8(ival);
std::vector<llvm::Constant *> vals;
for (int i = 0; i < g->target.vectorWidth; ++i)
vals.push_back(v);
return llvm::ConstantVector::get(vals);
}
llvm::Constant *
LLVMInt8Vector(const int8_t *ivec) {
std::vector<llvm::Constant *> vals;
for (int i = 0; i < g->target.vectorWidth; ++i)
vals.push_back(LLVMInt8(ivec[i]));
return llvm::ConstantVector::get(vals);
}
llvm::Constant *
LLVMUInt8Vector(uint8_t ival) {
llvm::Constant *v = LLVMUInt8(ival);
std::vector<llvm::Constant *> vals;
for (int i = 0; i < g->target.vectorWidth; ++i)
vals.push_back(v);
return llvm::ConstantVector::get(vals);
}
llvm::Constant *
LLVMUInt8Vector(const uint8_t *ivec) {
std::vector<llvm::Constant *> vals;
for (int i = 0; i < g->target.vectorWidth; ++i)
vals.push_back(LLVMUInt8(ivec[i]));
return llvm::ConstantVector::get(vals);
}
llvm::Constant *
LLVMInt16Vector(int16_t ival) {
llvm::Constant *v = LLVMInt16(ival);
std::vector<llvm::Constant *> vals;
for (int i = 0; i < g->target.vectorWidth; ++i)
vals.push_back(v);
return llvm::ConstantVector::get(vals);
}
llvm::Constant *
LLVMInt16Vector(const int16_t *ivec) {
std::vector<llvm::Constant *> vals;
for (int i = 0; i < g->target.vectorWidth; ++i)
vals.push_back(LLVMInt16(ivec[i]));
return llvm::ConstantVector::get(vals);
}
llvm::Constant *
LLVMUInt16Vector(uint16_t ival) {
llvm::Constant *v = LLVMUInt16(ival);
std::vector<llvm::Constant *> vals;
for (int i = 0; i < g->target.vectorWidth; ++i)
vals.push_back(v);
return llvm::ConstantVector::get(vals);
}
llvm::Constant *
LLVMUInt16Vector(const uint16_t *ivec) {
std::vector<llvm::Constant *> vals;
for (int i = 0; i < g->target.vectorWidth; ++i)
vals.push_back(LLVMUInt16(ivec[i]));
return llvm::ConstantVector::get(vals);
}
llvm::Constant *
LLVMInt32Vector(int32_t ival) {
llvm::Constant *v = LLVMInt32(ival);

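A quick sketch of what the new constant helpers produce (hypothetical usage, assuming an 8-wide target so that g->target.vectorWidth == 8):

// Splat form: one value replicated across all lanes.
llvm::Constant *splat = LLVMInt8Vector((int8_t)7);   // <8 x i8>: 7, 7, 7, 7, 7, 7, 7, 7

// Per-lane form: one value per lane from the given array.
int8_t lanes[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
llvm::Constant *perLane = LLVMInt8Vector(lanes);     // <8 x i8>: 0, 1, 2, 3, 4, 5, 6, 7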
llvmutil.h

@@ -53,28 +53,39 @@ struct LLVMTypes {
static LLVM_TYPE_CONST llvm::Type *VoidType;
static LLVM_TYPE_CONST llvm::PointerType *VoidPointerType;
static LLVM_TYPE_CONST llvm::Type *BoolType;
static LLVM_TYPE_CONST llvm::Type *Int8Type;
static LLVM_TYPE_CONST llvm::Type *Int16Type;
static LLVM_TYPE_CONST llvm::Type *Int32Type;
static LLVM_TYPE_CONST llvm::Type *Int32PointerType;
static LLVM_TYPE_CONST llvm::Type *Int64Type;
static LLVM_TYPE_CONST llvm::Type *Int64PointerType;
static LLVM_TYPE_CONST llvm::Type *FloatType;
static LLVM_TYPE_CONST llvm::Type *FloatPointerType;
static LLVM_TYPE_CONST llvm::Type *DoubleType;
static LLVM_TYPE_CONST llvm::Type *Int8PointerType;
static LLVM_TYPE_CONST llvm::Type *Int16PointerType;
static LLVM_TYPE_CONST llvm::Type *Int32PointerType;
static LLVM_TYPE_CONST llvm::Type *Int64PointerType;
static LLVM_TYPE_CONST llvm::Type *FloatPointerType;
static LLVM_TYPE_CONST llvm::Type *DoublePointerType;
static LLVM_TYPE_CONST llvm::VectorType *MaskType;
static LLVM_TYPE_CONST llvm::VectorType *BoolVectorType;
static LLVM_TYPE_CONST llvm::VectorType *Int1VectorType;
static LLVM_TYPE_CONST llvm::VectorType *Int8VectorType;
static LLVM_TYPE_CONST llvm::VectorType *Int16VectorType;
static LLVM_TYPE_CONST llvm::VectorType *Int32VectorType;
static LLVM_TYPE_CONST llvm::Type *Int32VectorPointerType;
static LLVM_TYPE_CONST llvm::VectorType *Int64VectorType;
static LLVM_TYPE_CONST llvm::Type *Int64VectorPointerType;
static LLVM_TYPE_CONST llvm::VectorType *FloatVectorType;
static LLVM_TYPE_CONST llvm::Type *FloatVectorPointerType;
static LLVM_TYPE_CONST llvm::VectorType *DoubleVectorType;
static LLVM_TYPE_CONST llvm::Type *Int8VectorPointerType;
static LLVM_TYPE_CONST llvm::Type *Int16VectorPointerType;
static LLVM_TYPE_CONST llvm::Type *Int32VectorPointerType;
static LLVM_TYPE_CONST llvm::Type *Int64VectorPointerType;
static LLVM_TYPE_CONST llvm::Type *FloatVectorPointerType;
static LLVM_TYPE_CONST llvm::Type *DoubleVectorPointerType;
static LLVM_TYPE_CONST llvm::ArrayType *VoidPointerVectorType;
};
@@ -89,6 +100,14 @@ extern llvm::Constant *LLVMTrue, *LLVMFalse;
*/
extern void InitLLVMUtil(llvm::LLVMContext *ctx, Target target);
/** Returns an LLVM i8 constant of the given value */
extern llvm::ConstantInt *LLVMInt8(int8_t i);
/** Returns an LLVM i8 constant of the given value */
extern llvm::ConstantInt *LLVMUInt8(uint8_t i);
/** Returns an LLVM i16 constant of the given value */
extern llvm::ConstantInt *LLVMInt16(int16_t i);
/** Returns an LLVM i16 constant of the given value */
extern llvm::ConstantInt *LLVMUInt16(uint16_t i);
/** Returns an LLVM i32 constant of the given value */
extern llvm::ConstantInt *LLVMInt32(int32_t i);
/** Returns an LLVM i32 constant of the given value */
@@ -105,18 +124,35 @@ extern llvm::Constant *LLVMDouble(double f);
/** Returns an LLVM boolean vector constant of the given value smeared
across all elements */
extern llvm::Constant *LLVMBoolVector(bool v);
/** Returns an LLVM i8 vector constant of the given value smeared
across all elements */
extern llvm::Constant *LLVMInt8Vector(int8_t i);
/** Returns an LLVM i8 vector constant of the given value smeared
across all elements */
extern llvm::Constant *LLVMUInt8Vector(uint8_t i);
/** Returns an LLVM i16 vector constant of the given value smeared
across all elements */
extern llvm::Constant *LLVMInt16Vector(int16_t i);
/** Returns an LLVM i16 vector constant of the given value smeared
across all elements */
extern llvm::Constant *LLVMUInt16Vector(uint16_t i);
/** Returns an LLVM i32 vector constant of the given value smeared
across all elements */
extern llvm::Constant *LLVMInt32Vector(int32_t i);
/** Returns an LLVM i32 vector constant of the given value smeared
across all elements */
extern llvm::Constant *LLVMUInt32Vector(uint32_t i);
/** Returns an LLVM i64 vector constant of the given value smeared
across all elements */
extern llvm::Constant *LLVMInt64Vector(int64_t i);
/** Returns an LLVM i64 vector constant of the given value smeared
across all elements */
extern llvm::Constant *LLVMUInt64Vector(uint64_t i);
/** Returns an LLVM float vector constant of the given value smeared
across all elements */
extern llvm::Constant *LLVMFloatVector(float f);
@@ -127,18 +163,35 @@ extern llvm::Constant *LLVMDoubleVector(double f);
/** Returns an LLVM boolean vector based on the given array of values.
The array should have g->target.vectorWidth elements. */
extern llvm::Constant *LLVMBoolVector(const bool *v);
/** Returns an LLVM i8 vector based on the given array of values.
The array should have g->target.vectorWidth elements. */
extern llvm::Constant *LLVMInt8Vector(const int8_t *i);
/** Returns an LLVM i8 vector based on the given array of values.
The array should have g->target.vectorWidth elements. */
extern llvm::Constant *LLVMUInt8Vector(const uint8_t *i);
/** Returns an LLVM i16 vector based on the given array of values.
The array should have g->target.vectorWidth elements. */
extern llvm::Constant *LLVMInt16Vector(const int16_t *i);
/** Returns an LLVM i16 vector based on the given array of values.
The array should have g->target.vectorWidth elements. */
extern llvm::Constant *LLVMUInt16Vector(const uint16_t *i);
/** Returns an LLVM i32 vector based on the given array of values.
The array should have g->target.vectorWidth elements. */
extern llvm::Constant *LLVMInt32Vector(const int32_t *i);
/** Returns an LLVM i32 vector based on the given array of values.
The array should have g->target.vectorWidth elements. */
extern llvm::Constant *LLVMUInt32Vector(const uint32_t *i);
/** Returns an LLVM i64 vector based on the given array of values.
The array should have g->target.vectorWidth elements. */
extern llvm::Constant *LLVMInt64Vector(const int64_t *i);
/** Returns an LLVM i64 vector based on the given array of values.
The array should have g->target.vectorWidth elements. */
extern llvm::Constant *LLVMUInt64Vector(const uint64_t *i);
/** Returns an LLVM float vector based on the given array of values.
The array should have g->target.vectorWidth elements. */
extern llvm::Constant *LLVMFloatVector(const float *f);

opt.cpp

@@ -409,7 +409,6 @@ IntrinsicsOpt::IntrinsicsOpt()
llvm::Intrinsic::getDeclaration(m->module, llvm::Intrinsic::x86_sse_movmsk_ps);
maskInstructions.push_back(sseMovmsk);
maskInstructions.push_back(m->module->getFunction("llvm.x86.avx.movmsk.ps"));
maskInstructions.push_back(m->module->getFunction("llvm.x86.mic.mask16.to.int"));
maskInstructions.push_back(m->module->getFunction("__movmsk"));
// And all of the blend instructions
@@ -418,8 +417,6 @@ IntrinsicsOpt::IntrinsicsOpt()
0xf, 0, 1, 2));
blendInstructions.push_back(BlendInstruction(
m->module->getFunction("llvm.x86.avx.blendvps"), 0xff, 0, 1, 2));
blendInstructions.push_back(BlendInstruction(
m->module->getFunction("llvm.x86.mic.blend.ps"), 0xffff, 1, 2, 0));
}
@@ -499,8 +496,8 @@ bool
IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
bool modifiedAny = false;
restart:
for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
if (!callInst)
continue;
@@ -512,7 +509,8 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
// If the values are the same, then no need to blend..
if (v[0] == v[1]) {
llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, v[0]);
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
iter, v[0]);
modifiedAny = true;
goto restart;
}
@@ -524,12 +522,14 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
// otherwise the result is undefined and any value is fine,
// ergo the defined one is an acceptable result.)
if (lIsUndef(v[0])) {
llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, v[1]);
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
iter, v[1]);
modifiedAny = true;
goto restart;
}
if (lIsUndef(v[1])) {
llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, v[0]);
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
iter, v[0]);
modifiedAny = true;
goto restart;
}
@@ -544,7 +544,8 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
value = v[1];
if (value != NULL) {
llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, value);
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
iter, value);
modifiedAny = true;
goto restart;
}
@@ -557,7 +558,8 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
// with the corresponding integer mask from its elements
// high bits.
llvm::Value *value = LLVMInt32(mask);
llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, value);
llvm::ReplaceInstWithValue(iter->getParent()->getInstList(),
iter, value);
modifiedAny = true;
goto restart;
}
@@ -653,10 +655,18 @@ lSizeOfIfKnown(const llvm::Type *type, uint64_t *size) {
*size = 1;
return true;
}
if (type == LLVMTypes::Int8VectorType) {
*size = g->target.vectorWidth * 1;
return true;
}
else if (type == LLVMTypes::Int16Type) {
*size = 2;
return true;
}
if (type == LLVMTypes::Int16VectorType) {
*size = g->target.vectorWidth * 2;
return true;
}
else if (type == LLVMTypes::FloatType || type == LLVMTypes::Int32Type) {
*size = 4;
return true;
@@ -978,33 +988,53 @@ lGetPtrAndOffsets(llvm::Value *ptrs, llvm::Value **basePtr,
}
struct GSInfo {
GSInfo(const char *pgFuncName, const char *pgboFuncName, bool ig, int es)
: isGather(ig), elementSize(es) {
func = m->module->getFunction(pgFuncName);
baseOffsetsFunc = m->module->getFunction(pgboFuncName);
}
llvm::Function *func;
llvm::Function *baseOffsetsFunc;
const bool isGather;
const int elementSize;
};
bool
GatherScatterFlattenOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
llvm::Function *gather32Func = m->module->getFunction("__pseudo_gather_32");
llvm::Function *gather64Func = m->module->getFunction("__pseudo_gather_64");
llvm::Function *scatter32Func = m->module->getFunction("__pseudo_scatter_32");
llvm::Function *scatter64Func = m->module->getFunction("__pseudo_scatter_64");
assert(gather32Func && gather64Func && scatter32Func && scatter64Func);
GSInfo gsFuncs[] = {
GSInfo("__pseudo_gather_8", "__pseudo_gather_base_offsets_8", true, 1),
GSInfo("__pseudo_gather_16", "__pseudo_gather_base_offsets_16", true, 2),
GSInfo("__pseudo_gather_32", "__pseudo_gather_base_offsets_32", true, 4),
GSInfo("__pseudo_gather_64", "__pseudo_gather_base_offsets_64", true, 8),
GSInfo("__pseudo_scatter_8", "__pseudo_scatter_base_offsets_8", false, 1),
GSInfo("__pseudo_scatter_16", "__pseudo_scatter_base_offsets_16", false, 2),
GSInfo("__pseudo_scatter_32", "__pseudo_scatter_base_offsets_32", false, 4),
GSInfo("__pseudo_scatter_64", "__pseudo_scatter_base_offsets_64", false, 8),
};
int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]);
for (int i = 0; i < numGSFuncs; ++i)
assert(gsFuncs[i].func != NULL && gsFuncs[i].baseOffsetsFunc != NULL);
bool modifiedAny = false;
restart:
// Iterate through all of the instructions in the basic block.
for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
// If we don't have a call to one of the
// __pseudo_{gather,scatter}_* functions, then just go on to the
// next instruction.
if (!callInst ||
(callInst->getCalledFunction() != gather32Func &&
callInst->getCalledFunction() != gather64Func &&
callInst->getCalledFunction() != scatter32Func &&
callInst->getCalledFunction() != scatter64Func))
if (callInst == NULL)
continue;
GSInfo *info = NULL;
for (int i = 0; i < numGSFuncs; ++i)
if (callInst->getCalledFunction() == gsFuncs[i].func) {
info = &gsFuncs[i];
break;
}
if (info == NULL)
continue;
bool isGather = (callInst->getCalledFunction() == gather32Func ||
callInst->getCalledFunction() == gather64Func);
bool is32 = (callInst->getCalledFunction() == gather32Func ||
callInst->getCalledFunction() == scatter32Func);
// Transform the array of pointers to a single base pointer and an
// array of int32 offsets. (All the hard work is done by
@@ -1012,19 +1042,15 @@ GatherScatterFlattenOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
llvm::Value *ptrs = callInst->getArgOperand(0);
llvm::Value *basePtr = NULL;
llvm::Value *offsetVector = lGetPtrAndOffsets(ptrs, &basePtr, callInst,
is32 ? 4 : 8);
info->elementSize);
// Cast the base pointer to a void *, since that's what the
// __pseudo_*_base_offsets_* functions want.
basePtr = new llvm::BitCastInst(basePtr, LLVMTypes::VoidPointerType, "base2void",
callInst);
basePtr = new llvm::BitCastInst(basePtr, LLVMTypes::VoidPointerType,
"base2void", callInst);
lCopyMetadata(basePtr, callInst);
if (isGather) {
if (info->isGather) {
llvm::Value *mask = callInst->getArgOperand(1);
llvm::Function *gFunc =
m->module->getFunction(is32 ? "__pseudo_gather_base_offsets_32" :
"__pseudo_gather_base_offsets_64");
assert(gFunc != NULL);
// Generate a new function call to the next pseudo gather
// base+offsets instruction. Note that we're passing a NULL
@@ -1035,11 +1061,12 @@ GatherScatterFlattenOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
llvm::ArrayRef<llvm::Value *> newArgArray(&newArgs[0], &newArgs[3]);
llvm::Instruction *newCall =
llvm::CallInst::Create(gFunc, newArgArray, "newgather",
(llvm::Instruction *)NULL);
llvm::CallInst::Create(info->baseOffsetsFunc, newArgArray,
"newgather", (llvm::Instruction *)NULL);
#else
llvm::Instruction *newCall =
llvm::CallInst::Create(gFunc, &newArgs[0], &newArgs[3], "newgather");
llvm::CallInst::Create(info->baseOffsetsFunc, &newArgs[0], &newArgs[3],
"newgather");
#endif
lCopyMetadata(newCall, callInst);
llvm::ReplaceInstWithInst(callInst, newCall);
@@ -1047,10 +1074,6 @@ GatherScatterFlattenOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
else {
llvm::Value *mask = callInst->getArgOperand(2);
llvm::Value *rvalue = callInst->getArgOperand(1);
llvm::Function *gFunc =
m->module->getFunction(is32 ? "__pseudo_scatter_base_offsets_32" :
"__pseudo_scatter_base_offsets_64");
assert(gFunc);
// Generate a new function call to the next pseudo scatter
// base+offsets instruction. See above for why passing NULL
@@ -1059,11 +1082,12 @@ GatherScatterFlattenOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
llvm::ArrayRef<llvm::Value *> newArgArray(&newArgs[0], &newArgs[4]);
llvm::Instruction *newCall =
llvm::CallInst::Create(gFunc, newArgArray, "",
llvm::CallInst::Create(info->baseOffsetsFunc, newArgArray, "",
(llvm::Instruction *)NULL);
#else
llvm::Instruction *newCall =
llvm::CallInst::Create(gFunc, &newArgs[0], &newArgs[4]);
llvm::CallInst::Create(info->baseOffsetsFunc, &newArgs[0],
&newArgs[4]);
#endif
lCopyMetadata(newCall, callInst);
llvm::ReplaceInstWithInst(callInst, newCall);
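Once flattened, a gather or scatter is just per-lane addressing off a single base pointer. A scalar C++ model of the lowered operation (an illustrative sketch, not the builtin code; offsets are in bytes, matching the elementSize scaling above):

#include <cstdint>

void gather_i8(int8_t *result, const char *base, const int32_t *offsets,
               const uint32_t *mask, int width) {
    for (int i = 0; i < width; ++i)
        if (mask[i])  // inactive lanes are never dereferenced
            result[i] = *(const int8_t *)(base + offsets[i]);
}

void scatter_i8(char *base, const int8_t *rvalue, const int32_t *offsets,
                const uint32_t *mask, int width) {
    for (int i = 0; i < width; ++i)
        if (mask[i])  // inactive lanes are never written
            *(int8_t *)(base + offsets[i]) = rvalue[i];
}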
@@ -1105,28 +1129,53 @@ char MaskedStoreOptPass::ID = 0;
llvm::RegisterPass<MaskedStoreOptPass> mss("masked-store-scalarize",
"Masked Store Scalarize Pass");
struct MSInfo {
MSInfo(const char *name, const int a)
: align(a) {
func = m->module->getFunction(name);
assert(func != NULL);
}
llvm::Function *func;
const int align;
};
bool
MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
llvm::Function *pms32Func = m->module->getFunction("__pseudo_masked_store_32");
llvm::Function *pms64Func = m->module->getFunction("__pseudo_masked_store_64");
llvm::Function *msb32Func = m->module->getFunction("__masked_store_blend_32");
llvm::Function *msb64Func = m->module->getFunction("__masked_store_blend_64");
llvm::Function *ms32Func = m->module->getFunction("__masked_store_32");
llvm::Function *ms64Func = m->module->getFunction("__masked_store_64");
MSInfo msInfo[] = {
MSInfo("__pseudo_masked_store_8", 1),
MSInfo("__pseudo_masked_store_16", 2),
MSInfo("__pseudo_masked_store_32", 4),
MSInfo("__pseudo_masked_store_64", 8),
MSInfo("__masked_store_blend_8", 1),
MSInfo("__masked_store_blend_16", 2),
MSInfo("__masked_store_blend_32", 4),
MSInfo("__masked_store_blend_64", 8),
MSInfo("__masked_store_8", 1),
MSInfo("__masked_store_16", 2),
MSInfo("__masked_store_32", 4),
MSInfo("__masked_store_64", 8)
};
bool modifiedAny = false;
restart:
// Iterate over all of the instructions to look for one of the various
// masked store functions
for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
if (!callInst)
continue;
llvm::Function *called = callInst->getCalledFunction();
if (called != pms32Func && called != pms64Func &&
called != msb32Func && called != msb64Func &&
called != ms32Func && called != ms64Func)
int nMSFuncs = sizeof(msInfo) / sizeof(msInfo[0]);
MSInfo *info = NULL;
for (int i = 0; i < nMSFuncs; ++i) {
if (called == msInfo[i].func) {
info = &msInfo[i];
break;
}
}
if (info == NULL)
continue;
// Got one; grab the operands
@@ -1150,15 +1199,12 @@ MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
LLVM_TYPE_CONST llvm::Type *rvalueType = rvalue->getType();
LLVM_TYPE_CONST llvm::Type *ptrType =
llvm::PointerType::get(rvalueType, 0);
// Need to update this when int8/int16 are added
int align = (called == pms32Func || called == pms64Func ||
called == msb32Func) ? 4 : 8;
lvalue = new llvm::BitCastInst(lvalue, ptrType, "lvalue_to_ptr_type", callInst);
lCopyMetadata(lvalue, callInst);
llvm::Instruction *store =
new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
align);
info->align);
lCopyMetadata(store, callInst);
llvm::ReplaceInstWithInst(callInst, store);
@@ -1180,9 +1226,9 @@ CreateMaskedStoreOptPass() {
// LowerMaskedStorePass
/** When the front-end needs to do a masked store, it emits a
__pseudo_masked_store_{32,64} call as a placeholder. This pass lowers
these calls to either __masked_store_{32,64} or
__masked_store_blend_{32,64} calls.
__pseudo_masked_store_{8,16,32,64} call as a placeholder. This pass
lowers these calls to either __masked_store_{8,16,32,64} or
__masked_store_blend_{8,16,32,64} calls.
*/
class LowerMaskedStorePass : public llvm::BasicBlockPass {
public:
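For intuition about the two lowerings: a blend store is a full-width read-modify-write that merges old and new values under the mask, while a true masked store never touches inactive lanes at all. A scalar C++ model (illustrative only, not the runtime code):

#include <cstdint>

// Blend version: loads and rewrites the whole vector, so the destination
// must be known-safe to access (e.g. a stack variable; cf. the
// lIsStackVariablePointer() check below).
void masked_store_blend(int32_t *dst, const int32_t *val,
                        const uint32_t *mask, int width) {
    for (int i = 0; i < width; ++i)
        dst[i] = mask[i] ? val[i] : dst[i];
}

// True masked store: inactive lanes are neither read nor written.
void masked_store(int32_t *dst, const int32_t *val,
                  const uint32_t *mask, int width) {
    for (int i = 0; i < width; ++i)
        if (mask[i])
            dst[i] = val[i];
}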
@@ -1227,45 +1273,51 @@ lIsStackVariablePointer(llvm::Value *lvalue) {
}
/** Utility routine to figure out which masked store function to use. The
blend parameter indicates if we want the blending version, is32
indicates if the element size is 32 bits.
*/
static const char *
lMaskedStoreName(bool blend, bool is32) {
if (blend) {
if (is32)
return "__masked_store_blend_32";
else
return "__masked_store_blend_64";
struct LMSInfo {
LMSInfo(const char *pname, const char *bname, const char *msname) {
pseudoFunc = m->module->getFunction(pname);
blendFunc = m->module->getFunction(bname);
maskedStoreFunc = m->module->getFunction(msname);
assert(pseudoFunc != NULL && blendFunc != NULL &&
maskedStoreFunc != NULL);
}
else {
if (is32)
return "__masked_store_32";
else
return "__masked_store_64";
}
}
llvm::Function *pseudoFunc;
llvm::Function *blendFunc;
llvm::Function *maskedStoreFunc;
};
bool
LowerMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
llvm::Function *maskedStore32Func = m->module->getFunction("__pseudo_masked_store_32");
llvm::Function *maskedStore64Func = m->module->getFunction("__pseudo_masked_store_64");
assert(maskedStore32Func && maskedStore64Func);
LMSInfo msInfo[] = {
LMSInfo("__pseudo_masked_store_8", "__masked_store_blend_8",
"__masked_store_8"),
LMSInfo("__pseudo_masked_store_16", "__masked_store_blend_16",
"__masked_store_16"),
LMSInfo("__pseudo_masked_store_32", "__masked_store_blend_32",
"__masked_store_32"),
LMSInfo("__pseudo_masked_store_64", "__masked_store_blend_64",
"__masked_store_64")
};
bool modifiedAny = false;
restart:
for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
// Iterate through all of the instructions and look for
// __pseudo_masked_store_* calls.
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
if (!callInst ||
(callInst->getCalledFunction() != maskedStore32Func &&
callInst->getCalledFunction() != maskedStore64Func))
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
if (callInst == NULL)
continue;
LMSInfo *info = NULL;
for (unsigned int i = 0; i < sizeof(msInfo) / sizeof(msInfo[0]); ++i) {
if (callInst->getCalledFunction() == msInfo[i].pseudoFunc) {
info = &msInfo[i];
break;
}
}
if (info == NULL)
continue;
bool is32 = (callInst->getCalledFunction() == maskedStore32Func);
llvm::Value *lvalue = callInst->getArgOperand(0);
llvm::Value *rvalue = callInst->getArgOperand(1);
llvm::Value *mask = callInst->getArgOperand(2);
@@ -1282,8 +1334,7 @@ LowerMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
// Generate the call to the appropriate masked store function and
// replace the __pseudo_* one with it.
llvm::Function *fms = m->module->getFunction(lMaskedStoreName(doBlend, is32));
assert(fms);
llvm::Function *fms = doBlend ? info->blendFunc : info->maskedStoreFunc;
llvm::Value *args[3] = { lvalue, rvalue, mask };
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[3]);
@@ -1872,37 +1923,94 @@ lVectorIsLinear(llvm::Value *v[ISPC_MAX_NVEC], int stride) {
}
struct GatherImpInfo {
GatherImpInfo(const char *pName, const char *lbName, const char *lmName,
int a)
: align(a) {
pseudoFunc = m->module->getFunction(pName);
loadBroadcastFunc = m->module->getFunction(lbName);
loadMaskedFunc = m->module->getFunction(lmName);
assert(pseudoFunc != NULL && loadBroadcastFunc != NULL &&
loadMaskedFunc != NULL);
}
llvm::Function *pseudoFunc;
llvm::Function *loadBroadcastFunc;
llvm::Function *loadMaskedFunc;
const int align;
};
struct ScatterImpInfo {
ScatterImpInfo(const char *pName, const char *msName,
LLVM_TYPE_CONST llvm::Type *vpt, int a)
: align(a) {
pseudoFunc = m->module->getFunction(pName);
maskedStoreFunc = m->module->getFunction(msName);
vecPtrType = vpt;
assert(pseudoFunc != NULL && maskedStoreFunc != NULL);
}
llvm::Function *pseudoFunc;
llvm::Function *maskedStoreFunc;
LLVM_TYPE_CONST llvm::Type *vecPtrType;
const int align;
};
bool
GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
llvm::Function *gather32Func = m->module->getFunction("__pseudo_gather_base_offsets_32");
llvm::Function *gather64Func = m->module->getFunction("__pseudo_gather_base_offsets_64");
llvm::Function *scatter32Func = m->module->getFunction("__pseudo_scatter_base_offsets_32");
llvm::Function *scatter64Func = m->module->getFunction("__pseudo_scatter_base_offsets_64");
assert(gather32Func && gather64Func && scatter32Func && scatter64Func);
GatherImpInfo gInfo[] = {
GatherImpInfo("__pseudo_gather_base_offsets_8", "__load_and_broadcast_8",
"__load_masked_8", 1),
GatherImpInfo("__pseudo_gather_base_offsets_16", "__load_and_broadcast_16",
"__load_masked_16", 2),
GatherImpInfo("__pseudo_gather_base_offsets_32", "__load_and_broadcast_32",
"__load_masked_32", 4),
GatherImpInfo("__pseudo_gather_base_offsets_64", "__load_and_broadcast_64",
"__load_masked_64", 8)
};
ScatterImpInfo sInfo[] = {
ScatterImpInfo("__pseudo_scatter_base_offsets_8", "__pseudo_masked_store_8",
LLVMTypes::Int8VectorPointerType, 1),
ScatterImpInfo("__pseudo_scatter_base_offsets_16", "__pseudo_masked_store_16",
LLVMTypes::Int16VectorPointerType, 2),
ScatterImpInfo("__pseudo_scatter_base_offsets_32", "__pseudo_masked_store_32",
LLVMTypes::Int32VectorPointerType, 4),
ScatterImpInfo("__pseudo_scatter_base_offsets_64", "__pseudo_masked_store_64",
LLVMTypes::Int64VectorPointerType, 8)
};
bool modifiedAny = false;
restart:
for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
// Iterate over all of the instructions and look for calls to
// __pseudo_*_base_offsets_* calls.
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
if (!callInst ||
(callInst->getCalledFunction() != gather32Func &&
callInst->getCalledFunction() != gather64Func &&
callInst->getCalledFunction() != scatter32Func &&
callInst->getCalledFunction() != scatter64Func))
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
if (callInst == NULL)
continue;
llvm::Function *calledFunc = callInst->getCalledFunction();
GatherImpInfo *gatherInfo = NULL;
ScatterImpInfo *scatterInfo = NULL;
for (unsigned int i = 0; i < sizeof(gInfo) / sizeof(gInfo[0]); ++i) {
if (calledFunc == gInfo[i].pseudoFunc) {
gatherInfo = &gInfo[i];
break;
}
}
for (unsigned int i = 0; i < sizeof(sInfo) / sizeof(sInfo[0]); ++i) {
if (calledFunc == sInfo[i].pseudoFunc) {
scatterInfo = &sInfo[i];
break;
}
}
if (gatherInfo == NULL && scatterInfo == NULL)
continue;
SourcePos pos;
bool ok = lGetSourcePosFromMetadata(callInst, &pos);
assert(ok);
bool isGather = (callInst->getCalledFunction() == gather32Func ||
callInst->getCalledFunction() == gather64Func);
bool is32 = (callInst->getCalledFunction() == gather32Func ||
callInst->getCalledFunction() == scatter32Func);
// Get the actual base pointer; note that it comes into the gather
// or scatter function bitcast to an i8 *, so we need to work back
// to get the pointer as the original type.
@@ -1921,7 +2029,7 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
if (!lScalarizeVector(callInst->getArgOperand(1), offsetElements))
continue;
llvm::Value *mask = callInst->getArgOperand(isGather ? 2 : 3);
llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 2 : 3);
if (lVectorValuesAllEqual(offsetElements)) {
// If all the offsets are equal, then compute the single
@@ -1929,14 +2037,15 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
// (arbitrarily).
llvm::Value *indices[1] = { offsetElements[0] };
llvm::Value *basei8 =
new llvm::BitCastInst(base, LLVMTypes::VoidPointerType, "base2i8", callInst);
new llvm::BitCastInst(base, LLVMTypes::VoidPointerType,
"base2i8", callInst);
lCopyMetadata(basei8, callInst);
llvm::Value *ptr =
llvm::GetElementPtrInst::Create(basei8, &indices[0], &indices[1],
"ptr", callInst);
lCopyMetadata(ptr, callInst);
if (isGather) {
if (gatherInfo != NULL) {
// A gather with everyone going to the same location is
// handled as a scalar load and broadcast across the lanes.
// Note that we do still have to pass the mask to the
@@ -1944,20 +2053,16 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
// access memory if the mask is all off (the location may
// be invalid in that case).
Debug(pos, "Transformed gather to scalar load and broadcast!");
llvm::Function *loadBroadcast =
m->module->getFunction(is32 ? "__load_and_broadcast_32" :
"__load_and_broadcast_64");
assert(loadBroadcast);
llvm::Value *args[2] = { ptr, mask };
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
llvm::ArrayRef<llvm::Value *> newArgArray(&args[0], &args[2]);
llvm::Instruction *newCall =
llvm::CallInst::Create(loadBroadcast, newArgArray,
llvm::CallInst::Create(gatherInfo->loadBroadcastFunc, newArgArray,
"load_broadcast", (llvm::Instruction *)NULL);
#else
llvm::Instruction *newCall =
llvm::CallInst::Create(loadBroadcast, &args[0], &args[2],
"load_broadcast");
llvm::CallInst::Create(gatherInfo->loadBroadcastFunc, &args[0],
&args[2], "load_broadcast");
#endif
lCopyMetadata(newCall, callInst);
llvm::ReplaceInstWithInst(callInst, newCall);
@@ -1977,8 +2082,8 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
ptr = new llvm::BitCastInst(ptr, llvm::PointerType::get(first->getType(), 0),
"ptr2rvalue_type", callInst);
lCopyMetadata(ptr, callInst);
llvm::Instruction *sinst =
new llvm::StoreInst(first, ptr, false, is32 ? 4 : 8 /* align */);
llvm::Instruction *sinst = new llvm::StoreInst(first, ptr, false,
scatterInfo->align);
lCopyMetadata(sinst, callInst);
llvm::ReplaceInstWithInst(callInst, sinst);
}
@@ -1987,7 +2092,8 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
goto restart;
}
if (lVectorIsLinear(offsetElements, is32 ? 4 : 8)) {
int step = gatherInfo ? gatherInfo->align : scatterInfo->align;
if (lVectorIsLinear(offsetElements, step)) {
// We have a linear sequence of memory locations being accessed
// starting with the location given by the offset from
// offsetElements[0], with stride of 4 or 8 bytes (for 32 bit
@@ -2003,53 +2109,38 @@ GSImprovementsPass::runOnBasicBlock(llvm::BasicBlock &bb) {
"ptr", callInst);
lCopyMetadata(ptr, callInst);
if (isGather) {
if (gatherInfo != NULL) {
Debug(pos, "Transformed gather to unaligned vector load!");
// FIXME: make this an aligned load when possible..
// FIXME: are there lurking potential bugs when e.g. the
// last few entries of the mask are off and the load ends
// up straddling a page boundary?
llvm::Function *loadMasked =
m->module->getFunction(is32 ? "__load_masked_32" : "__load_masked_64");
assert(loadMasked);
llvm::Value *args[2] = { ptr, mask };
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
llvm::ArrayRef<llvm::Value *> argArray(&args[0], &args[2]);
llvm::Instruction *newCall =
llvm::CallInst::Create(loadMasked, argArray, "load_masked",
(llvm::Instruction *)NULL);
llvm::CallInst::Create(gatherInfo->loadMaskedFunc, argArray,
"load_masked", (llvm::Instruction *)NULL);
#else
llvm::Instruction *newCall =
llvm::CallInst::Create(loadMasked, &args[0], &args[2], "load_masked");
llvm::CallInst::Create(gatherInfo->loadMaskedFunc, &args[0],
&args[2], "load_masked");
#endif
lCopyMetadata(newCall, callInst);
llvm::ReplaceInstWithInst(callInst, newCall);
}
else {
Debug(pos, "Transformed scatter to unaligned vector store!");
// FIXME: make this an aligned store when possible. Need
// to work through the messiness of issuing a pseudo store
// here.
llvm::Value *rvalue = callInst->getArgOperand(2);
llvm::Function *storeMasked =
m->module->getFunction(is32 ? "__pseudo_masked_store_32" :
"__pseudo_masked_store_64");
assert(storeMasked);
LLVM_TYPE_CONST llvm::Type *vecPtrType = is32 ?
LLVMTypes::Int32VectorPointerType : LLVMTypes::Int64VectorPointerType;
ptr = new llvm::BitCastInst(ptr, vecPtrType, "ptrcast", callInst);
ptr = new llvm::BitCastInst(ptr, scatterInfo->vecPtrType, "ptrcast",
callInst);
llvm::Value *args[3] = { ptr, rvalue, mask };
#if defined(LLVM_3_0) || defined(LLVM_3_0svn)
llvm::ArrayRef<llvm::Value *> argArray(&args[0], &args[3]);
llvm::Instruction *newCall =
llvm::CallInst::Create(storeMasked, argArray, "",
(llvm::Instruction *)NULL);
llvm::CallInst::Create(scatterInfo->maskedStoreFunc, argArray,
"", (llvm::Instruction *)NULL);
#else
llvm::Instruction *newCall =
llvm::CallInst::Create(storeMasked, &args[0], &args[3], "");
llvm::CallInst::Create(scatterInfo->maskedStoreFunc,
&args[0], &args[3], "");
#endif
lCopyMetadata(newCall, callInst);
llvm::ReplaceInstWithInst(callInst, newCall);
@@ -2097,31 +2188,50 @@ char LowerGSPass::ID = 0;
llvm::RegisterPass<LowerGSPass> lgs("lower-gs",
"Lower Gather/Scatter Pass");
struct LowerGSInfo {
LowerGSInfo(const char *pName, const char *aName, bool ig)
: isGather(ig) {
pseudoFunc = m->module->getFunction(pName);
actualFunc = m->module->getFunction(aName);
assert(pseudoFunc != NULL && actualFunc != NULL);
}
llvm::Function *pseudoFunc;
llvm::Function *actualFunc;
const bool isGather;
};
bool
LowerGSPass::runOnBasicBlock(llvm::BasicBlock &bb) {
llvm::Function *gather32Func = m->module->getFunction("__pseudo_gather_base_offsets_32");
llvm::Function *gather64Func = m->module->getFunction("__pseudo_gather_base_offsets_64");
llvm::Function *scatter32Func = m->module->getFunction("__pseudo_scatter_base_offsets_32");
llvm::Function *scatter64Func = m->module->getFunction("__pseudo_scatter_base_offsets_64");
assert(gather32Func && gather64Func && scatter32Func && scatter64Func);
LowerGSInfo lgsInfo[] = {
LowerGSInfo("__pseudo_gather_base_offsets_8", "__gather_base_offsets_i8", true),
LowerGSInfo("__pseudo_gather_base_offsets_16", "__gather_base_offsets_i16", true),
LowerGSInfo("__pseudo_gather_base_offsets_32", "__gather_base_offsets_i32", true),
LowerGSInfo("__pseudo_gather_base_offsets_32", "__gather_base_offsets_i32", true),
LowerGSInfo("__pseudo_scatter_base_offsets_8", "__scatter_base_offsets_i8", false),
LowerGSInfo("__pseudo_scatter_base_offsets_16", "__scatter_base_offsets_i16", false),
LowerGSInfo("__pseudo_scatter_base_offsets_32", "__scatter_base_offsets_i32", false),
LowerGSInfo("__pseudo_scatter_base_offsets_32", "__scatter_base_offsets_i32", false)
};
bool modifiedAny = false;
restart:
for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
// Loop over the instructions and find calls to the
// __pseudo_*_base_offsets_* functions.
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
if (!callInst ||
(callInst->getCalledFunction() != gather32Func &&
callInst->getCalledFunction() != gather64Func &&
callInst->getCalledFunction() != scatter32Func &&
callInst->getCalledFunction() != scatter64Func))
llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*iter);
if (callInst == NULL)
continue;
llvm::Function *calledFunc = callInst->getCalledFunction();
LowerGSInfo *info = NULL;
for (unsigned int i = 0; i < sizeof(lgsInfo) / sizeof(lgsInfo[0]); ++i) {
if (calledFunc == lgsInfo[i].pseudoFunc) {
info = &lgsInfo[i];
break;
}
}
if (info == NULL)
continue;
bool isGather = (callInst->getCalledFunction() == gather32Func ||
callInst->getCalledFunction() == gather64Func);
bool is32 = (callInst->getCalledFunction() == gather32Func ||
callInst->getCalledFunction() == scatter32Func);
// Get the source position from the metadata attached to the call
// instruction so that we can issue PerformanceWarning()s below.
@@ -2129,20 +2239,11 @@ LowerGSPass::runOnBasicBlock(llvm::BasicBlock &bb) {
bool ok = lGetSourcePosFromMetadata(callInst, &pos);
assert(ok);
if (isGather) {
llvm::Function *gFunc = m->module->getFunction(is32 ? "__gather_base_offsets_i32" :
"__gather_base_offsets_i64");
assert(gFunc);
callInst->setCalledFunction(gFunc);
callInst->setCalledFunction(info->actualFunc);
if (info->isGather)
PerformanceWarning(pos, "Gather required to compute value in expression.");
}
else {
llvm::Function *sFunc = m->module->getFunction(is32 ? "__scatter_base_offsets_i32" :
"__scatter_base_offsets_i64");
assert(sFunc);
callInst->setCalledFunction(sFunc);
else
PerformanceWarning(pos, "Scatter required for storing value.");
}
modifiedAny = true;
goto restart;
}
@@ -2286,25 +2387,41 @@ char MakeInternalFuncsStaticPass::ID = 0;
llvm::RegisterPass<MakeInternalFuncsStaticPass>
mifsp("make-internal-funcs-static", "Make Internal Funcs Static Pass");
bool
MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) {
const char *names[] = {
"__do_print", "__gather_base_offsets_i32", "__gather_base_offsets_i64",
"__gather_elt_32", "__gather_elt_64", "__load_and_broadcast_32",
"__load_and_broadcast_64", "__load_masked_32", "__load_masked_64",
"__masked_store_32", "__masked_store_64", "__masked_store_blend_32",
"__masked_store_blend_64", "__packed_load_active", "__packed_store_active",
"__scatter_base_offsets_i32", "__scatter_base_offsets_i64", "__scatter_elt_32",
"__scatter_elt_64", };
"__do_print",
"__gather_base_offsets_i8", "__gather_base_offsets_i16",
"__gather_base_offsets_i32", "__gather_base_offsets_i64",
"__gather_elt_8", "__gather_elt_16",
"__gather_elt_32", "__gather_elt_64",
"__load_and_broadcast_8", "__load_and_broadcast_16",
"__load_and_broadcast_32", "__load_and_broadcast_64",
"__load_masked_8", "__load_masked_16",
"__load_masked_32", "__load_masked_64",
"__masked_store_8", "__masked_store_16",
"__masked_store_32", "__masked_store_64",
"__masked_store_blend_8", "__masked_store_blend_16",
"__masked_store_blend_32", "__masked_store_blend_64",
"__packed_load_active", "__packed_store_active",
"__scatter_base_offsets_i8", "__scatter_base_offsets_i16",
"__scatter_base_offsets_i32", "__scatter_base_offsets_i64",
"__scatter_elt_8", "__scatter_elt_16",
"__scatter_elt_32", "__scatter_elt_64",
};
bool modifiedAny = false;
int count = sizeof(names) / sizeof(names[0]);
for (int i = 0; i < count; ++i) {
llvm::Function *f = m->module->getFunction(names[i]);
if (f != NULL)
if (f != NULL) {
f->setLinkage(llvm::GlobalValue::PrivateLinkage);
modifiedAny = true;
}
}
return true;
return modifiedAny;
}

parse.yy

@@ -102,15 +102,16 @@ static const char *lBuiltinTokens[] = {
"bool", "break", "case", "cbreak", "ccontinue", "cdo", "cfor", "char",
"cif", "cwhile", "const", "continue", "creturn", "default", "do", "double",
"else", "enum", "export", "extern", "false", "float", "for", "goto", "if",
"inline", "int", "int32", "int64", "launch", "print", "reference", "return",
"inline", "int", "int8", "int16", "int32", "int64", "launch", "print",
"reference", "return",
"static", "struct", "switch", "sync", "task", "true", "typedef", "uniform",
"unsigned", "varying", "void", "while", NULL
};
static const char *lParamListTokens[] = {
"bool", "char", "const", "double", "enum", "false", "float", "int",
"int32", "int64", "reference", "struct", "true", "uniform", "unsigned",
"varying", "void", NULL
"int8", "int16", "int32", "int64", "reference", "struct", "true",
"uniform", "unsigned", "varying", "void", NULL
};
%}
@@ -154,7 +155,7 @@ static const char *lParamListTokens[] = {
%token TOKEN_EXTERN TOKEN_EXPORT TOKEN_STATIC TOKEN_INLINE TOKEN_TASK
%token TOKEN_UNIFORM TOKEN_VARYING TOKEN_TYPEDEF TOKEN_SOA
%token TOKEN_CHAR TOKEN_INT TOKEN_UNSIGNED TOKEN_FLOAT TOKEN_DOUBLE
%token TOKEN_INT64 TOKEN_CONST TOKEN_VOID TOKEN_BOOL
%token TOKEN_INT8 TOKEN_INT16 TOKEN_INT64 TOKEN_CONST TOKEN_VOID TOKEN_BOOL
%token TOKEN_ENUM TOKEN_STRUCT TOKEN_TRUE TOKEN_FALSE TOKEN_REFERENCE
%token TOKEN_CASE TOKEN_DEFAULT TOKEN_IF TOKEN_ELSE TOKEN_SWITCH
@@ -587,7 +588,8 @@ type_specifier
atomic_var_type_specifier
: TOKEN_VOID { $$ = AtomicType::Void; }
| TOKEN_BOOL { $$ = AtomicType::VaryingBool; }
/* | TOKEN_CHAR { UNIMPLEMENTED; } */
| TOKEN_INT8 { $$ = AtomicType::VaryingInt8; }
| TOKEN_INT16 { $$ = AtomicType::VaryingInt16; }
| TOKEN_INT { $$ = AtomicType::VaryingInt32; }
| TOKEN_FLOAT { $$ = AtomicType::VaryingFloat; }
| TOKEN_DOUBLE { $$ = AtomicType::VaryingDouble; }


@@ -41,7 +41,6 @@
stdlib_core(8)
packed_load_and_store(8)
int8_16(8)
int64minmax(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -539,55 +538,14 @@ define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinli
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
define <8 x i32> @__load_and_broadcast_32(i8 *, <8 x i32> %mask) nounwind alwaysinline {
%mm = call i32 @__movmsk(<8 x i32> %mask)
%any_on = icmp ne i32 %mm, 0
br i1 %any_on, label %load, label %skip
load:
; TODO: make sure this becomes a vbroadcast...
%ptr = bitcast i8 * %0 to i32 *
%val = load i32 * %ptr
%ret0 = insertelement <8 x i32> undef, i32 %val, i32 0
%ret1 = insertelement <8 x i32> %ret0, i32 %val, i32 1
%ret2 = insertelement <8 x i32> %ret1, i32 %val, i32 2
%ret3 = insertelement <8 x i32> %ret2, i32 %val, i32 3
%ret4 = insertelement <8 x i32> %ret3, i32 %val, i32 4
%ret5 = insertelement <8 x i32> %ret4, i32 %val, i32 5
%ret6 = insertelement <8 x i32> %ret5, i32 %val, i32 6
%ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7
ret <8 x i32> %ret7
skip:
ret <8 x i32> undef
}
define <8 x i64> @__load_and_broadcast_64(i8 *, <8 x i32> %mask) nounwind alwaysinline {
%mm = call i32 @__movmsk(<8 x i32> %mask)
%any_on = icmp ne i32 %mm, 0
br i1 %any_on, label %load, label %skip
load:
; TODO: make sure this becomes a vbroadcast...
%ptr = bitcast i8 * %0 to i64 *
%val = load i64 * %ptr
%ret0 = insertelement <8 x i64> undef, i64 %val, i32 0
%ret1 = insertelement <8 x i64> %ret0, i64 %val, i32 1
%ret2 = insertelement <8 x i64> %ret1, i64 %val, i32 2
%ret3 = insertelement <8 x i64> %ret2, i64 %val, i32 3
%ret4 = insertelement <8 x i64> %ret3, i64 %val, i32 4
%ret5 = insertelement <8 x i64> %ret4, i64 %val, i32 5
%ret6 = insertelement <8 x i64> %ret5, i64 %val, i32 6
%ret7 = insertelement <8 x i64> %ret6, i64 %val, i32 7
ret <8 x i64> %ret7
skip:
ret <8 x i64> undef
}
load_and_broadcast(8, i8, 8)
load_and_broadcast(8, i16, 16)
load_and_broadcast(8, i32, 32)
load_and_broadcast(8, i64, 64)
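The load_and_broadcast() macro invocations replace the hand-written functions above while keeping their shape: test the mask, do a single scalar load if any lane is active, and splat the loaded value into every vector element. Roughly, this is the path behind reading a uniform location into a varying value; a sketch under that assumption (broadcast_sketch and src are hypothetical names):
void broadcast_sketch(uniform int16 src[], uniform float RET[]) {
    uniform int16 s = src[0];   // one scalar load, guarded by an any-lane-on check
    int16 v = s;                // splat the scalar across all program instances
    RET[programIndex] = v;
}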
; there is no AVX masked load instruction for i8 and i16 types, so use the
; generic per-lane implementations
load_masked(8, i8, 8, 1)
load_masked(8, i16, 16, 2)
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8 *, <8 x float> %mask)
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
@@ -623,6 +581,12 @@ define <8 x i64> @__load_masked_64(i8 *, <8 x i32> %mask) nounwind alwaysinline
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
; FIXME: there is no AVX instruction for these, but we could be clever
; by packing the bits down and setting the last 3/4 or half, respectively,
; of the mask to zero... Not sure if this would be a win in the end
gen_masked_store(8, i8, 8)
gen_masked_store(8, i16, 16)
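With no AVX masked-store instruction at these widths, the generic gen_masked_store() macro scalarizes the store so that only active lanes write. What it implements is essentially a store under a varying condition; a sketch (masked_store_sketch is a hypothetical name):
void masked_store_sketch(uniform int8 dst[], int8 value, bool active) {
    if (active)                      // per-lane predicate
        dst[programIndex] = value;   // only enabled lanes write to memory
}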
; note that the mask is the 2nd parameter, not the 3rd!
declare void @llvm.x86.avx.maskstore.ps.256(i8 *, <8 x float>, <8 x float>)
declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
@@ -660,13 +624,14 @@ define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
ret void
}
masked_store_blend_8_16_by_8()
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
<8 x float>) nounwind readnone
define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32>) nounwind alwaysinline {
%mask_as_float = bitcast <8 x i32> %2 to <8 x float>
%oldValue = load <8 x i32>* %0, align 4
%oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>

View File

@@ -36,7 +36,6 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
int8_16(4)
int64minmax(4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -380,29 +379,23 @@ define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
define void @__masked_store_32(<4 x i32>* nocapture, <4 x i32>, <4 x i32>) nounwind alwaysinline {
per_lane(4, <4 x i32> %2, `
; compute address for this one
%ptr_ID = getelementptr <4 x i32> * %0, i32 0, i32 LANE
%storeval_ID = extractelement <4 x i32> %1, i32 LANE
store i32 %storeval_ID, i32 * %ptr_ID')
ret void
}
define void @__masked_store_64(<4 x i64>* nocapture, <4 x i64>, <4 x i32>) nounwind alwaysinline {
per_lane(4, <4 x i32> %2, `
%ptr_ID = getelementptr <4 x i64> * %0, i32 0, i32 LANE
%storeval_ID = extractelement <4 x i64> %1, i32 LANE
store i64 %storeval_ID, i64 * %ptr_ID')
ret void
}
masked_store_blend_8_16_by_4()
gen_masked_store(4, i8, 8)
gen_masked_store(4, i16, 16)
gen_masked_store(4, i32, 32)
gen_masked_store(4, i64, 64)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(4, i8, 8)
load_and_broadcast(4, i16, 16)
load_and_broadcast(4, i32, 32)
load_and_broadcast(4, i64, 64)
load_masked(4, i8, 8, 1)
load_masked(4, i16, 16, 2)
load_masked(4, i32, 32, 4)
load_masked(4, i64, 64, 8)
@@ -411,7 +404,12 @@ load_masked(4, i64, 64, 8)
; define these with the macros from stdlib.m4
gen_gather(4, i8)
gen_gather(4, i16)
gen_gather(4, i32)
gen_gather(4, i64)
gen_scatter(4, i8)
gen_scatter(4, i16)
gen_scatter(4, i32)
gen_scatter(4, i64)
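gen_gather()/gen_scatter() supply the per-lane indexed loads and stores the compiler emits when an array is accessed with a varying index, now for the 8- and 16-bit element types as well. In ispc terms (an illustrative sketch; the gather/scatter tests added later in this commit exercise the same pattern):
export void gather_scatter_sketch(uniform float RET[]) {
    uniform int8 table[64];
    for (uniform int i = 0; i < 64; ++i)
        table[i] = i;                      // fill the table
    int idx = (programIndex * 3) & 63;     // varying index
    int8 g = table[idx];                   // gather: per-lane indexed loads
    table[idx] = g + 1;                    // scatter: per-lane indexed stores
    RET[programIndex] = g;
}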

View File

@@ -38,7 +38,6 @@
stdlib_core(8)
packed_load_and_store(8)
int8_16(8)
int64minmax(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -435,44 +434,29 @@ define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
define void @__masked_store_32(<8 x i32>* nocapture, <8 x i32>,
<8 x i32>) nounwind alwaysinline {
per_lane(8, <8 x i32> %2, `
; compute address for this one
%ptr_ID = getelementptr <8 x i32> * %0, i32 0, i32 LANE
%storeval_ID = extractelement <8 x i32> %1, i32 LANE
store i32 %storeval_ID, i32 * %ptr_ID')
ret void
}
define void @__masked_store_64(<8 x i64>* nocapture, <8 x i64>,
<8 x i32>) nounwind alwaysinline {
per_lane(8, <8 x i32> %2, `
; compute address for this one
%ptr_ID = getelementptr <8 x i64> * %0, i32 0, i32 LANE
%storeval_ID = extractelement <8 x i64> %1, i32 LANE
store i64 %storeval_ID, i64 * %ptr_ID')
ret void
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts
load_and_broadcast(8, i8, 8)
load_and_broadcast(8, i16, 16)
load_and_broadcast(8, i32, 32)
load_and_broadcast(8, i64, 64)
load_masked(8, i8, 8, 1)
load_masked(8, i16, 16, 2)
load_masked(8, i32, 32, 4)
load_masked(8, i64, 64, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
gen_gather(8, i8)
gen_gather(8, i16)
gen_gather(8, i32)
gen_gather(8, i64)
gen_scatter(8, i8)
gen_scatter(8, i16)
gen_scatter(8, i32)
gen_scatter(8, i64)
@@ -619,6 +603,13 @@ define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysi
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
gen_masked_store(8, i8, 8)
gen_masked_store(8, i16, 16)
gen_masked_store(8, i32, 32)
gen_masked_store(8, i64, 64)
masked_store_blend_8_16_by_8()
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
<4 x float>) nounwind readnone

View File

@@ -85,6 +85,14 @@ static inline float broadcast(float v, uniform int i) {
return __broadcast_float(v, i);
}
static inline int8 broadcast(int8 v, uniform int i) {
return __broadcast_int8(v, i);
}
static inline int16 broadcast(int16 v, uniform int i) {
return __broadcast_int16(v, i);
}
static inline int32 broadcast(int32 v, uniform int i) {
return __broadcast_int32(v, i);
}
@@ -101,6 +109,14 @@ static inline float rotate(float v, uniform int i) {
return __rotate_float(v, i);
}
static inline int8 rotate(int8 v, uniform int i) {
return __rotate_int8(v, i);
}
static inline int16 rotate(int16 v, uniform int i) {
return __rotate_int16(v, i);
}
static inline int32 rotate(int32 v, uniform int i) {
return __rotate_int32(v, i);
}
@@ -117,6 +133,14 @@ static inline float shuffle(float v, int i) {
return __shuffle_float(v, i);
}
static inline int8 shuffle(int8 v, int i) {
return __shuffle_int8(v, i);
}
static inline int16 shuffle(int16 v, int i) {
return __shuffle_int16(v, i);
}
static inline int32 shuffle(int32 v, int i) {
return __shuffle_int32(v, i);
}
@@ -133,6 +157,14 @@ static inline float shuffle(float v0, float v1, int i) {
return __shuffle2_float(v0, v1, i);
}
static inline int8 shuffle(int8 v0, int8 v1, int i) {
return __shuffle2_int8(v0, v1, i);
}
static inline int16 shuffle(int16 v0, int16 v1, int i) {
return __shuffle2_int16(v0, v1, i);
}
static inline int32 shuffle(int32 v0, int32 v1, int i) {
return __shuffle2_int32(v0, v1, i);
}
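These overloads extend the cross-lane operations to 8- and 16-bit values with unchanged semantics; the broadcast/rotate/shuffle tests added below cover them. A quick illustrative use (hypothetical, mirroring those tests):
export void crosslane_sketch(uniform float RET[]) {
    int8 v = programIndex + 1;
    int8 b = broadcast(v, 1);   // every lane receives lane 1's value (2)
    int8 r = rotate(v, 2);      // each lane takes the value two lanes over, wrapping
    int8 s = shuffle(v, programCount - 1 - programIndex);   // lane order reversed
    RET[programIndex] = b + r + s;
}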
@@ -150,11 +182,27 @@ static inline uniform float extract(float x, uniform int i) {
return floatbits(__extract_int32((int)intbits(x), i));
}
static inline uniform int8 extract(int8 x, uniform int i) {
return __extract_int8(x, i);
}
static inline uniform unsigned int8 extract(unsigned int8 x, uniform int i) {
return __extract_int8(x, (unsigned int)i);
}
static inline uniform int16 extract(int16 x, uniform int i) {
return __extract_int16(x, i);
}
static inline uniform unsigned int16 extract(unsigned int16 x, uniform int i) {
return __extract_int16(x, (unsigned int)i);
}
static inline uniform int32 extract(int32 x, uniform int i) {
return __extract_int32(x, i);
}
static inline uniform unsigned int32 extract(unsigned int32 x, uniform int i) {
return __extract_int32(x, (unsigned int)i);
}
@@ -175,12 +223,30 @@ static inline float insert(float x, uniform int i, uniform float v) {
return floatbits(__insert_int32((int)intbits(x), i, (int)intbits(v)));
}
static inline int8 insert(int8 x, uniform int i, uniform int8 v) {
return __insert_int8(x, i, v);
}
static inline unsigned int8 insert(unsigned int8 x, uniform int i,
uniform unsigned int8 v) {
return __insert_int8(x, (unsigned int)i, v);
}
static inline int16 insert(int16 x, uniform int i, uniform int16 v) {
return __insert_int16(x, i, v);
}
static inline unsigned int16 insert(unsigned int16 x, uniform int i,
uniform unsigned int16 v) {
return __insert_int16(x, (unsigned int)i, v);
}
static inline int32 insert(int32 x, uniform int i, uniform int32 v) {
return __insert_int32(x, i, v);
}
static inline unsigned int32 insert(unsigned int32 x, uniform int i,
uniform unsigned int32 v) {
return __insert_int32(x, (unsigned int)i, v);
}
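extract() returns a single lane of a varying value as a uniform, and insert() returns the varying value with one lane replaced; the 8- and 16-bit overloads forward to the corresponding new built-ins. A small sketch (hypothetical usage):
export void extract_insert_sketch(uniform float RET[]) {
    int8 v = programIndex;                // lanes hold 0, 1, 2, ...
    uniform int8 lane2 = extract(v, 2);   // read lane 2 into a uniform (== 2)
    v = insert(v, 0, lane2);              // lane 0 now holds 2 as well
    RET[programIndex] = v;
}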
@@ -218,7 +284,7 @@ static inline uniform bool all(bool v) {
return __movmsk(match) == (1 << programCount) - 1;
}
static inline uniform int32 popcnt(uniform int32 v) {
return __popcnt_int32(v);
}
@@ -473,52 +539,7 @@ ATOMIC_DECL_CMPXCHG(unsigned int64, int64)
ATOMIC_DECL_CMPXCHG(double, double)
///////////////////////////////////////////////////////////////////////////
// Load/store from/to 8/16-bit types
static inline int load_from_int8(uniform int a[], uniform int offset) {
return __load_int8(a, offset, __mask);
}
static inline unsigned int load_from_uint8(uniform unsigned int a[],
uniform int offset) {
return __load_uint8(a, offset, __mask);
}
static inline void store_to_int8(uniform int a[], uniform int offset,
unsigned int val) {
__store_int8(a, offset, val, __mask);
}
static inline void store_to_uint8(uniform unsigned int a[], uniform int offset,
unsigned int val) {
// Can use __store_int8 for unsigned stuff, since it truncates bits in
// either case.
__store_int8(a, offset, val, __mask);
}
static inline int load_from_int16(uniform int a[], uniform int offset) {
return __load_int16(a, offset, __mask);
}
static inline unsigned int load_from_uint16(uniform unsigned int a[],
uniform int offset) {
return __load_uint16(a, offset, __mask);
}
static inline void store_to_int16(uniform int a[], uniform int offset,
int val) {
__store_int16(a, offset, val, __mask);
}
static inline void store_to_uint16(uniform unsigned int a[], uniform int offset,
unsigned int val) {
// Can use __store_int16 for unsigned stuff, since it truncates bits in
// either case.
__store_int16(a, offset, val, __mask);
}
///////////////////////////////////////////////////////////////////////////
// Floating-Point Math
static inline float abs(float a) {
// Floating-point hack: zeroing the high bit clears the sign
@@ -622,6 +643,11 @@ static inline uniform float rcp(uniform float v) {
return __rcp_uniform_float(v);
}
///////////////////////////////////////////////////////////////////////////
// min/max
// float
static inline float min(float a, float b) {
return __min_varying_float(a, b);
}
@@ -630,14 +656,6 @@ static inline uniform float min(uniform float a, uniform float b) {
return __min_uniform_float(a, b);
}
static inline double min(double a, double b) {
return __min_varying_double(a, b);
}
static inline uniform double min(uniform double a, uniform double b) {
return __min_uniform_double(a, b);
}
static inline float max(float a, float b) {
return __max_varying_float(a, b);
}
@@ -646,6 +664,17 @@ static inline uniform float max(uniform float a, uniform float b) {
return __max_uniform_float(a, b);
}
// double
static inline double min(double a, double b) {
return __min_varying_double(a, b);
}
static inline uniform double min(uniform double a, uniform double b) {
return __min_uniform_double(a, b);
}
static inline double max(double a, double b) {
return __max_varying_double(a, b);
}
@@ -654,6 +683,80 @@ static inline uniform double max(uniform double a, uniform double b) {
return __max_uniform_double(a, b);
}
// int8
static inline uniform unsigned int8 min(uniform unsigned int8 a,
uniform unsigned int8 b) {
return (a < b) ? a : b;
}
static inline uniform unsigned int8 max(uniform unsigned int8 a,
uniform unsigned int8 b) {
return (a > b) ? a : b;
}
static inline uniform int8 min(uniform int8 a, uniform int8 b) {
return (a < b) ? a : b;
}
static inline uniform int8 max(uniform int8 a, uniform int8 b) {
return (a > b) ? a : b;
}
static inline unsigned int8 min(unsigned int8 a, unsigned int8 b) {
return (a < b) ? a : b;
}
static inline unsigned int8 max(unsigned int8 a, unsigned int8 b) {
return (a > b) ? a : b;
}
static inline int8 min(int8 a, int8 b) {
return (a < b) ? a : b;
}
static inline int8 max(int8 a, int8 b) {
return (a > b) ? a : b;
}
// int16
static inline uniform unsigned int16 min(uniform unsigned int16 a,
uniform unsigned int16 b) {
return (a < b) ? a : b;
}
static inline uniform unsigned int16 max(uniform unsigned int16 a,
uniform unsigned int16 b) {
return (a > b) ? a : b;
}
static inline uniform int16 min(uniform int16 a, uniform int16 b) {
return (a < b) ? a : b;
}
static inline uniform int16 max(uniform int16 a, uniform int16 b) {
return (a > b) ? a : b;
}
static inline unsigned int16 min(unsigned int16 a, unsigned int16 b) {
return (a < b) ? a : b;
}
static inline unsigned int16 max(unsigned int16 a, unsigned int16 b) {
return (a > b) ? a : b;
}
static inline int16 min(int16 a, int16 b) {
return (a < b) ? a : b;
}
static inline int16 max(int16 a, int16 b) {
return (a > b) ? a : b;
}
// int32
static inline unsigned int min(unsigned int a, unsigned int b) {
return __min_varying_uint32(a, b);
}
@@ -686,6 +789,8 @@ static inline uniform int max(uniform int a, uniform int b) {
return __max_uniform_int32(a, b);
}
// int64
static inline unsigned int64 min(unsigned int64 a, unsigned int64 b) {
return __min_varying_uint64(a, b);
}
@@ -718,6 +823,11 @@ static inline uniform int64 max(uniform int64 a, uniform int64 b) {
return __max_uniform_int64(a, b);
}
///////////////////////////////////////////////////////////////////////////
// clamps
// float
static inline float clamp(float v, float low, float high) {
return min(max(v, low), high);
}
@@ -726,6 +836,52 @@ static inline uniform float clamp(uniform float v, uniform float low, uniform fl
return min(max(v, low), high);
}
// int8
static inline unsigned int8 clamp(unsigned int8 v, unsigned int8 low,
unsigned int8 high) {
return min(max(v, low), high);
}
static inline uniform unsigned int8 clamp(uniform unsigned int8 v,
uniform unsigned int8 low,
uniform unsigned int8 high) {
return min(max(v, low), high);
}
static inline int8 clamp(int8 v, int8 low, int8 high) {
return min(max(v, low), high);
}
static inline uniform int8 clamp(uniform int8 v, uniform int8 low,
uniform int8 high) {
return min(max(v, low), high);
}
// int16
static inline unsigned int16 clamp(unsigned int16 v, unsigned int16 low,
unsigned int16 high) {
return min(max(v, low), high);
}
static inline uniform unsigned int16 clamp(uniform unsigned int16 v,
uniform unsigned int16 low,
uniform unsigned int16 high) {
return min(max(v, low), high);
}
static inline int16 clamp(int16 v, int16 low, int16 high) {
return min(max(v, low), high);
}
static inline uniform int16 clamp(uniform int16 v, uniform int16 low,
uniform int16 high) {
return min(max(v, low), high);
}
// int32
static inline unsigned int clamp(unsigned int v, unsigned int low, unsigned int high) {
return min(max(v, low), high);
}
@@ -735,15 +891,6 @@ static inline uniform unsigned int clamp(uniform unsigned int v, uniform unsigne
return min(max(v, low), high);
}
static inline unsigned int64 clamp(unsigned int64 v, unsigned int64 low, unsigned int64 high) {
return min(max(v, low), high);
}
static inline uniform unsigned int64 clamp(uniform unsigned int64 v, uniform unsigned int64 low,
uniform unsigned int64 high) {
return min(max(v, low), high);
}
static inline int clamp(int v, int low, int high) {
return min(max(v, low), high);
}
@@ -752,11 +899,25 @@ static inline uniform int clamp(uniform int v, uniform int low, uniform int high
return min(max(v, low), high);
}
// int64
static inline unsigned int64 clamp(unsigned int64 v, unsigned int64 low,
unsigned int64 high) {
return min(max(v, low), high);
}
static inline uniform unsigned int64 clamp(uniform unsigned int64 v,
uniform unsigned int64 low,
uniform unsigned int64 high) {
return min(max(v, low), high);
}
static inline int64 clamp(int64 v, int64 low, int64 high) {
return min(max(v, low), high);
}
static inline uniform int64 clamp(uniform int64 v, uniform int64 low,
uniform int64 high) {
return min(max(v, low), high);
}
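Every clamp() overload reduces to min(max(v, low), high), so the 8- and 16-bit variants inherit the comparison semantics of the min/max definitions above. A brief illustrative use (hypothetical):
export void clamp_sketch(uniform float RET[]) {
    int8 v = programIndex - 2;              // may be negative
    int8 c = clamp(v, (int8)0, (int8)7);    // held to [0, 7]
    RET[programIndex] = c;
}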

stdlib.m4
View File

@@ -566,6 +566,28 @@ declare i1 @__is_compile_time_constant_varying_int32(<$1 x i32>)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; vector ops
define internal i8 @__extract_int8(<$1 x i8>, i32) nounwind readnone alwaysinline {
%extract = extractelement <$1 x i8> %0, i32 %1
ret i8 %extract
}
define internal <$1 x i8> @__insert_int8(<$1 x i8>, i32,
i8) nounwind readnone alwaysinline {
%insert = insertelement <$1 x i8> %0, i8 %2, i32 %1
ret <$1 x i8> %insert
}
define internal i16 @__extract_int16(<$1 x i16>, i32) nounwind readnone alwaysinline {
%extract = extractelement <$1 x i16> %0, i32 %1
ret i16 %extract
}
define internal <$1 x i16> @__insert_int16(<$1 x i16>, i32,
i16) nounwind readnone alwaysinline {
%insert = insertelement <$1 x i16> %0, i16 %2, i32 %1
ret <$1 x i16> %insert
}
define internal i32 @__extract_int32(<$1 x i32>, i32) nounwind readnone alwaysinline {
%extract = extractelement <$1 x i32> %0, i32 %1
ret i32 %extract
@@ -588,6 +610,8 @@ define internal <$1 x i64> @__insert_int64(<$1 x i64>, i32,
ret <$1 x i64> %insert
}
shuffles($1, i8, int8, 1)
shuffles($1, i16, int16, 2)
shuffles($1, float, float, 4)
shuffles($1, i32, int32, 4)
shuffles($1, double, double, 8)
@@ -901,171 +925,6 @@ i64minmax($1,min,uint64,ult)
i64minmax($1,max,uint64,ugt)
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Definitions of 8 and 16-bit load and store functions
;;
;; The `int8_16' macro defines functions for loading and storing 8 and
;; 16-bit values in memory, converting to and from i32. (This is a
;; workaround for using in-memory values of these types in ispc programs,
;; since the compiler doesn't yet support 8- and 16-bit datatypes natively.)
;;
;; Arguments to pass to `int8_16':
;; $1: vector width of the target
define(`int8_16', `
define internal <$1 x i32> @__load_uint8([0 x i32] *, i32 %offset,
<$1 x i32> %mask) nounwind alwaysinline {
%mm = call i32 @__movmsk(<$1 x i32> %mask)
%any = icmp ne i32 %mm, 0
br i1 %any, label %doload, label %skip
doload:
%ptr8 = bitcast [0 x i32] *%0 to i8 *
%ptr = getelementptr i8 * %ptr8, i32 %offset
%ptr64 = bitcast i8 * %ptr to i`'eval(8*$1) *
%val = load i`'eval(8*$1) * %ptr64, align 1
%vval = bitcast i`'eval(8*$1) %val to <$1 x i8>
; unsigned, so zero-extend to i32...
%ret = zext <$1 x i8> %vval to <$1 x i32>
ret <$1 x i32> %ret
skip:
ret <$1 x i32> undef
}
define internal <$1 x i32> @__load_int8([0 x i32] *, i32 %offset,
<$1 x i32> %mask) nounwind alwaysinline {
%mm = call i32 @__movmsk(<$1 x i32> %mask)
%any = icmp ne i32 %mm, 0
br i1 %any, label %doload, label %skip
doload:
%ptr8 = bitcast [0 x i32] *%0 to i8 *
%ptr = getelementptr i8 * %ptr8, i32 %offset
%ptr64 = bitcast i8 * %ptr to i`'eval(8*$1) *
%val = load i`'eval(8*$1) * %ptr64, align 1
%vval = bitcast i`'eval(8*$1) %val to <$1 x i8>
; signed, so sign-extend to i32...
%ret = sext <$1 x i8> %vval to <$1 x i32>
ret <$1 x i32> %ret
skip:
ret <$1 x i32> undef
}
define internal <$1 x i32> @__load_uint16([0 x i32] *, i32 %offset,
<$1 x i32> %mask) nounwind alwaysinline {
%mm = call i32 @__movmsk(<$1 x i32> %mask)
%any = icmp ne i32 %mm, 0
br i1 %any, label %doload, label %skip
doload:
%ptr16 = bitcast [0 x i32] *%0 to i16 *
%ptr = getelementptr i16 * %ptr16, i32 %offset
%ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) *
%val = load i`'eval(16*$1) * %ptr64, align 2
%vval = bitcast i`'eval(16*$1) %val to <$1 x i16>
; unsigned, so use zero-extend...
%ret = zext <$1 x i16> %vval to <$1 x i32>
ret <$1 x i32> %ret
skip:
ret <$1 x i32> undef
}
define internal <$1 x i32> @__load_int16([0 x i32] *, i32 %offset,
<$1 x i32> %mask) nounwind alwaysinline {
%mm = call i32 @__movmsk(<$1 x i32> %mask)
%any = icmp ne i32 %mm, 0
br i1 %any, label %doload, label %skip
doload:
%ptr16 = bitcast [0 x i32] *%0 to i16 *
%ptr = getelementptr i16 * %ptr16, i32 %offset
%ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) *
%val = load i`'eval(16*$1) * %ptr64, align 2
%vval = bitcast i`'eval(16*$1) %val to <$1 x i16>
; signed, so use sign-extend...
%ret = sext <$1 x i16> %vval to <$1 x i32>
ret <$1 x i32> %ret
skip:
ret <$1 x i32> undef
}
define internal void @__store_int8([0 x i32] *, i32 %offset, <$1 x i32> %val32,
<$1 x i32> %mask) nounwind alwaysinline {
%mm = call i32 @__movmsk(<$1 x i32> %mask)
%any = icmp ne i32 %mm, 0
br i1 %any, label %dostore, label %skip
dostore:
%val = trunc <$1 x i32> %val32 to <$1 x i8>
%val64 = bitcast <$1 x i8> %val to i`'eval(8*$1)
%mask8 = trunc <$1 x i32> %mask to <$1 x i8>
%mask64 = bitcast <$1 x i8> %mask8 to i`'eval(8*$1)
%notmask = xor i`'eval(8*$1) %mask64, -1
%ptr8 = bitcast [0 x i32] *%0 to i8 *
%ptr = getelementptr i8 * %ptr8, i32 %offset
%ptr64 = bitcast i8 * %ptr to i`'eval(8*$1) *
;; load the old value, use logical ops to blend based on the mask, then
;; store the result back
%old = load i`'eval(8*$1) * %ptr64, align 1
%oldmasked = and i`'eval(8*$1) %old, %notmask
%newmasked = and i`'eval(8*$1) %val64, %mask64
%final = or i`'eval(8*$1) %oldmasked, %newmasked
store i`'eval(8*$1) %final, i`'eval(8*$1) * %ptr64, align 1
ret void
skip:
ret void
}
define internal void @__store_int16([0 x i32] *, i32 %offset, <$1 x i32> %val32,
<$1 x i32> %mask) nounwind alwaysinline {
%mm = call i32 @__movmsk(<$1 x i32> %mask)
%any = icmp ne i32 %mm, 0
br i1 %any, label %dostore, label %skip
dostore:
%val = trunc <$1 x i32> %val32 to <$1 x i16>
%val64 = bitcast <$1 x i16> %val to i`'eval(16*$1)
%mask16 = trunc <$1 x i32> %mask to <$1 x i16>
%mask64 = bitcast <$1 x i16> %mask16 to i`'eval(16*$1)
%notmask = xor i`'eval(16*$1) %mask64, -1
%ptr16 = bitcast [0 x i32] *%0 to i16 *
%ptr = getelementptr i16 * %ptr16, i32 %offset
%ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) *
;; as above, use mask to do blending with logical ops...
%old = load i`'eval(16*$1) * %ptr64, align 2
%oldmasked = and i`'eval(16*$1) %old, %notmask
%newmasked = and i`'eval(16*$1) %val64, %mask64
%final = or i`'eval(16*$1) %oldmasked, %newmasked
store i`'eval(16*$1) %final, i`'eval(16*$1) * %ptr64, align 2
ret void
skip:
ret void
}
'
)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Emit code to safely load a scalar value and broadcast it across the
;; elements of a vector. Parameters:
@@ -1150,6 +1009,105 @@ return:
}
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
;; emit code to do masked store as a set of per-lane scalar stores
;; parameters:
;; $1: target vector width
;; $2: llvm type of elements
;; $3: suffix for function name
define(`gen_masked_store', `
define void @__masked_store_$3(<$1 x $2>* nocapture, <$1 x $2>, <$1 x i32>) nounwind alwaysinline {
per_lane($1, <$1 x i32> %2, `
%ptr_ID = getelementptr <$1 x $2> * %0, i32 0, i32 LANE
%storeval_ID = extractelement <$1 x $2> %1, i32 LANE
store $2 %storeval_ID, $2 * %ptr_ID')
ret void
}
')
define(`masked_store_blend_8_16_by_4', `
define void @__masked_store_blend_8(<4 x i8>* nocapture, <4 x i8>,
<4 x i32>) nounwind alwaysinline {
%old = load <4 x i8> * %0
%old32 = bitcast <4 x i8> %old to i32
%new32 = bitcast <4 x i8> %1 to i32
%mask8 = trunc <4 x i32> %2 to <4 x i8>
%mask32 = bitcast <4 x i8> %mask8 to i32
%notmask32 = xor i32 %mask32, -1
%newmasked = and i32 %new32, %mask32
%oldmasked = and i32 %old32, %notmask32
%result = or i32 %newmasked, %oldmasked
%resultvec = bitcast i32 %result to <4 x i8>
store <4 x i8> %resultvec, <4 x i8> * %0
ret void
}
define void @__masked_store_blend_16(<4 x i16>* nocapture, <4 x i16>,
<4 x i32>) nounwind alwaysinline {
%old = load <4 x i16> * %0
%old64 = bitcast <4 x i16> %old to i64
%new64 = bitcast <4 x i16> %1 to i64
%mask16 = trunc <4 x i32> %2 to <4 x i16>
%mask64 = bitcast <4 x i16> %mask16 to i64
%notmask64 = xor i64 %mask64, -1
%newmasked = and i64 %new64, %mask64
%oldmasked = and i64 %old64, %notmask64
%result = or i64 %newmasked, %oldmasked
%resultvec = bitcast i64 %result to <4 x i16>
store <4 x i16> %resultvec, <4 x i16> * %0
ret void
}
')
define(`masked_store_blend_8_16_by_8', `
define void @__masked_store_blend_8(<8 x i8>* nocapture, <8 x i8>,
<8 x i32>) nounwind alwaysinline {
%old = load <8 x i8> * %0
%old64 = bitcast <8 x i8> %old to i64
%new64 = bitcast <8 x i8> %1 to i64
%mask8 = trunc <8 x i32> %2 to <8 x i8>
%mask64 = bitcast <8 x i8> %mask8 to i64
%notmask64 = xor i64 %mask64, -1
%newmasked = and i64 %new64, %mask64
%oldmasked = and i64 %old64, %notmask64
%result = or i64 %newmasked, %oldmasked
%resultvec = bitcast i64 %result to <8 x i8>
store <8 x i8> %resultvec, <8 x i8> * %0
ret void
}
define void @__masked_store_blend_16(<8 x i16>* nocapture, <8 x i16>,
<8 x i32>) nounwind alwaysinline {
%old = load <8 x i16> * %0
%old128 = bitcast <8 x i16> %old to i128
%new128 = bitcast <8 x i16> %1 to i128
%mask16 = trunc <8 x i32> %2 to <8 x i16>
%mask128 = bitcast <8 x i16> %mask16 to i128
%notmask128 = xor i128 %mask128, -1
%newmasked = and i128 %new128, %mask128
%oldmasked = and i128 %old128, %notmask128
%result = or i128 %newmasked, %oldmasked
%resultvec = bitcast i128 %result to <8 x i16>
store <8 x i16> %resultvec, <8 x i16> * %0
ret void
}
')
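Both blend macros perform the same branch-free read-modify-write: load the old vector, keep its bits where the mask is off, take the new bits where it is on, and store the result. The identity, shown on a single scalar for clarity (an illustrative sketch, not the macro itself):
export void blend_identity_sketch(uniform float RET[]) {
    uniform unsigned int8 old_val = 0x11, new_val = 0x22;
    uniform unsigned int8 mask = 0xff;   // all-ones: select the new value
    uniform unsigned int8 out = (new_val & mask) | (old_val & ~mask);
    RET[0] = out;                        // 0x22; a zero mask would keep 0x11
}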
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; packed load and store functions
;;

View File

@@ -1405,6 +1405,18 @@ lProcessPrintArg(Expr *expr, FunctionEmitContext *ctx, std::string &argTypes) {
return NULL;
}
// Promote int8 and int16 types to int32s before encoding them for print...
const Type *baseType = type->GetAsNonConstType()->GetAsUniformType();
if (baseType == AtomicType::UniformInt8 ||
baseType == AtomicType::UniformUInt8 ||
baseType == AtomicType::UniformInt16 ||
baseType == AtomicType::UniformUInt16) {
expr = new TypeCastExpr(type->IsUniformType() ? AtomicType::UniformInt32 :
AtomicType::VaryingInt32,
expr, expr->pos);
type = expr->GetType();
}
char t = lEncodeType(type->GetAsNonConstType());
if (t == '\0') {
Error(expr->pos, "Only atomic types are allowed in print statements; "

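So a print() of an 8- or 16-bit value is formatted through the int32 path. An illustrative use, assuming ispc's % placeholder syntax for print():
export void print_sketch() {
    int8 small = programIndex;
    print("lane values: %\n", small);   // promoted to int32 before encoding
}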
View File

@@ -8,7 +8,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform float x[47][47];
for (uniform int i = 0; i < 47; ++i)
for (uniform int j = 0; j < 47; ++j)
x[i][j] = 2+b-5;
// all are 2 except (3,4) = 0, (1,4) = 1, (2,4) = 1, (4,4) = 1
if (a == 3.)

View File

@@ -7,7 +7,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform float x[47][47];
for (uniform int i = 0; i < 47; ++i)
for (uniform int j = 0; j < 47; ++j)
x[i][j] = 2+b-5;
// all are 2 except (4,2) = 0, (4,...) = 1, (4,programCount-1)=2
if (a == 3.)

View File

@@ -8,7 +8,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform float x[47][47];
for (uniform int i = 0; i < 47; ++i)
for (uniform int j = 0; j < 47; ++j)
x[i][j] = 2+b-5;
x[a][b-1] = 0;
RET[programIndex] = x[2][a];

tests/broadcast-2.ispc
View File

@@ -0,0 +1,12 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
int16 a = aFOO[programIndex];
int16 b = broadcast(a, 2);
RET[programIndex] = b;
}
export void result(uniform float RET[]) {
RET[programIndex] = 3;
}

tests/broadcast-3.ispc
View File

@@ -0,0 +1,12 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int8 a = aFOO[programIndex];
int8 br = broadcast(a, (uniform int)b-2);
RET[programIndex] = br;
}
export void result(uniform float RET[]) {
RET[programIndex] = 4;
}

tests/gather-int16-1.ispc
View File

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform int16 x[programCount];
x[programIndex] = programIndex;
int a = aFOO[programIndex]-1;
unsigned int16 v;
if (programIndex < 2)
v = x[a];
else
v = 2;
RET[programIndex] = v;
}
export void result(uniform float RET[]) {
RET[programIndex] = 2;
RET[0] = 0;
RET[1] = 1;
}

tests/gather-int16.ispc
View File

@@ -0,0 +1,13 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform int16 x[programCount];
x[programIndex] = programIndex;
int a = aFOO[programIndex]-1;
unsigned int16 v = x[a];
RET[programIndex] = v;
}
export void result(uniform float RET[]) {
RET[programIndex] = programIndex;
}

tests/gather-int8-1.ispc
View File

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform int8 x[programCount];
x[programIndex] = programIndex;
int a = aFOO[programIndex]-1;
unsigned int8 v;
if (programIndex < 2)
v = x[a];
else
v = 2;
RET[programIndex] = v;
}
export void result(uniform float RET[]) {
RET[programIndex] = 2;
RET[0] = 0;
RET[1] = 1;
}

tests/gather-int8.ispc
View File

@@ -0,0 +1,13 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform int8 x[programCount];
x[programIndex] = programIndex;
int a = aFOO[programIndex]-1;
unsigned int8 v = x[a];
RET[programIndex] = v;
}
export void result(uniform float RET[]) {
RET[programIndex] = programIndex;
}

tests/int16-wrap.ispc
View File

@@ -0,0 +1,12 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bb) {
unsigned int16 a = aFOO[programIndex], b = bb;
RET[programIndex] = ((unsigned int16)4000*a)+b;
}
export void result(uniform float RET[]) {
RET[programIndex] = (((4000*(programIndex+1))&0xffff)+5)&0xffff;
}

tests/int8-wrap.ispc
View File

@@ -0,0 +1,12 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float bb) {
unsigned int8 a = aFOO[programIndex], b = bb;
RET[programIndex] = ((unsigned int8)100*a)+b;
}
export void result(uniform float RET[]) {
RET[programIndex] = (((100*(programIndex+1))&0xff)+5)&0xff;
}

View File

@@ -1,13 +1,17 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform int16 x[programCount];
x[programIndex] = aFOO[programIndex];
unsigned int16 v = 0;
if (programIndex & 1)
v = x[programIndex];
RET[programIndex] = v;
}
export void result(uniform float RET[]) {
if (programIndex & 1)
RET[programIndex] = 1+programIndex;
else
RET[programIndex] = 0;
}

View File

@@ -1,9 +1,9 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform int16 x[programCount];
x[programIndex] = aFOO[programIndex];
unsigned int16 v = x[programIndex];
RET[programIndex] = v;
}

View File

@@ -1,12 +1,17 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform int8 x[programCount];
x[programIndex] = aFOO[programIndex];
unsigned int8 v = 0;
if (programIndex & 1)
v = x[programIndex];
RET[programIndex] = v;
}
export void result(uniform float RET[]) {
if (programIndex & 1)
RET[programIndex] = 1+programIndex;
else
RET[programIndex] = 0;
}

View File

@@ -1,8 +1,9 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform int8 x[programCount];
x[programIndex] = aFOO[programIndex];
unsigned int8 v = x[programIndex];
RET[programIndex] = v;
}

View File

@@ -16,7 +16,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform Bar bar;
for (uniform int i = 0; i < 6; ++i)
for (uniform int j = 0; j < 18; ++j)
bar.foo[i].f[j] = 2.+b-5;
bar.foo[5].f[a] = a;
RET[programIndex] = bar.foo[b].f[a];

View File

@@ -1,8 +1,6 @@
export uniform int width() { return programCount; }
struct Foo {
float f[6];
};
@@ -16,7 +14,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform Bar bar;
for (uniform int i = 0; i < 6; ++i)
for (uniform int j = 0; j < 6; ++j)
bar.foo[i].f[j] = 2.+b-5;
RET[programIndex] = bar.foo[b].f[b];
}

View File

@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }
export void f_fu(uniform float ret[], uniform float aa[], uniform float b) {
uniform float foo[16];
for (uniform int i = 0; i < 16; ++i)
foo[i] = i;
uniform int i = 0;
foo[i++] += 1;

View File

@@ -6,10 +6,10 @@ void inc(reference float v) { ++v; }
export void f_fu(uniform float ret[], uniform float aa[], uniform float b) {
uniform float foo[32];
for (uniform int i = 0; i < 32; ++i)
foo[i] = 10+i;
int a = (int)aa[programIndex];
inc(foo[a]);
ret[programIndex] = foo[programIndex]-programIndex;
}
export void result(uniform float ret[]) {

tests/rotate-5.ispc
View File

@@ -0,0 +1,12 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int8 a = aFOO[programIndex];
int8 rot = rotate(a, 2);
RET[programIndex] = rot;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + (programIndex + 2) % programCount;
}

tests/rotate-6.ispc
View File

@@ -0,0 +1,12 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int16 a = aFOO[programIndex];
int16 rot = rotate(a, -1);
RET[programIndex] = rot;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + (programIndex + programCount - 1) % programCount;
}

View File

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform int16 x[programCount];
x[programIndex] = -1;
int a = aFOO[programIndex]-1;
if (programIndex < 3)
x[a] = programIndex;
RET[programIndex] = x[programIndex];
}
export void result(uniform float RET[]) {
RET[programIndex] = -1;
RET[0] = 0;
RET[1] = 1;
RET[2] = 2;
}

tests/scatter-int16.ispc
View File

@@ -0,0 +1,13 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform int16 x[programCount];
x[programIndex] = 0;
int a = aFOO[programIndex]-1;
x[a] = programIndex;
RET[programIndex] = x[programIndex];
}
export void result(uniform float RET[]) {
RET[programIndex] = programIndex;
}

tests/scatter-int8-1.ispc
View File

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform int8 x[programCount];
x[programIndex] = -1;
int a = aFOO[programIndex]-1;
if (programIndex < 3)
x[a] = programIndex;
RET[programIndex] = x[programIndex];
}
export void result(uniform float RET[]) {
RET[programIndex] = -1;
RET[0] = 0;
RET[1] = 1;
RET[2] = 2;
}

tests/scatter-int8.ispc
View File

@@ -0,0 +1,13 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform int8 x[programCount];
x[programIndex] = 0;
int a = aFOO[programIndex]-1;
x[a] = programIndex;
RET[programIndex] = x[programIndex];
}
export void result(uniform float RET[]) {
RET[programIndex] = programIndex;
}

tests/shuffle-3.ispc
View File

@@ -0,0 +1,12 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int8 a = aFOO[programIndex];
int8 shuf = shuffle(a, 1);
RET[programIndex] = shuf;
}
export void result(uniform float RET[]) {
RET[programIndex] = 2;
}

tests/shuffle-4.ispc
View File

@@ -0,0 +1,13 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int16 a = aFOO[programIndex];
int reverse = programCount - 1 - programIndex;
int16 shuf = shuffle(a, reverse);
RET[programIndex] = shuf;
}
export void result(uniform float RET[]) {
RET[programIndex] = programCount - programIndex;
}

tests/shuffle-5.ispc
View File

@@ -0,0 +1,13 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int8 a = aFOO[programIndex];
int reverse = programCount - 1 - programIndex + (int)b - 5;
int8 shuf = shuffle(a, reverse);
RET[programIndex] = shuf;
}
export void result(uniform float RET[]) {
RET[programIndex] = programCount - programIndex;
}

tests/shuffle2-11.ispc
View File

@@ -0,0 +1,13 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int16 aa = aFOO[programIndex];
int16 bb = aa + programCount;
int16 shuf = shuffle(aa, bb, 2*programIndex);
RET[programIndex] = shuf;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + 2*programIndex;
}

tests/shuffle2-6.ispc
View File

@@ -0,0 +1,13 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int8 aa = aFOO[programIndex];
int8 bb = aa + programCount;
int8 shuf = shuffle(aa, bb, 1);
RET[programIndex] = shuf;
}
export void result(uniform float RET[]) {
RET[programIndex] = 2;
}

tests/shuffle2-7.ispc
View File

@@ -0,0 +1,13 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int16 aa = aFOO[programIndex];
int16 bb = aa + programCount;
int16 shuf = shuffle(aa, bb, programCount + 1);
RET[programIndex] = shuf;
}
export void result(uniform float RET[]) {
RET[programIndex] = 2 + programCount;
}

tests/shuffle2-8.ispc
View File

@@ -0,0 +1,13 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int8 aa = aFOO[programIndex];
int8 bb = aa + programCount;
int8 shuf = shuffle(aa, bb, programIndex + 2);
RET[programIndex] = shuf;
}
export void result(uniform float RET[]) {
RET[programIndex] = 3 + programIndex;
}

tests/shuffle2-9.ispc
View File

@@ -0,0 +1,13 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int16 aa = aFOO[programIndex];
int16 bb = aa + programCount;
int16 shuf = shuffle(aa, bb, programIndex + 2 + (int)b - 5);
RET[programIndex] = shuf;
}
export void result(uniform float RET[]) {
RET[programIndex] = 3 + programIndex;
}

View File

@@ -1,16 +1,15 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform unsigned int16 x[2*programCount];
for (uniform int i = 0; i < 2*programCount; ++i)
x[i] = 0xffff;
unsigned int16 val = aFOO[programIndex];
x[2+programIndex] = val;
RET[programIndex] = x[1+programIndex];
}
export void result(uniform float RET[]) {
RET[programIndex] = programIndex;
RET[0] = 65535;
}

tests/store-int16-2.ispc
View File

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform unsigned int16 x[2*programCount];
for (uniform int i = 0; i < 2*programCount; ++i)
x[i] = 0xffff;
unsigned int16 val = aFOO[programIndex];
if (programIndex & 1)
x[2+programIndex] = val;
RET[programIndex] = x[1+programIndex];
}
export void result(uniform float RET[]) {
if (programIndex & 1)
RET[programIndex] = 65535;
else
RET[programIndex] = programIndex;
RET[0] = 65535;
}

View File

@@ -1,16 +1,15 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform int16 x[2*programCount];
for (uniform int i = 0; i < 2*programCount; ++i)
x[i] = 0xffff;
unsigned int8 val = aFOO[programIndex];
x[2+programIndex] = val;
RET[programIndex] = x[1+programIndex];
}
export void result(uniform float RET[]) {
RET[programIndex] = programIndex;
RET[0] = -1.;
}

View File

@@ -1,16 +1,15 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform unsigned int8 x[2*programCount];
for (uniform int i = 0; i < 2*programCount; ++i)
x[i] = 0xff;
unsigned int8 val = aFOO[programIndex];
x[2+programIndex] = val;
RET[programIndex] = x[1+programIndex];
}
export void result(uniform float RET[]) {
RET[programIndex] = programIndex;
RET[0] = 255;
}

tests/store-int8-2.ispc
View File

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform unsigned int8 x[2*programCount];
for (uniform int i = 0; i < 2*programCount; ++i)
x[i] = 0xff;
unsigned int8 val = aFOO[programIndex];
if (programIndex & 1)
x[2+programIndex] = val;
RET[programIndex] = x[1+programIndex];
}
export void result(uniform float RET[]) {
if (programIndex & 1)
RET[programIndex] = 255;
else
RET[programIndex] = programIndex;
RET[0] = 255;
}

View File

@@ -1,13 +1,12 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
uniform int8 x[2*programCount];
for (uniform int i = 0; i < 2*programCount; ++i)
x[i] = 0xff;
unsigned int8 val = aFOO[programIndex];
x[2+programIndex] = val;
RET[programIndex] = x[1+programIndex];
}
export void result(uniform float RET[]) {

View File

@@ -4,12 +4,12 @@ export uniform int width() { return programCount; }
export void f_fu(uniform float ret[], uniform float aa[], uniform float b) {
uniform int foo[10];
for (uniform int i = 0; i < 10; ++i)
foo[i] = 10+i;
int bb = b;
foo[bb] = 0;
ret[programIndex] = foo[4] + foo[5];
}
export void result(uniform float ret[]) {
ret[programIndex] = 14;
}

type.cpp
View File

@@ -74,6 +74,14 @@ lShouldPrintName(const std::string &name) {
const AtomicType *AtomicType::UniformBool = new AtomicType(TYPE_BOOL, true, false);
const AtomicType *AtomicType::VaryingBool = new AtomicType(TYPE_BOOL, false, false);
const AtomicType *AtomicType::UniformInt8 = new AtomicType(TYPE_INT8, true, false);
const AtomicType *AtomicType::VaryingInt8 = new AtomicType(TYPE_INT8, false, false);
const AtomicType *AtomicType::UniformUInt8 = new AtomicType(TYPE_UINT8, true, false);
const AtomicType *AtomicType::VaryingUInt8 = new AtomicType(TYPE_UINT8, false, false);
const AtomicType *AtomicType::UniformInt16 = new AtomicType(TYPE_INT16, true, false);
const AtomicType *AtomicType::VaryingInt16 = new AtomicType(TYPE_INT16, false, false);
const AtomicType *AtomicType::UniformUInt16 = new AtomicType(TYPE_UINT16, true, false);
const AtomicType *AtomicType::VaryingUInt16 = new AtomicType(TYPE_UINT16, false, false);
const AtomicType *AtomicType::UniformInt32 = new AtomicType(TYPE_INT32, true, false);
const AtomicType *AtomicType::VaryingInt32 = new AtomicType(TYPE_INT32, false, false);
const AtomicType *AtomicType::UniformUInt32 = new AtomicType(TYPE_UINT32, true, false);
@@ -89,6 +97,14 @@ const AtomicType *AtomicType::VaryingDouble = new AtomicType(TYPE_DOUBLE, false,
const AtomicType *AtomicType::UniformConstBool = new AtomicType(TYPE_BOOL, true, true);
const AtomicType *AtomicType::VaryingConstBool = new AtomicType(TYPE_BOOL, false, true);
const AtomicType *AtomicType::UniformConstInt8 = new AtomicType(TYPE_INT8, true, true);
const AtomicType *AtomicType::VaryingConstInt8 = new AtomicType(TYPE_INT8, false, true);
const AtomicType *AtomicType::UniformConstUInt8 = new AtomicType(TYPE_UINT8, true, true);
const AtomicType *AtomicType::VaryingConstUInt8 = new AtomicType(TYPE_UINT8, false, true);
const AtomicType *AtomicType::UniformConstInt16 = new AtomicType(TYPE_INT16, true, true);
const AtomicType *AtomicType::VaryingConstInt16 = new AtomicType(TYPE_INT16, false, true);
const AtomicType *AtomicType::UniformConstUInt16 = new AtomicType(TYPE_UINT16, true, true);
const AtomicType *AtomicType::VaryingConstUInt16 = new AtomicType(TYPE_UINT16, false, true);
const AtomicType *AtomicType::UniformConstInt32 = new AtomicType(TYPE_INT32, true, true);
const AtomicType *AtomicType::VaryingConstInt32 = new AtomicType(TYPE_INT32, false, true);
const AtomicType *AtomicType::UniformConstUInt32 = new AtomicType(TYPE_UINT32, true, true);
@@ -101,6 +117,7 @@ const AtomicType *AtomicType::UniformConstUInt64 = new AtomicType(TYPE_UINT64, t
const AtomicType *AtomicType::VaryingConstUInt64 = new AtomicType(TYPE_UINT64, false, true);
const AtomicType *AtomicType::UniformConstDouble = new AtomicType(TYPE_DOUBLE, true, true);
const AtomicType *AtomicType::VaryingConstDouble = new AtomicType(TYPE_DOUBLE, false, true);
const AtomicType *AtomicType::Void = new AtomicType(TYPE_VOID, true, false);
@@ -123,14 +140,17 @@ AtomicType::IsFloatType() const {
bool
AtomicType::IsIntType() const {
return (basicType == TYPE_INT8 || basicType == TYPE_UINT8 ||
basicType == TYPE_INT16 || basicType == TYPE_UINT16 ||
basicType == TYPE_INT32 || basicType == TYPE_UINT32 ||
basicType == TYPE_INT64 || basicType == TYPE_UINT64);
}
bool
AtomicType::IsUnsignedType() const {
return (basicType == TYPE_UINT8 || basicType == TYPE_UINT16 ||
basicType == TYPE_UINT32 || basicType == TYPE_UINT64);
}
@@ -151,10 +171,18 @@ AtomicType::GetAsUnsignedType() const {
if (IsUnsignedType())
return this;
if (this == AtomicType::UniformInt8) return AtomicType::UniformUInt8;
else if (this == AtomicType::VaryingInt8) return AtomicType::VaryingUInt8;
else if (this == AtomicType::UniformInt16) return AtomicType::UniformUInt16;
else if (this == AtomicType::VaryingInt16) return AtomicType::VaryingUInt16;
else if (this == AtomicType::UniformInt32) return AtomicType::UniformUInt32;
else if (this == AtomicType::VaryingInt32) return AtomicType::VaryingUInt32;
else if (this == AtomicType::UniformInt64) return AtomicType::UniformUInt64;
else if (this == AtomicType::VaryingInt64) return AtomicType::VaryingUInt64;
else if (this == AtomicType::UniformConstInt8) return AtomicType::UniformConstUInt8;
else if (this == AtomicType::VaryingConstInt8) return AtomicType::VaryingConstUInt8;
else if (this == AtomicType::UniformConstInt16) return AtomicType::UniformConstUInt16;
else if (this == AtomicType::VaryingConstInt16) return AtomicType::VaryingConstUInt16;
else if (this == AtomicType::UniformConstInt32) return AtomicType::UniformConstUInt32;
else if (this == AtomicType::VaryingConstInt32) return AtomicType::VaryingConstUInt32;
else if (this == AtomicType::UniformConstInt64) return AtomicType::UniformConstUInt64;
@@ -170,6 +198,10 @@ AtomicType::GetAsConstType() const {
switch (basicType) {
case TYPE_BOOL: return isUniform ? UniformConstBool : VaryingConstBool;
case TYPE_INT8: return isUniform ? UniformConstInt8 : VaryingConstInt8;
case TYPE_UINT8: return isUniform ? UniformConstUInt8 : VaryingConstUInt8;
case TYPE_INT16: return isUniform ? UniformConstInt16 : VaryingConstInt16;
case TYPE_UINT16: return isUniform ? UniformConstUInt16 : VaryingConstUInt16;
case TYPE_INT32: return isUniform ? UniformConstInt32 : VaryingConstInt32;
case TYPE_UINT32: return isUniform ? UniformConstUInt32 : VaryingConstUInt32;
case TYPE_FLOAT: return isUniform ? UniformConstFloat : VaryingConstFloat;
@@ -190,6 +222,10 @@ AtomicType::GetAsNonConstType() const {
switch (basicType) {
case TYPE_BOOL: return isUniform ? UniformBool : VaryingBool;
case TYPE_INT8: return isUniform ? UniformInt8 : VaryingInt8;
case TYPE_UINT8: return isUniform ? UniformUInt8 : VaryingUInt8;
case TYPE_INT16: return isUniform ? UniformInt16 : VaryingInt16;
case TYPE_UINT16: return isUniform ? UniformUInt16 : VaryingUInt16;
case TYPE_INT32: return isUniform ? UniformInt32 : VaryingInt32;
case TYPE_UINT32: return isUniform ? UniformUInt32 : VaryingUInt32;
case TYPE_FLOAT: return isUniform ? UniformFloat : VaryingFloat;
@@ -216,13 +252,17 @@ AtomicType::GetAsVaryingType() const {
switch (basicType) {
case TYPE_VOID: return this;
case TYPE_BOOL: return isConst ? VaryingConstBool : VaryingBool;
case TYPE_INT8: return isConst ? VaryingConstInt8 : VaryingInt8;
case TYPE_UINT8: return isConst ? VaryingConstUInt8 : VaryingUInt8;
case TYPE_INT16: return isConst ? VaryingConstInt16 : VaryingInt16;
case TYPE_UINT16: return isConst ? VaryingConstUInt16 : VaryingUInt16;
case TYPE_INT32: return isConst ? VaryingConstInt32 : VaryingInt32;
case TYPE_UINT32: return isConst ? VaryingConstUInt32 : VaryingUInt32;
case TYPE_FLOAT: return isConst ? VaryingConstFloat : VaryingFloat;
case TYPE_INT64: return isConst ? VaryingConstInt64 : VaryingInt64;
case TYPE_UINT64: return isConst ? VaryingConstUInt64 : VaryingUInt64;
case TYPE_DOUBLE: return isConst ? VaryingConstDouble : VaryingDouble;
default: FATAL("Logic error in AtomicType::GetAsVaryingType()");
}
return NULL;
@@ -236,13 +276,17 @@ AtomicType::GetAsUniformType() const {
switch (basicType) {
case TYPE_VOID: return this;
case TYPE_BOOL: return isConst ? UniformConstBool : UniformBool;
case TYPE_INT8: return isConst ? UniformConstInt8 : UniformInt8;
case TYPE_UINT8: return isConst ? UniformConstUInt8 : UniformUInt8;
case TYPE_INT16: return isConst ? UniformConstInt16 : UniformInt16;
case TYPE_UINT16: return isConst ? UniformConstUInt16 : UniformUInt16;
case TYPE_INT32: return isConst ? UniformConstInt32 : UniformInt32;
case TYPE_UINT32: return isConst ? UniformConstUInt32 : UniformUInt32;
case TYPE_FLOAT: return isConst ? UniformConstFloat : UniformFloat;
case TYPE_INT64: return isConst ? UniformConstInt64 : UniformInt64;
case TYPE_UINT64: return isConst ? UniformConstUInt64 : UniformUInt64;
case TYPE_DOUBLE: return isConst ? UniformConstDouble : UniformDouble;
default: FATAL("Logic error in AtomicType::GetAsUniformType()");
}
return NULL;
@@ -267,6 +311,10 @@ AtomicType::GetString() const {
switch (basicType) {
case TYPE_VOID: ret += "void"; break;
case TYPE_BOOL: ret += "bool"; break;
case TYPE_INT8: ret += "int8"; break;
case TYPE_UINT8: ret += "unsigned int8"; break;
case TYPE_INT16: ret += "int16"; break;
case TYPE_UINT16: ret += "unsigned int16"; break;
case TYPE_INT32: ret += "int32"; break;
case TYPE_UINT32: ret += "unsigned int32"; break;
case TYPE_FLOAT: ret += "float"; break;
@@ -288,6 +336,10 @@ AtomicType::Mangle() const {
switch (basicType) {
case TYPE_VOID: ret += "v"; break;
case TYPE_BOOL: ret += "b"; break;
case TYPE_INT8: ret += "t"; break;
case TYPE_UINT8: ret += "T"; break;
case TYPE_INT16: ret += "s"; break;
case TYPE_UINT16: ret += "S"; break;
case TYPE_INT32: ret += "i"; break;
case TYPE_UINT32: ret += "u"; break;
case TYPE_FLOAT: ret += "f"; break;
@@ -309,12 +361,16 @@ AtomicType::GetCDeclaration(const std::string &name) const {
switch (basicType) {
case TYPE_VOID: ret += "void"; break;
case TYPE_BOOL: ret += "bool"; break;
case TYPE_INT8: ret += "int8_t"; break;
case TYPE_UINT8: ret += "uint8_t"; break;
case TYPE_INT16: ret += "int16_t"; break;
case TYPE_UINT16: ret += "uint16_t"; break;
case TYPE_INT32: ret += "int32_t"; break;
case TYPE_UINT32: ret += "uint32_t"; break;
case TYPE_FLOAT: ret += "float"; break;
case TYPE_INT64: ret += "int64_t"; break;
case TYPE_UINT64: ret += "uint64_t"; break;
case TYPE_DOUBLE: ret += "double"; break;
default: FATAL("Logic error in AtomicType::GetCDeclaration()");
}
@@ -333,6 +389,12 @@ AtomicType::LLVMType(llvm::LLVMContext *ctx) const {
return llvm::Type::getVoidTy(*ctx);
case TYPE_BOOL:
return isUniform ? LLVMTypes::BoolType : LLVMTypes::BoolVectorType;
case TYPE_INT8:
case TYPE_UINT8:
return isUniform ? LLVMTypes::Int8Type : LLVMTypes::Int8VectorType;
case TYPE_INT16:
case TYPE_UINT16:
return isUniform ? LLVMTypes::Int16Type : LLVMTypes::Int16VectorType;
case TYPE_INT32:
case TYPE_UINT32:
return isUniform ? LLVMTypes::Int32Type : LLVMTypes::Int32VectorType;
@@ -364,6 +426,22 @@ AtomicType::GetDIType(llvm::DIDescriptor scope) const {
return m->diBuilder->createBasicType("bool", 32 /* size */, 32 /* align */,
llvm::dwarf::DW_ATE_unsigned);
break;
case TYPE_INT8:
return m->diBuilder->createBasicType("int8", 8 /* size */, 8 /* align */,
llvm::dwarf::DW_ATE_signed);
break;
case TYPE_UINT8:
return m->diBuilder->createBasicType("uint8", 8 /* size */, 8 /* align */,
llvm::dwarf::DW_ATE_unsigned);
break;
case TYPE_INT16:
return m->diBuilder->createBasicType("int16", 16 /* size */, 16 /* align */,
llvm::dwarf::DW_ATE_signed);
break;
case TYPE_UINT16:
return m->diBuilder->createBasicType("uint16", 16 /* size */, 16 /* align */,
llvm::dwarf::DW_ATE_unsigned);
break;
case TYPE_INT32:
return m->diBuilder->createBasicType("int32", 32 /* size */, 32 /* align */,
llvm::dwarf::DW_ATE_signed);

type.h
View File

@@ -210,6 +210,10 @@ public:
enum BasicType {
TYPE_VOID,
TYPE_BOOL,
TYPE_INT8,
TYPE_UINT8,
TYPE_INT16,
TYPE_UINT16,
TYPE_INT32,
TYPE_UINT32,
TYPE_FLOAT,
@@ -221,14 +225,22 @@ public:
const BasicType basicType;
static const AtomicType *UniformBool, *VaryingBool;
static const AtomicType *UniformInt8, *VaryingInt8;
static const AtomicType *UniformInt16, *VaryingInt16;
static const AtomicType *UniformInt32, *VaryingInt32;
static const AtomicType *UniformUInt8, *VaryingUInt8;
static const AtomicType *UniformUInt16, *VaryingUInt16;
static const AtomicType *UniformUInt32, *VaryingUInt32;
static const AtomicType *UniformFloat, *VaryingFloat;
static const AtomicType *UniformInt64, *VaryingInt64;
static const AtomicType *UniformUInt64, *VaryingUInt64;
static const AtomicType *UniformDouble, *VaryingDouble;
static const AtomicType *UniformConstBool, *VaryingConstBool;
static const AtomicType *UniformConstInt8, *VaryingConstInt8;
static const AtomicType *UniformConstInt16, *VaryingConstInt16;
static const AtomicType *UniformConstInt32, *VaryingConstInt32;
static const AtomicType *UniformConstUInt8, *VaryingConstUInt8;
static const AtomicType *UniformConstUInt16, *VaryingConstUInt16;
static const AtomicType *UniformConstUInt32, *VaryingConstUInt32;
static const AtomicType *UniformConstFloat, *VaryingConstFloat;
static const AtomicType *UniformConstInt64, *VaryingConstInt64;