From c14c3ceba62d2d3fb56ed149bf509adb179b5050 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Mon, 4 Jul 2011 12:07:00 +0100 Subject: [PATCH] Provide both signed and unsigned int variants of bitcode-based builtins. When creating function Symbols for functions that were defined in LLVM bitcode for the standard library, if any of the function parameters are integer types, create two ispc-side Symbols: one where the integer types are all signed and the other where they are all unsigned. This allows us to provide, for example, both store_to_int16(reference int a[], uniform int offset, int val) as well as store_to_int16(reference unsigned int a[], uniform int offset, unsigned int val). functions. Added some additional tests to exercise the new variants of these. Also fixed some cases where the __{load,store}_int{8,16} builtins would read from/write to memory even if the mask was all off (which could cause crashes in some cases.) --- Makefile | 9 ++-- builtins.cpp | 86 +++++++++++++++++++++++------------- ctx.cpp | 3 +- docs/ReleaseNotes.txt | 6 +++ docs/ispc.txt | 62 ++++++++++++++++---------- stdlib.ispc | 62 ++++++++++++++++++++------ stdlib.m4 | 95 ++++++++++++++++++++++++++++++++++++---- tests/packed-load-1.ispc | 4 +- tests/packed-store.ispc | 4 +- tests/shuffle2.ispc | 13 ++++++ tests/store-int16-1.ispc | 16 +++++++ tests/store-int16.ispc | 4 +- tests/store-int8-1.ispc | 16 +++++++ tests/store-int8.ispc | 4 +- 14 files changed, 293 insertions(+), 91 deletions(-) create mode 100644 tests/shuffle2.ispc create mode 100644 tests/store-int16-1.ispc create mode 100644 tests/store-int8-1.ispc diff --git a/Makefile b/Makefile index ec541339..e5d0541c 100644 --- a/Makefile +++ b/Makefile @@ -6,12 +6,9 @@ ARCH_OS = $(shell uname) ARCH_TYPE = $(shell arch) CLANG=clang -CLANG_LIBS = -lclangFrontendTool -lclangFrontend -lclangDriver \ - -lclangSerialization -lclangCodeGen -lclangParse -lclangSema \ - -lclangStaticAnalyzerFrontend -lclangStaticAnalyzerCheckers \ - 
-lclangStaticAnalyzerCore \ - -lclangAnalysis -lclangIndex -lclangRewrite \ - -lclangAST -lclangLex -lclangBasic +CLANG_LIBS = -lclangFrontend -lclangDriver \ + -lclangSerialization -lclangParse -lclangSema \ + -lclangAnalysis -lclangAST -lclangLex -lclangBasic LLVM_LIBS=$(shell llvm-config --ldflags --libs) -lpthread -ldl LLVM_CXXFLAGS=$(shell llvm-config --cppflags) diff --git a/builtins.cpp b/builtins.cpp index 9537dbf8..454d79d7 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -64,41 +64,46 @@ extern yy_buffer_state *yy_scan_string(const char *); /** Given an LLVM type, try to find the equivalent ispc type. Note that this is an under-constrained problem due to LLVM's type representations carrying less information than ispc's. (For example, LLVM doesn't - distinguish between signed and unsigned integers in its types.) + distinguish between signed and unsigned integers in its types.) + + Because this function is only used for generating ispc declarations of + functions defined in LLVM bitcode in the stdlib-*.ll files, in practice + we can get enough of what we need for the relevant cases to make things + work, partially with the help of the intAsUnsigned parameter, which + indicates whether LLVM integer types should be treated as being signed + or unsigned. - However, because this function is only used for generating ispc - declarations of functions defined in LLVM bitcode in the stdlib-*.ll - files, in practice we can get enough of what we need for the relevant - cases to make things work. */ static const Type * -lLLVMTypeToISPCType(const llvm::Type *t) { +lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) { if (t == LLVMTypes::VoidType) return AtomicType::Void; else if (t == LLVMTypes::BoolType) return AtomicType::UniformBool; else if (t == LLVMTypes::Int32Type) - return AtomicType::UniformInt32; + return intAsUnsigned ? 
AtomicType::UniformUInt32 : AtomicType::UniformInt32; else if (t == LLVMTypes::FloatType) return AtomicType::UniformFloat; else if (t == LLVMTypes::DoubleType) return AtomicType::UniformDouble; else if (t == LLVMTypes::Int64Type) - return AtomicType::UniformInt64; + return intAsUnsigned ? AtomicType::UniformUInt64 : AtomicType::UniformInt64; else if (t == LLVMTypes::Int32VectorType) - return AtomicType::VaryingInt32; + return intAsUnsigned ? AtomicType::VaryingUInt32 : AtomicType::VaryingInt32; else if (t == LLVMTypes::FloatVectorType) return AtomicType::VaryingFloat; else if (t == LLVMTypes::DoubleVectorType) return AtomicType::VaryingDouble; else if (t == LLVMTypes::Int64VectorType) - return AtomicType::VaryingInt64; + return intAsUnsigned ? AtomicType::VaryingUInt64 : AtomicType::VaryingInt64; else if (t == LLVMTypes::Int32PointerType) - return new ReferenceType(AtomicType::UniformInt32, false); + return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt32 : + AtomicType::UniformInt32, false); else if (t == LLVMTypes::FloatPointerType) return new ReferenceType(AtomicType::UniformFloat, false); else if (t == LLVMTypes::Int32VectorPointerType) - return new ReferenceType(AtomicType::VaryingInt32, false); + return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt32 : + AtomicType::VaryingInt32, false); else if (t == LLVMTypes::FloatVectorPointerType) return new ReferenceType(AtomicType::VaryingFloat, false); else if (llvm::isa(t)) { @@ -114,9 +119,11 @@ lLLVMTypeToISPCType(const llvm::Type *t) { const llvm::ArrayType *at = llvm::dyn_cast(pt->getElementType()); if (at && at->getNumElements() == 0 && - at->getElementType() == LLVMTypes::Int32Type) - return new ReferenceType(new ArrayType(AtomicType::UniformInt32, 0), - false); + at->getElementType() == LLVMTypes::Int32Type) { + const Type *eltType = intAsUnsigned ? 
AtomicType::UniformUInt32 : + AtomicType::UniformInt32; + return new ReferenceType(new ArrayType(eltType, 0), false); + } } return NULL; @@ -135,26 +142,43 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) { const llvm::FunctionType *ftype = func->getFunctionType(); std::string name = func->getName(); - const Type *returnType = lLLVMTypeToISPCType(ftype->getReturnType()); - if (!returnType) - // return type not representable in ispc -> not callable from ispc - return false; + // If the function has any parameters with integer types, we'll make + // two Symbols for two overloaded versions of the function, one with + // all of the integer types treated as signed integers and one with all + // of them treated as unsigned. + for (int i = 0; i < 2; ++i) { + bool intAsUnsigned = (i == 1); - // Iterate over the arguments and try to find their equivalent ispc - // types. - std::vector argTypes; - for (unsigned int i = 0; i < ftype->getNumParams(); ++i) { - const llvm::Type *llvmArgType = ftype->getParamType(i); - const Type *type = lLLVMTypeToISPCType(llvmArgType); - if (type == NULL) + const Type *returnType = lLLVMTypeToISPCType(ftype->getReturnType(), + intAsUnsigned); + if (!returnType) + // return type not representable in ispc -> not callable from ispc return false; - argTypes.push_back(type); + + // Iterate over the arguments and try to find their equivalent ispc + // types. Track if any of the arguments has an integer type. 
+ bool anyIntArgs = false; + std::vector argTypes; + for (unsigned int j = 0; j < ftype->getNumParams(); ++j) { + const llvm::Type *llvmArgType = ftype->getParamType(j); + const Type *type = lLLVMTypeToISPCType(llvmArgType, intAsUnsigned); + if (type == NULL) + return false; + anyIntArgs |= + (Type::Equal(type, lLLVMTypeToISPCType(llvmArgType, !intAsUnsigned)) == false); + argTypes.push_back(type); + } + + // Always create the symbol the first time through, in particular + // so that we get symbols for things with no integer types! + if (i == 0 || anyIntArgs == true) { + FunctionType *funcType = new FunctionType(returnType, argTypes, noPos); + Symbol *sym = new Symbol(name, noPos, funcType); + sym->function = func; + symbolTable->AddFunction(sym); + } } - FunctionType *funcType = new FunctionType(returnType, argTypes, noPos); - Symbol *sym = new Symbol(name, noPos, funcType); - sym->function = func; - symbolTable->AddFunction(sym); return true; } diff --git a/ctx.cpp b/ctx.cpp index c18fce2b..9548a777 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -695,7 +695,8 @@ FunctionEmitContext::LaneMask(llvm::Value *v) { // Call the target-dependent movmsk function to turn the vector mask // into an i32 value std::vector *mm = m->symbolTable->LookupFunction("__movmsk"); - assert(mm && mm->size() == 1); + // There should be one with signed int signature, one unsigned int. + assert(mm && mm->size() == 2); llvm::Function *fmm = (*mm)[0]->function; return CallInst(fmm, v, "val_movmsk"); } diff --git a/docs/ReleaseNotes.txt b/docs/ReleaseNotes.txt index d2aef0a8..f89fa4f8 100644 --- a/docs/ReleaseNotes.txt +++ b/docs/ReleaseNotes.txt @@ -1,5 +1,11 @@ === v1.0.3 === (not yet released) +There are now both 'signed' and 'unsigned' variants of the standard library +functions like packed_load_active() that take references to arrays of +signed int32s and unsigned int32s respectively.
(The +{load_from,store_to}_{int8,int16}() functions have similarly been augmented +to have both 'signed' and 'unsigned' variants.) + In initializer expressions with variable declarations, it is no longer legal to initialize arrays and structs with single scalar values that then initialize their members; they now must be initialized with initializer diff --git a/docs/ispc.txt b/docs/ispc.txt index 1106b62f..eb2dc50e 100644 --- a/docs/ispc.txt +++ b/docs/ispc.txt @@ -1777,24 +1777,31 @@ Packed Load and Store Operations -------------------------------- The standard library also offers routines for writing out and reading in -values from linear memory locations for the active program instances. -``packed_load_active()`` loads consecutive values from the given array, -starting at ``a[offset]``, loading one value for each currently-executing -program instance and storing it into that program instance's ``val`` -variable. It returns the total number of values loaded. Similarly, -``packed_store_active()`` stores the ``val`` values for each program -instances that executed the ``packed_store_active()`` call, storing the -results into the given array starting at the given offset. It returns the -total number of values stored. +values from linear memory locations for the active program instances. The +``packed_load_active()`` functions load consecutive values from the given +array, starting at ``a[offset]``, loading one value for each +currently-executing program instance and storing it into that program +instance's ``val`` variable. They return the total number of values +loaded. Similarly, the ``packed_store_active()`` functions store the +``val`` values for each program instances that executed the +``packed_store_active()`` call, storing the results into the given array +starting at the given offset. They return the total number of values +stored. 
:: - uniform unsigned int packed_load_active(uniform int a[], - uniform int offset, - reference int val) - uniform unsigned int packed_store_active(uniform int a[], - uniform int offset, - int val) + uniform int packed_load_active(uniform int a[], + uniform int offset, + reference int val) + uniform int packed_load_active(uniform unsigned int a[], + uniform int offset, + reference unsigned int val) + uniform int packed_store_active(uniform int a[], + uniform int offset, + int val) + uniform int packed_store_active(uniform unsigned int a[], + uniform int offset, + unsigned int val) As an example of how these functions can be used, the following code shows @@ -1845,24 +1852,31 @@ and this conversion step are necessary because ``ispc`` doesn't have native :: - unsigned int load_from_int8(uniform int a[], + int load_from_int8(uniform int a[], uniform int offset) + unsigned int load_from_uint8(uniform unsigned int a[], uniform int offset) void store_to_int8(uniform int a[], uniform int offset, + int val) + void store_to_uint8(uniform unsigned int a[], uniform int offset, unsigned int val) unsigned int load_from_int16(uniform int a[], uniform int offset) + unsigned int load_from_int16(uniform unsigned int a[], + uniform int offset) void store_to_int16(uniform int a[], uniform int offset, + int val) + void store_to_uint16(uniform unsigned int a[], uniform int offset, unsigned int val) There are three things to note in these functions. First, note that these -functions take ``unsigned int`` arrays as parameters; you need -to cast `the ``int8_t`` and ``int16_t`` pointers from the C/C++ side to -``unsigned int`` when passing them to ``ispc`` code. Second, although the -arrays are passed as ``unsigned int``, in the array indexing calculation, -with the ``offset`` parameter, they are treated as if they were ``int8`` or -``int16`` types. (i.e. the offset treated as being in terms of number of 8 -or 16-bit elements.)
Third, note that programIndex is implicitly added -to offset. +functions take either ``int`` or ``unsigned int`` arrays as parameters; you +need to cast the ``int8_t`` and ``int16_t`` pointers from the C/C++ side +to ``int`` or ``unsigned int`` when passing them to ``ispc`` code. Second, +although the arrays are passed as 32-bit integers, in the array indexing +calculation, with the ``offset`` parameter, they are treated as if they +were ``int8`` or ``int16`` types (i.e. the offset treated as being in terms +of number of 8 or 16-bit elements). Third, note that the value of +``programIndex`` is implicitly added to offset. The ``intbits()`` and ``floatbits()`` functions can be used to implement low-level floating-point bit twiddling. For example, ``intbits()`` returns diff --git a/stdlib.ispc b/stdlib.ispc index 8ba5410b..a775c680 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -181,7 +181,7 @@ static inline uniform bool all(bool v) { // As with any(), we need to explicitly mask v with the current program mask // so we're only looking at the current lanes bool match = ((v & __mask) == __mask); - return __movmsk(match) == (1 << programCount) - 1; + return __movmsk((int)match) == (1 << programCount) - 1; } static inline uniform int popcnt(uniform int v) { @@ -273,35 +273,71 @@ static inline uniform unsigned int reduce_max(unsigned int v) { /////////////////////////////////////////////////////////////////////////// // packed load, store -static inline uniform unsigned int packed_load_active(uniform int a[], uniform int start, - reference int vals) { +static inline uniform int +packed_load_active(uniform unsigned int a[], uniform int start, + reference unsigned int vals) { return __packed_load_active(a, start, vals, __mask); } -static inline uniform unsigned int packed_store_active(uniform int a[], uniform int start, - int vals) { +static inline uniform int +packed_store_active(uniform unsigned int a[], uniform int start, + unsigned int vals) { + return
__packed_store_active(a, start, vals, __mask); +} + +static inline uniform int packed_load_active(uniform int a[], uniform int start, + reference int vals) { + return __packed_load_active(a, start, vals, __mask); +} + +static inline uniform int packed_store_active(uniform int a[], uniform int start, + int vals) { return __packed_store_active(a, start, vals, __mask); } /////////////////////////////////////////////////////////////////////////// // Load/store from/to 8/16-bit types -static inline unsigned int load_from_int8(uniform int a[], uniform int offset) { - return __load_uint8(a, offset); +static inline int load_from_int8(uniform int a[], uniform int offset) { + return __load_int8(a, offset, __mask); +} + +static inline unsigned int load_from_uint8(uniform unsigned int a[], + uniform int offset) { + return __load_uint8(a, offset, __mask); } static inline void store_to_int8(uniform int a[], uniform int offset, - unsigned int val) { - __store_uint8(a, offset, val, __mask); + unsigned int val) { + __store_int8(a, offset, val, __mask); } -static inline unsigned int load_from_int16(uniform int a[], uniform int offset) { - return __load_uint16(a, offset); +static inline void store_to_uint8(uniform unsigned int a[], uniform int offset, + unsigned int val) { + // Can use __store_int8 for unsigned stuff, since it truncates bits in + // either case. 
+ __store_int8(a, offset, val, __mask); +} + +static inline int load_from_int16(uniform int a[], uniform int offset) { + return __load_int16(a, offset, __mask); +} + +static inline unsigned int load_from_int16(uniform unsigned int a[], + uniform int offset) { + return __load_uint16(a, offset, __mask); } static inline void store_to_int16(uniform int a[], uniform int offset, - unsigned int val) { - __store_uint16(a, offset, val, __mask); + int val) { + __store_int16(a, offset, val, __mask); +} + +static inline void store_to_uint16(uniform unsigned int a[], uniform int offset, + unsigned int val) { + // Can use __store_int16 for unsigned stuff, since it truncates bits in + // either case. + __store_int16(a, offset, val, __mask); } /////////////////////////////////////////////////////////////////////////// diff --git a/stdlib.m4 b/stdlib.m4 index 82cb471c..30c8c497 100644 --- a/stdlib.m4 +++ b/stdlib.m4 @@ -557,33 +557,101 @@ define internal float @__stdlib_pow(float, float) nounwind readnone alwaysinline ;; $1: vector width of the target define(`int8_16', ` -define internal <$1 x i32> @__load_uint8([0 x i32] *, i32 %offset) nounwind alwaysinline { +define internal <$1 x i32> @__load_uint8([0 x i32] *, i32 %offset, + <$1 x i32> %mask) nounwind alwaysinline { + %mm = call i32 @__movmsk(<$1 x i32> %mask) + %any = icmp ne i32 %mm, 0 + br i1 %any, label %doload, label %skip + +doload: %ptr8 = bitcast [0 x i32] *%0 to i8 * %ptr = getelementptr i8 * %ptr8, i32 %offset %ptr64 = bitcast i8 * %ptr to i`'eval(8*$1) * %val = load i`'eval(8*$1) * %ptr64, align 1 %vval = bitcast i`'eval(8*$1) %val to <$1 x i8> - ; were assuming unsigned, so zero-extend to i32... + ; unsigned, so zero-extend to i32... 
%ret = zext <$1 x i8> %vval to <$1 x i32> ret <$1 x i32> %ret + +skip: + ret <$1 x i32> undef } -define internal <$1 x i32> @__load_uint16([0 x i32] *, i32 %offset) nounwind alwaysinline { +define internal <$1 x i32> @__load_int8([0 x i32] *, i32 %offset, + <$1 x i32> %mask) nounwind alwaysinline { + %mm = call i32 @__movmsk(<$1 x i32> %mask) + %any = icmp ne i32 %mm, 0 + br i1 %any, label %doload, label %skip + +doload: + %ptr8 = bitcast [0 x i32] *%0 to i8 * + %ptr = getelementptr i8 * %ptr8, i32 %offset + %ptr64 = bitcast i8 * %ptr to i`'eval(8*$1) * + %val = load i`'eval(8*$1) * %ptr64, align 1 + + %vval = bitcast i`'eval(8*$1) %val to <$1 x i8> + ; signed, so sign-extend to i32... + %ret = sext <$1 x i8> %vval to <$1 x i32> + ret <$1 x i32> %ret + +skip: + ret <$1 x i32> undef +} + + +define internal <$1 x i32> @__load_uint16([0 x i32] *, i32 %offset, + <$1 x i32> %mask) nounwind alwaysinline { + %mm = call i32 @__movmsk(<$1 x i32> %mask) + %any = icmp ne i32 %mm, 0 + br i1 %any, label %doload, label %skip + +doload: %ptr16 = bitcast [0 x i32] *%0 to i16 * %ptr = getelementptr i16 * %ptr16, i32 %offset %ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) * %val = load i`'eval(16*$1) * %ptr64, align 2 %vval = bitcast i`'eval(16*$1) %val to <$1 x i16> - ; unsigned, so use zero-extent... + ; unsigned, so use zero-extend... 
%ret = zext <$1 x i16> %vval to <$1 x i32> ret <$1 x i32> %ret + +skip: + ret <$1 x i32> undef } -define internal void @__store_uint8([0 x i32] *, i32 %offset, <$1 x i32> %val32, - <$1 x i32> %mask) nounwind alwaysinline { + +define internal <$1 x i32> @__load_int16([0 x i32] *, i32 %offset, + <$1 x i32> %mask) nounwind alwaysinline { + %mm = call i32 @__movmsk(<$1 x i32> %mask) + %any = icmp ne i32 %mm, 0 + br i1 %any, label %doload, label %skip + +doload: + %ptr16 = bitcast [0 x i32] *%0 to i16 * + %ptr = getelementptr i16 * %ptr16, i32 %offset + %ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) * + %val = load i`'eval(16*$1) * %ptr64, align 2 + + %vval = bitcast i`'eval(16*$1) %val to <$1 x i16> + ; signed, so use sign-extend... + %ret = sext <$1 x i16> %vval to <$1 x i32> + ret <$1 x i32> %ret + +skip: + ret <$1 x i32> undef +} + + +define internal void @__store_int8([0 x i32] *, i32 %offset, <$1 x i32> %val32, + <$1 x i32> %mask) nounwind alwaysinline { + %mm = call i32 @__movmsk(<$1 x i32> %mask) + %any = icmp ne i32 %mm, 0 + br i1 %any, label %dostore, label %skip + +dostore: %val = trunc <$1 x i32> %val32 to <$1 x i8> %val64 = bitcast <$1 x i8> %val to i`'eval(8*$1) @@ -604,10 +672,18 @@ define internal void @__store_uint8([0 x i32] *, i32 %offset, <$1 x i32> %val32, store i`'eval(8*$1) %final, i`'eval(8*$1) * %ptr64, align 1 ret void + +skip: + ret void } -define internal void @__store_uint16([0 x i32] *, i32 %offset, <$1 x i32> %val32, - <$1 x i32> %mask) nounwind alwaysinline { +define internal void @__store_int16([0 x i32] *, i32 %offset, <$1 x i32> %val32, + <$1 x i32> %mask) nounwind alwaysinline { + %mm = call i32 @__movmsk(<$1 x i32> %mask) + %any = icmp ne i32 %mm, 0 + br i1 %any, label %dostore, label %skip + +dostore: %val = trunc <$1 x i32> %val32 to <$1 x i16> %val64 = bitcast <$1 x i16> %val to i`'eval(16*$1) @@ -627,6 +703,9 @@ define internal void @__store_uint16([0 x i32] *, i32 %offset, <$1 x i32> %val32 store i`'eval(16*$1) %final, 
i`'eval(16*$1) * %ptr64, align 2 ret void + +skip: + ret void } ' ) diff --git a/tests/packed-load-1.ispc b/tests/packed-load-1.ispc index e8dee003..d3ce0c1d 100644 --- a/tests/packed-load-1.ispc +++ b/tests/packed-load-1.ispc @@ -2,9 +2,9 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { - uniform int a[programCount]; + uniform unsigned int a[programCount]; a[programIndex] = aFOO[programIndex]; - int aa; + unsigned int aa; packed_load_active(a, 0, aa); RET[programIndex] = aa; } diff --git a/tests/packed-store.ispc b/tests/packed-store.ispc index bd8f1297..3c41f7d7 100644 --- a/tests/packed-store.ispc +++ b/tests/packed-store.ispc @@ -3,10 +3,10 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; - uniform int pack[programCount]; + uniform unsigned int pack[programCount]; for (uniform int i = 0; i < programCount; ++i) pack[i] = 0; - packed_store_active(pack, 0, a); + packed_store_active(pack, 0, (unsigned int)a); RET[programIndex] = pack[programIndex]; } diff --git a/tests/shuffle2.ispc b/tests/shuffle2.ispc new file mode 100644 index 00000000..8acbbf51 --- /dev/null +++ b/tests/shuffle2.ispc @@ -0,0 +1,13 @@ + +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + int32 aa = aFOO[programIndex]; + int32 bb = aa + programCount; + int32 shuf = shuffle(aa, bb, 1); + RET[programIndex] = shuf; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 2; +} diff --git a/tests/store-int16-1.ispc b/tests/store-int16-1.ispc new file mode 100644 index 00000000..aa3eb36a --- /dev/null +++ b/tests/store-int16-1.ispc @@ -0,0 +1,16 @@ +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform int x[16]; + for (uniform int i = 0; i < 16; ++i) + 
x[i] = 0xffffffff; + unsigned int val = aFOO[programIndex]; + store_to_int16(x, 5, val); + unsigned int v = load_from_int16(x, 6); + RET[programIndex] = v; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 2+programIndex; + RET[programCount-1] = (unsigned int)0xffffffff; +} diff --git a/tests/store-int16.ispc b/tests/store-int16.ispc index 0659cdc4..77c8d902 100644 --- a/tests/store-int16.ispc +++ b/tests/store-int16.ispc @@ -6,11 +6,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { x[i] = 0xffffffff; unsigned int val = aFOO[programIndex]; store_to_int16(x, 5, val); - unsigned int v = load_from_int16(x, 6); + int v = load_from_int16(x, 6); RET[programIndex] = v; } export void result(uniform float RET[]) { RET[programIndex] = 2+programIndex; - RET[programCount-1] = 0xffff; + RET[programCount-1] = -1; } diff --git a/tests/store-int8-1.ispc b/tests/store-int8-1.ispc new file mode 100644 index 00000000..b07c64d0 --- /dev/null +++ b/tests/store-int8-1.ispc @@ -0,0 +1,16 @@ +export uniform int width() { return programCount; } + +export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { + uniform unsigned int x[8]; + for (uniform int i = 0; i < 8; ++i) + x[i] = 0xffffffff; + unsigned int val = aFOO[programIndex]; + store_to_uint8(x, 2, val); + unsigned int v = load_from_uint8(x, 1); + RET[programIndex] = v; +} + +export void result(uniform float RET[]) { + RET[programIndex] = programIndex; + RET[0] = (unsigned int)0xff; +} diff --git a/tests/store-int8.ispc b/tests/store-int8.ispc index 6a3602e8..db3d9c8f 100644 --- a/tests/store-int8.ispc +++ b/tests/store-int8.ispc @@ -6,11 +6,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { x[i] = 0xffffffff; unsigned int val = aFOO[programIndex]; store_to_int8(x, 2, val); - unsigned int v = load_from_int8(x, 1); + int v = load_from_int8(x, 1); RET[programIndex] = v; } export void result(uniform float RET[]) { 
RET[programIndex] = programIndex; - RET[0] = 0xff; + RET[0] = -1.; }