diff --git a/builtins.cpp b/builtins.cpp
index 1f1b5ca2..ffb05e6d 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -114,61 +114,39 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
     // pointers to uniform
     else if (t == LLVMTypes::Int8PointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt8 :
-                                                 AtomicType::UniformInt8, false);
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt8 :
+                                                       AtomicType::UniformInt8);
     else if (t == LLVMTypes::Int16PointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt16 :
-                                                 AtomicType::UniformInt16, false);
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt16 :
+                                                       AtomicType::UniformInt16);
     else if (t == LLVMTypes::Int32PointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt32 :
-                                                 AtomicType::UniformInt32, false);
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt32 :
+                                                       AtomicType::UniformInt32);
     else if (t == LLVMTypes::Int64PointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt64 :
-                                                 AtomicType::UniformInt64, false);
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt64 :
+                                                       AtomicType::UniformInt64);
     else if (t == LLVMTypes::FloatPointerType)
-        return new ReferenceType(AtomicType::UniformFloat, false);
+        return PointerType::GetUniform(AtomicType::UniformFloat);
     else if (t == LLVMTypes::DoublePointerType)
-        return new ReferenceType(AtomicType::UniformDouble, false);
+        return PointerType::GetUniform(AtomicType::UniformDouble);

     // pointers to varying
     else if (t == LLVMTypes::Int8VectorPointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt8 :
-                                                 AtomicType::VaryingInt8, false);
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt8 :
+                                                       AtomicType::VaryingInt8);
     else if (t == LLVMTypes::Int16VectorPointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt16 :
-                                                 AtomicType::VaryingInt16, false);
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt16 :
+                                                       AtomicType::VaryingInt16);
     else if (t == LLVMTypes::Int32VectorPointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt32 :
-                                                 AtomicType::VaryingInt32, false);
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt32 :
+                                                       AtomicType::VaryingInt32);
     else if (t == LLVMTypes::Int64VectorPointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt64 :
-                                                 AtomicType::VaryingInt64, false);
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt64 :
+                                                       AtomicType::VaryingInt64);
     else if (t == LLVMTypes::FloatVectorPointerType)
-        return new ReferenceType(AtomicType::VaryingFloat, false);
+        return PointerType::GetUniform(AtomicType::VaryingFloat);
     else if (t == LLVMTypes::DoubleVectorPointerType)
-        return new ReferenceType(AtomicType::VaryingDouble, false);
-
-    // arrays
-    else if (llvm::isa<const llvm::PointerType>(t)) {
-        const llvm::PointerType *pt = llvm::dyn_cast<const llvm::PointerType>(t);
-
-        // Is it a pointer to an unsized array of objects?  If so, then
-        // create the equivalent ispc type.  Note that it has to be a
-        // reference to an array, since ispc passes arrays to functions by
-        // reference.
-        const llvm::ArrayType *at =
-            llvm::dyn_cast<const llvm::ArrayType>(pt->getElementType());
-        if (at != NULL) {
-            const Type *eltType = lLLVMTypeToISPCType(at->getElementType(),
-                                                      intAsUnsigned);
-            if (eltType == NULL)
-                return NULL;
-            // FIXME: this needs to be fixed when arrays can have
-            // over 4G elements...
-            return new ReferenceType(new ArrayType(eltType, (int)at->getNumElements()),
-                                     false);
-        }
-    }
+        return PointerType::GetUniform(AtomicType::VaryingDouble);

     return NULL;
 }
@@ -184,6 +162,9 @@ lCreateSymbol(const std::string &name, const Type *returnType,

     FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);

+    Debug(noPos, "Created builtin symbol \"%s\" [%s]\n", name.c_str(),
+          funcType->GetString().c_str());
+
     Symbol *sym = new Symbol(name, noPos, funcType);
     sym->function = func;
     symbolTable->AddFunction(sym);
@@ -244,7 +225,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {

         // Iterate over the arguments and try to find their equivalent ispc
         // types.  Track if any of the arguments has an integer type.
-        bool anyIntArgs = false, anyReferenceArgs = false;
+        bool anyIntArgs = false;
         std::vector<const Type *> argTypes;
         for (unsigned int j = 0; j < ftype->getNumParams(); ++j) {
             const llvm::Type *llvmArgType = ftype->getParamType(j);
@@ -256,7 +237,6 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
             }
             anyIntArgs |=
                 (Type::Equal(type, lLLVMTypeToISPCType(llvmArgType, !intAsUnsigned)) == false);
-            anyReferenceArgs |= (dynamic_cast<const ReferenceType *>(type) != NULL);
             argTypes.push_back(type);
         }

@@ -264,19 +244,6 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
         // so that we get symbols for things with no integer types!
         if (i == 0 || anyIntArgs == true)
            lCreateSymbol(name, returnType, argTypes, ftype, func, symbolTable);
-
-        // If there are any reference types, also make a variant of the
-        // symbol that has them as const references.  This obviously
-        // doesn't make sense for many builtins, but we'll give the stdlib
-        // the option to call one if it needs one.
-        if (anyReferenceArgs == true) {
-            for (unsigned int j = 0; j < argTypes.size(); ++j) {
-                if (dynamic_cast<const ReferenceType *>(argTypes[j]) != NULL)
-                    argTypes[j] = argTypes[j]->GetAsConstType();
-                lCreateSymbol(name + "_refsconst", returnType, argTypes,
-                              ftype, func, symbolTable);
-            }
-        }
     }

     return true;
@@ -476,62 +443,10 @@ lSetInternalFunctions(llvm::Module *module) {
         "__packed_store_active",
         "__popcnt_int32",
         "__popcnt_int64",
-        "__prefetch_read_1_uniform_bool",
-        "__prefetch_read_1_uniform_double",
-        "__prefetch_read_1_uniform_float",
-        "__prefetch_read_1_uniform_int16",
-        "__prefetch_read_1_uniform_int32",
-        "__prefetch_read_1_uniform_int64",
-        "__prefetch_read_1_uniform_int8",
-        "__prefetch_read_1_varying_bool",
-        "__prefetch_read_1_varying_double",
-        "__prefetch_read_1_varying_float",
-        "__prefetch_read_1_varying_int16",
-        "__prefetch_read_1_varying_int32",
-        "__prefetch_read_1_varying_int64",
-        "__prefetch_read_1_varying_int8",
-        "__prefetch_read_2_uniform_bool",
-        "__prefetch_read_2_uniform_double",
-        "__prefetch_read_2_uniform_float",
-        "__prefetch_read_2_uniform_int16",
-        "__prefetch_read_2_uniform_int32",
-        "__prefetch_read_2_uniform_int64",
-        "__prefetch_read_2_uniform_int8",
-        "__prefetch_read_2_varying_bool",
-        "__prefetch_read_2_varying_double",
-        "__prefetch_read_2_varying_float",
-        "__prefetch_read_2_varying_int16",
-        "__prefetch_read_2_varying_int32",
-        "__prefetch_read_2_varying_int64",
-        "__prefetch_read_2_varying_int8",
-        "__prefetch_read_3_uniform_bool",
-        "__prefetch_read_3_uniform_double",
-        "__prefetch_read_3_uniform_float",
-        "__prefetch_read_3_uniform_int16",
-        "__prefetch_read_3_uniform_int32",
-        "__prefetch_read_3_uniform_int64",
-        "__prefetch_read_3_uniform_int8",
-        "__prefetch_read_3_varying_bool",
-        "__prefetch_read_3_varying_double",
-        "__prefetch_read_3_varying_float",
-        "__prefetch_read_3_varying_int16",
-        "__prefetch_read_3_varying_int32",
-        "__prefetch_read_3_varying_int64",
-        "__prefetch_read_3_varying_int8",
-        "__prefetch_read_nt_uniform_bool",
-        "__prefetch_read_nt_uniform_double",
-        "__prefetch_read_nt_uniform_float",
-        "__prefetch_read_nt_uniform_int16",
-        "__prefetch_read_nt_uniform_int32",
-        "__prefetch_read_nt_uniform_int64",
-        "__prefetch_read_nt_uniform_int8",
-        "__prefetch_read_nt_varying_bool",
-        "__prefetch_read_nt_varying_double",
-        "__prefetch_read_nt_varying_float",
-        "__prefetch_read_nt_varying_int16",
-        "__prefetch_read_nt_varying_int32",
-        "__prefetch_read_nt_varying_int64",
-        "__prefetch_read_nt_varying_int8",
+        "__prefetch_read_uniform_1",
+        "__prefetch_read_uniform_2",
+        "__prefetch_read_uniform_3",
+        "__prefetch_read_uniform_nt",
         "__rcp_uniform_float",
         "__rcp_varying_float",
         "__reduce_add_double",
@@ -747,7 +662,7 @@ void
 DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
              bool includeStdlibISPC) {
     // Add the definitions from the compiled builtins-c.c file
-    if (g->target.is32bit) {
+    if (g->target.is32Bit) {
         extern unsigned char builtins_bitcode_c_32[];
         extern int builtins_bitcode_c_32_length;
         AddBitcodeToModule(builtins_bitcode_c_32, builtins_bitcode_c_32_length,
diff --git a/builtins.m4 b/builtins.m4
index affe3853..243f0de1 100644
--- a/builtins.m4
+++ b/builtins.m4
@@ -822,40 +822,6 @@ define $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
 }
 ')

-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; prefetch definitions
-
-; prefetch has a new parameter in LLVM3.0, to distinguish between instruction
-; and data caches--the declaration is now:
-; declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality,
-;                             i32 %cachetype)  (cachetype 1 == data cache)
-; however, the version below seems to still work...
-
-declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality)
-
-define(`prefetch_read', `
-define void @__prefetch_read_1_$1($2 *) alwaysinline {
-  %ptr8 = bitcast $2 * %0 to i8 *
-  call void @llvm.prefetch(i8 * %ptr8, i32 0, i32 3)
-  ret void
-}
-define void @__prefetch_read_2_$1($2 *) alwaysinline {
-  %ptr8 = bitcast $2 * %0 to i8 *
-  call void @llvm.prefetch(i8 * %ptr8, i32 0, i32 2)
-  ret void
-}
-define void @__prefetch_read_3_$1($2 *) alwaysinline {
-  %ptr8 = bitcast $2 * %0 to i8 *
-  call void @llvm.prefetch(i8 * %ptr8, i32 0, i32 1)
-  ret void
-}
-define void @__prefetch_read_nt_$1($2 *) alwaysinline {
-  %ptr8 = bitcast $2 * %0 to i8 *
-  call void @llvm.prefetch(i8 * %ptr8, i32 0, i32 0)
-  ret void
-}
-')
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 define(`stdlib_core', `
@@ -916,15 +882,25 @@ declare void @__pseudo_masked_store_64(<$1 x i64> * nocapture, <$1 x i64>, <$1 x
 ; converts them to native gather functions or converts them to vector
 ; loads, if equivalent.
-declare <$1 x i8> @__pseudo_gather_8([$1 x i8 *], <$1 x i32>) nounwind readonly
-declare <$1 x i16> @__pseudo_gather_16([$1 x i8 *], <$1 x i32>) nounwind readonly
-declare <$1 x i32> @__pseudo_gather_32([$1 x i8 *], <$1 x i32>) nounwind readonly
-declare <$1 x i64> @__pseudo_gather_64([$1 x i8 *], <$1 x i32>) nounwind readonly
+declare <$1 x i8> @__pseudo_gather32_8(<$1 x i32>, <$1 x i32>) nounwind readonly
+declare <$1 x i16> @__pseudo_gather32_16(<$1 x i32>, <$1 x i32>) nounwind readonly
+declare <$1 x i32> @__pseudo_gather32_32(<$1 x i32>, <$1 x i32>) nounwind readonly
+declare <$1 x i64> @__pseudo_gather32_64(<$1 x i32>, <$1 x i32>) nounwind readonly

-declare <$1 x i8> @__pseudo_gather_base_offsets_8(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
-declare <$1 x i16> @__pseudo_gather_base_offsets_16(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
-declare <$1 x i32> @__pseudo_gather_base_offsets_32(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
-declare <$1 x i64> @__pseudo_gather_base_offsets_64(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
+declare <$1 x i8> @__pseudo_gather64_8(<$1 x i64>, <$1 x i32>) nounwind readonly
+declare <$1 x i16> @__pseudo_gather64_16(<$1 x i64>, <$1 x i32>) nounwind readonly
+declare <$1 x i32> @__pseudo_gather64_32(<$1 x i64>, <$1 x i32>) nounwind readonly
+declare <$1 x i64> @__pseudo_gather64_64(<$1 x i64>, <$1 x i32>) nounwind readonly
+
+declare <$1 x i8> @__pseudo_gather_base_offsets32_8(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
+declare <$1 x i16> @__pseudo_gather_base_offsets32_16(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
+declare <$1 x i32> @__pseudo_gather_base_offsets32_32(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
+declare <$1 x i64> @__pseudo_gather_base_offsets32_64(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
+
+declare <$1 x i8> @__pseudo_gather_base_offsets64_8(i8 *, <$1 x i64>, <$1 x i32>) nounwind readonly
+declare <$1 x i16> @__pseudo_gather_base_offsets64_16(i8 *, <$1 x i64>, <$1 x i32>) nounwind readonly
+declare <$1 x i32> @__pseudo_gather_base_offsets64_32(i8 *, <$1 x i64>, <$1 x i32>) nounwind readonly
+declare <$1 x i64> @__pseudo_gather_base_offsets64_64(i8 *, <$1 x i64>, <$1 x i32>) nounwind readonly

 ; Similarly to the pseudo-gathers defined above, we also declare undefined
 ; pseudo-scatter instructions with signatures:
 ;
@@ -949,19 +925,33 @@ declare <$1 x i64> @__pseudo_gather_base_offsets_64(i8 *, <$1 x i32>, <$1 x i32>
 ; And the GSImprovementsPass in turn converts these to actual native
 ; scatters or masked stores.

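; A minimal illustrative sketch under the same hypothetical 4-wide, 32-bit
; assumptions: a masked store through varying pointers is emitted as
;     call void @__pseudo_scatter32_32(<4 x i32> %ptrs, <4 x i32> %values, <4 x i32> %mask)
; and is later lowered either to a native scatter or to the per-lane stores
; that the gen_scatter definitions further below spell out.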
-declare void @__pseudo_scatter_8([$1 x i8 *], <$1 x i8>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_16([$1 x i8 *], <$1 x i16>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_32([$1 x i8 *], <$1 x i32>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_64([$1 x i8 *], <$1 x i64>, <$1 x i32>) nounwind
+declare void @__pseudo_scatter32_8(<$1 x i32>, <$1 x i8>, <$1 x i32>) nounwind
+declare void @__pseudo_scatter32_16(<$1 x i32>, <$1 x i16>, <$1 x i32>) nounwind
+declare void @__pseudo_scatter32_32(<$1 x i32>, <$1 x i32>, <$1 x i32>) nounwind
+declare void @__pseudo_scatter32_64(<$1 x i32>, <$1 x i64>, <$1 x i32>) nounwind

-declare void @__pseudo_scatter_base_offsets_8(i8 * nocapture, <$1 x i32>,
-                                              <$1 x i8>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_base_offsets_16(i8 * nocapture, <$1 x i32>,
-                                               <$1 x i16>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_base_offsets_32(i8 * nocapture, <$1 x i32>,
-                                               <$1 x i32>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_base_offsets_64(i8 * nocapture, <$1 x i32>,
-                                               <$1 x i64>, <$1 x i32>) nounwind
+declare void @__pseudo_scatter64_8(<$1 x i64>, <$1 x i8>, <$1 x i32>) nounwind
+declare void @__pseudo_scatter64_16(<$1 x i64>, <$1 x i16>, <$1 x i32>) nounwind
+declare void @__pseudo_scatter64_32(<$1 x i64>, <$1 x i32>, <$1 x i32>) nounwind
+declare void @__pseudo_scatter64_64(<$1 x i64>, <$1 x i64>, <$1 x i32>) nounwind
+
+declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <$1 x i32>,
+                                                <$1 x i8>, <$1 x i32>) nounwind
+declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <$1 x i32>,
+                                                 <$1 x i16>, <$1 x i32>) nounwind
+declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <$1 x i32>,
+                                                 <$1 x i32>, <$1 x i32>) nounwind
+declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <$1 x i32>,
+                                                 <$1 x i64>, <$1 x i32>) nounwind
+
+declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <$1 x i64>,
+                                                <$1 x i8>, <$1 x i32>) nounwind
+declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <$1 x i64>,
+                                                 <$1 x i16>, <$1 x i32>) nounwind
+declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <$1 x i64>,
+                                                 <$1 x i32>, <$1 x i32>) nounwind
+declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <$1 x i64>,
+                                                 <$1 x i64>, <$1 x i32>) nounwind

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; vector ops
@@ -1634,11 +1624,10 @@ define void
 ;; versions to be called from stdlib

 define void
-@__aos_to_soa4_float([0 x float] * noalias %base, i32 %offset,
+@__aos_to_soa4_float(float * noalias %pf, i32 %offset,
         <$1 x float> * noalias %out0, <$1 x float> * noalias %out1,
         <$1 x float> * noalias %out2, <$1 x float> * noalias %out3) nounwind alwaysinline {
-  %pf = bitcast [0 x float] * %base to float *
   %p = getelementptr float * %pf, i32 %offset
   %p0 = bitcast float * %p to <$1 x float> *
   %v0 = load <$1 x float> * %p0, align 4
@@ -1656,16 +1645,16 @@ define void

 define void
-@__aos_to_soa4_int32([0 x i32] * noalias %base, i32 %offset,
+@__aos_to_soa4_int32(i32 * noalias %base, i32 %offset,
         <$1 x i32> * noalias %out0, <$1 x i32> * noalias %out1,
         <$1 x i32> * noalias %out2, <$1 x i32> * noalias %out3) nounwind alwaysinline {
-  %fbase = bitcast [0 x i32] * %base to [0 x float] *
+  %fbase = bitcast i32 * %base to float *
   %fout0 = bitcast <$1 x i32> * %out0 to <$1 x float> *
   %fout1 = bitcast <$1 x i32> * %out1 to <$1 x float> *
   %fout2 = bitcast <$1 x i32> * %out2 to <$1 x float> *
   %fout3 = bitcast <$1 x i32> * %out3 to <$1 x float> *
-  call void @__aos_to_soa4_float([0 x float] * %fbase, i32 %offset,
+  call void @__aos_to_soa4_float(float * %fbase, i32 %offset,
         <$1 x float> * %fout0, <$1 x float> * %fout1,
         <$1 x float> * %fout2, <$1 x float> * %fout3)
   ret void
@@ -1674,9 +1663,8 @@ define void

 define void
 @__soa_to_aos4_float(<$1 x float> %v0, <$1 x float> %v1, <$1 x float> %v2,
-        <$1 x float> %v3, [0 x float] * noalias %base,
+        <$1 x float> %v3, float * noalias %pf,
         i32 %offset) nounwind alwaysinline {
-  %pf = bitcast [0 x float] * %base to float *
   %p = getelementptr float * %pf, i32 %offset
   %out0 = bitcast float * %p to <$1 x float> *
   %out1 = getelementptr <$1 x float> * %out0, i32 1
@@ -1691,25 +1679,24 @@ define void

 define void
 @__soa_to_aos4_int32(<$1 x i32> %v0, <$1 x i32> %v1, <$1 x i32> %v2,
-        <$1 x i32> %v3, [0 x i32] * noalias %base,
+        <$1 x i32> %v3, i32 * noalias %base,
         i32 %offset) nounwind alwaysinline {
   %fv0 = bitcast <$1 x i32> %v0 to <$1 x float>
   %fv1 = bitcast <$1 x i32> %v1 to <$1 x float>
   %fv2 = bitcast <$1 x i32> %v2 to <$1 x float>
   %fv3 = bitcast <$1 x i32> %v3 to <$1 x float>
-  %fbase = bitcast [0 x i32] * %base to [0 x float] *
+  %fbase = bitcast i32 * %base to float *
   call void @__soa_to_aos4_float(<$1 x float> %fv0, <$1 x float> %fv1,
-        <$1 x float> %fv2, <$1 x float> %fv3, [0 x float] * %fbase,
+        <$1 x float> %fv2, <$1 x float> %fv3, float * %fbase,
         i32 %offset)
   ret void
 }

 define void
-@__aos_to_soa3_float([0 x float] * noalias %base, i32 %offset,
+@__aos_to_soa3_float(float * noalias %pf, i32 %offset,
         <$1 x float> * %out0, <$1 x float> * %out1,
         <$1 x float> * %out2) nounwind alwaysinline {
-  %pf = bitcast [0 x float] * %base to float *
   %p = getelementptr float * %pf, i32 %offset
   %p0 = bitcast float * %p to <$1 x float> *
   %v0 = load <$1 x float> * %p0, align 4
@@ -1725,14 +1712,14 @@ define void

 define void
-@__aos_to_soa3_int32([0 x i32] * noalias %base, i32 %offset,
+@__aos_to_soa3_int32(i32 * noalias %base, i32 %offset,
         <$1 x i32> * noalias %out0, <$1 x i32> * noalias %out1,
         <$1 x i32> * noalias %out2) nounwind alwaysinline {
-  %fbase = bitcast [0 x i32] * %base to [0 x float] *
+  %fbase = bitcast i32 * %base to float *
   %fout0 = bitcast <$1 x i32> * %out0 to <$1 x float> *
   %fout1 = bitcast <$1 x i32> * %out1 to <$1 x float> *
   %fout2 = bitcast <$1 x i32> * %out2 to <$1 x float> *
-  call void @__aos_to_soa3_float([0 x float] * %fbase, i32 %offset,
+  call void @__aos_to_soa3_float(float * %fbase, i32 %offset,
         <$1 x float> * %fout0, <$1 x float> * %fout1, <$1 x float> * %fout2)
   ret void
 }
@@ -1740,8 +1727,7 @@ define void

 define void
 @__soa_to_aos3_float(<$1 x float> %v0, <$1 x float> %v1, <$1 x float> %v2,
-        [0 x float] * noalias %base, i32 %offset) nounwind alwaysinline {
-  %pf = bitcast [0 x float] * %base to float *
+        float * noalias %pf, i32 %offset) nounwind alwaysinline {
   %p = getelementptr float * %pf, i32 %offset
   %out0 = bitcast float * %p to <$1 x float> *
   %out1 = getelementptr <$1 x float> * %out0, i32 1
@@ -1755,13 +1741,13 @@ define void

 define void
 @__soa_to_aos3_int32(<$1 x i32> %v0, <$1 x i32> %v1, <$1 x i32> %v2,
-        [0 x i32] * noalias %base, i32 %offset) nounwind alwaysinline {
+        i32 * noalias %base, i32 %offset) nounwind alwaysinline {
   %fv0 = bitcast <$1 x i32> %v0 to <$1 x float>
   %fv1 = bitcast <$1 x i32> %v1 to <$1 x float>
   %fv2 = bitcast <$1 x i32> %v2 to <$1 x float>
-  %fbase = bitcast [0 x i32] * %base to [0 x float] *
+  %fbase = bitcast i32 * %base to float *
   call void @__soa_to_aos3_float(<$1 x float> %fv0, <$1 x float> %fv1,
-        <$1 x float> %fv2, [0 x float] * %fbase, i32 %offset)
+        <$1 x float> %fv2, float * %fbase, i32 %offset)
   ret void
 }

@@ -1769,21 +1755,34 @@ define void
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; prefetching

-prefetch_read(uniform_bool, i1)
-prefetch_read(uniform_int8, i8)
-prefetch_read(uniform_int16, i16)
-prefetch_read(uniform_int32, i32)
-prefetch_read(uniform_int64, i64)
-prefetch_read(uniform_float, float)
-prefetch_read(uniform_double, double)
+; prefetch has a new parameter in LLVM3.0, to distinguish between instruction
+; and data caches--the declaration is now:
+; declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality,
+;                             i32 %cachetype)  (cachetype 1 == data cache)
+; however, the version below seems to still work...
+
+declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality)
+
+define void @__prefetch_read_uniform_1(i8 *) alwaysinline {
+  call void @llvm.prefetch(i8 * %0, i32 0, i32 3)
+  ret void
+}
+
+define void @__prefetch_read_uniform_2(i8 *) alwaysinline {
+  call void @llvm.prefetch(i8 * %0, i32 0, i32 2)
+  ret void
+}
+
+define void @__prefetch_read_uniform_3(i8 *) alwaysinline {
+  call void @llvm.prefetch(i8 * %0, i32 0, i32 1)
+  ret void
+}
+
+define void @__prefetch_read_uniform_nt(i8 *) alwaysinline {
+  call void @llvm.prefetch(i8 * %0, i32 0, i32 0)
+  ret void
+}

-prefetch_read(varying_bool, <$1 x i32>)
-prefetch_read(varying_int8, <$1 x i8>)
-prefetch_read(varying_int16, <$1 x i16>)
-prefetch_read(varying_int32, <$1 x i32>)
-prefetch_read(varying_int64, <$1 x i64>)
-prefetch_read(varying_float, <$1 x float>)
-prefetch_read(varying_double, <$1 x double>)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; assert
@@ -2354,11 +2353,10 @@ define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>,

 define(`packed_load_and_store', `

-define i32 @__packed_load_active([0 x i32] *, i32 %start_offset, <$1 x i32> * %val_ptr,
+define i32 @__packed_load_active(i32 * %baseptr, i32 %start_offset, <$1 x i32> * %val_ptr,
                                  <$1 x i32> %full_mask) nounwind alwaysinline {
 entry:
   %mask = call i32 @__movmsk(<$1 x i32> %full_mask)
-  %baseptr = bitcast [0 x i32] * %0 to i32 *
   %startptr = getelementptr i32 * %baseptr, i32 %start_offset
   %mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask)
   br i1 %mask_known, label %known_mask, label %unknown_mask
@@ -2410,11 +2408,10 @@ done:
   ret i32 %nextoffset
 }

-define i32 @__packed_store_active([0 x i32] *, i32 %start_offset, <$1 x i32> %vals,
+define i32 @__packed_store_active(i32 * %baseptr, i32 %start_offset, <$1 x i32> %vals,
                                   <$1 x i32> %full_mask) nounwind alwaysinline {
 entry:
   %mask = call i32 @__movmsk(<$1 x i32> %full_mask)
-  %baseptr = bitcast [0 x i32] * %0 to i32 *
   %startptr = getelementptr i32 * %baseptr, i32 %start_offset
   %mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask)
   br i1 %mask_known, label %known_mask, label %unknown_mask
@@ -2686,8 +2683,8 @@ pl_done:
   ret i32 %nextoffset
 }

 define(`gen_gather', `
 ;; Define the utility function to do the gather operation for a single element
 ;; of the type
-define <$1 x $2> @__gather_elt_$2(i8 * %ptr, <$1 x i32> %offsets, <$1 x $2> %ret,
-                                  i32 %lane) nounwind readonly alwaysinline {
+define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, <$1 x $2> %ret,
+                                    i32 %lane) nounwind readonly alwaysinline {
   ; compute address for this one from the base
   %offset32 = extractelement <$1 x i32> %offsets, i32 %lane
   %ptroffset = getelementptr i8 * %ptr, i32 %offset32
@@ -2699,9 +2696,22 @@ define <$1 x $2> @__gather_elt_$2(i8 * %ptr, <$1 x i32> %offsets, <$1 x $2> %ret
   ret <$1 x $2> %updatedret
 }

+define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, <$1 x $2> %ret,
+                                    i32 %lane) nounwind readonly alwaysinline {
+  ; compute address for this one from the base
+  %offset32 = extractelement <$1 x i64> %offsets, i32 %lane
+  %ptroffset = getelementptr i8 * %ptr, i64 %offset32
+  %ptrcast = bitcast i8 * %ptroffset to $2 *

-define <$1 x $2> @__gather_base_offsets_$2(i8 * %ptr, <$1 x i32> %offsets,
-                                           <$1 x i32> %vecmask) nounwind readonly alwaysinline {
+  ; load value and insert into returned value
+  %val = load $2 *%ptrcast
+  %updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
+  ret <$1 x $2> %updatedret
+}
+
+
+define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets,
+                                             <$1 x i32> %vecmask) nounwind readonly alwaysinline {
   ; We can be clever and avoid the per-lane stuff for gathers if we are willing
   ; to require that the 0th element of the array being gathered from is always
   ; legal to read from (and we do indeed require that, given the benefits!)
@@ -2713,14 +2723,68 @@ define <$1 x $2> @__gather_base_offsets_$2(i8 * %ptr, <$1 x i32> %offsets,
                                <$1 x i32> %vecmask)
   %newOffsets = load <$1 x i32> * %offsetsPtr

-  %ret0 = call <$1 x $2> @__gather_elt_$2(i8 * %ptr, <$1 x i32> %newOffsets,
-                                          <$1 x $2> undef, i32 0)
+  %ret0 = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %newOffsets,
+                                            <$1 x $2> undef, i32 0)
 forloop(lane, 1, eval($1-1),
-    `patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt_$2(i8 * %ptr,
+    `patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr,
                                 <$1 x i32> %newOffsets, <$1 x $2> %retPREV, i32 LANE)
 ', `LANE', lane), `PREV', eval(lane-1))')
   ret <$1 x $2> %ret`'eval($1-1)
 }
+
+define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets,
+                                             <$1 x i32> %vecmask) nounwind readonly alwaysinline {
+  ; We can be clever and avoid the per-lane stuff for gathers if we are willing
+  ; to require that the 0th element of the array being gathered from is always
+  ; legal to read from (and we do indeed require that, given the benefits!)
+  ;
+  ; Set the offset to zero for lanes that are off
+  %offsetsPtr = alloca <$1 x i64>
+  store <$1 x i64> zeroinitializer, <$1 x i64> * %offsetsPtr
+  call void @__masked_store_blend_64(<$1 x i64> * %offsetsPtr, <$1 x i64> %offsets,
+                                     <$1 x i32> %vecmask)
+  %newOffsets = load <$1 x i64> * %offsetsPtr
+
+  %ret0 = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %newOffsets,
+                                            <$1 x $2> undef, i32 0)
+forloop(lane, 1, eval($1-1),
+    `patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr,
+                                <$1 x i64> %newOffsets, <$1 x $2> %retPREV, i32 LANE)
+', `LANE', lane), `PREV', eval(lane-1))')
+  ret <$1 x $2> %ret`'eval($1-1)
+}
+
+; fully general 32-bit gather, takes array of pointers encoded as vector of i32s
+define <$1 x $2> @__gather32_$2(<$1 x i32> %ptrs,
+                                <$1 x i32> %vecmask) nounwind readonly alwaysinline {
+  %ret_ptr = alloca <$1 x $2>
+  per_lane($1, <$1 x i32> %vecmask, `
+  %iptr_ID = extractelement <$1 x i32> %ptrs, i32 LANE
+  %ptr_ID = inttoptr i32 %iptr_ID to $2 *
+  %val_ID = load $2 * %ptr_ID
+  %store_ptr_ID = getelementptr <$1 x $2> * %ret_ptr, i32 0, i32 LANE
+  store $2 %val_ID, $2 * %store_ptr_ID
+  ')
+
+  %ret = load <$1 x $2> * %ret_ptr
+  ret <$1 x $2> %ret
+}
+
+; fully general 64-bit gather, takes array of pointers encoded as vector of i64s
+define <$1 x $2> @__gather64_$2(<$1 x i64> %ptrs,
+                                <$1 x i32> %vecmask) nounwind readonly alwaysinline {
+  %ret_ptr = alloca <$1 x $2>
+  per_lane($1, <$1 x i32> %vecmask, `
+  %iptr_ID = extractelement <$1 x i64> %ptrs, i32 LANE
+  %ptr_ID = inttoptr i64 %iptr_ID to $2 *
+  %val_ID = load $2 * %ptr_ID
+  %store_ptr_ID = getelementptr <$1 x $2> * %ret_ptr, i32 0, i32 LANE
+  store $2 %val_ID, $2 * %store_ptr_ID
+  ')
+
+  %ret = load <$1 x $2> * %ret_ptr
+  ret <$1 x $2> %ret
+}
 '
 )
@@ -2735,8 +2799,8 @@ define <$1 x $2> @__gather_base_offsets_$2(i8 * %ptr, <$1 x i32> %offsets,

 define(`gen_scatter', `
 ;; Define the function that describes the work to do to scatter a single
 ;; value
-define void @__scatter_elt_$2(i64 %ptr64, <$1 x i32> %offsets, <$1 x $2> %values,
-                              i32 %lane) nounwind alwaysinline {
+define void @__scatter_elt32_$2(i64 %ptr64, <$1 x i32> %offsets, <$1 x $2> %values,
+                                i32 %lane) nounwind alwaysinline {
   %offset32 = extractelement <$1 x i32> %offsets, i32 %lane
   %offset64 = zext i32 %offset32 to i64
   %ptrdelta = add i64 %ptr64, %offset64
@@ -2746,13 +2810,57 @@ define void @__scatter_elt_$2(i64 %ptr64, <$1 x i32> %offsets, <$1 x $2> %values
   ret void
 }

-define void @__scatter_base_offsets_$2(i8* %base, <$1 x i32> %offsets, <$1 x $2> %values,
-                                       <$1 x i32> %mask) nounwind alwaysinline {
+define void @__scatter_elt64_$2(i64 %ptr64, <$1 x i64> %offsets, <$1 x $2> %values,
+                                i32 %lane) nounwind alwaysinline {
+  %offset64 = extractelement <$1 x i64> %offsets, i32 %lane
+  %ptrdelta = add i64 %ptr64, %offset64
+  %ptr = inttoptr i64 %ptrdelta to $2 *
+  %storeval = extractelement <$1 x $2> %values, i32 %lane
+  store $2 %storeval, $2 * %ptr
+  ret void
+}
+
+define void @__scatter_base_offsets32_$2(i8* %base, <$1 x i32> %offsets, <$1 x $2> %values,
+                                         <$1 x i32> %mask) nounwind alwaysinline {
   ;; And use the `per_lane' macro to do all of the per-lane work for scatter...
   %ptr64 = ptrtoint i8 * %base to i64
   per_lane($1, <$1 x i32> %mask, `
-  call void @__scatter_elt_$2(i64 %ptr64, <$1 x i32> %offsets, <$1 x $2> %values, i32 LANE)')
+  call void @__scatter_elt32_$2(i64 %ptr64, <$1 x i32> %offsets, <$1 x $2> %values, i32 LANE)')
   ret void
 }
+
+define void @__scatter_base_offsets64_$2(i8* %base, <$1 x i64> %offsets, <$1 x $2> %values,
+                                         <$1 x i32> %mask) nounwind alwaysinline {
+  ;; And use the `per_lane' macro to do all of the per-lane work for scatter...
+  %ptr64 = ptrtoint i8 * %base to i64
+  per_lane($1, <$1 x i32> %mask, `
+  call void @__scatter_elt64_$2(i64 %ptr64, <$1 x i64> %offsets, <$1 x $2> %values, i32 LANE)')
+  ret void
+}
+
+; fully general 32-bit scatter, takes array of pointers encoded as vector of i32s
+define void @__scatter32_$2(<$1 x i32> %ptrs, <$1 x $2> %values,
+                            <$1 x i32> %mask) nounwind alwaysinline {
+  per_lane($1, <$1 x i32> %mask, `
+  %iptr_ID = extractelement <$1 x i32> %ptrs, i32 LANE
+  %ptr_ID = inttoptr i32 %iptr_ID to $2 *
+  %val_ID = extractelement <$1 x $2> %values, i32 LANE
+  store $2 %val_ID, $2 * %ptr_ID
+  ')
+  ret void
+}
+
+; fully general 64-bit scatter, takes array of pointers encoded as vector of i64s
+define void @__scatter64_$2(<$1 x i64> %ptrs, <$1 x $2> %values,
+                            <$1 x i32> %mask) nounwind alwaysinline {
+  per_lane($1, <$1 x i32> %mask, `
+  %iptr_ID = extractelement <$1 x i64> %ptrs, i32 LANE
+  %ptr_ID = inttoptr i64 %iptr_ID to $2 *
+  %val_ID = extractelement <$1 x $2> %values, i32 LANE
+  store $2 %val_ID, $2 * %ptr_ID
+  ')
+  ret void
+}
+
 '
 )
diff --git a/ctx.cpp b/ctx.cpp
index 0fd4a760..95840aab 100644
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -239,7 +239,7 @@ FunctionEmitContext::GetInternalMask() {
     if (VaryingCFDepth() == 0)
         return LLVMMaskAllOn;
     else
-        return LoadInst(internalMaskPointer, NULL, NULL, "load_mask");
+        return LoadInst(internalMaskPointer, "load_mask");
 }


@@ -374,9 +374,8 @@ FunctionEmitContext::EndIf() {
         // newMask = (oldMask & ~(breakLanes | continueLanes))
         llvm::Value *oldMask = GetInternalMask();
-        llvm::Value *breakLanes = LoadInst(breakLanesPtr, NULL, NULL,
-                                           "break_lanes");
-        llvm::Value *continueLanes = LoadInst(continueLanesPtr, NULL, NULL,
+        llvm::Value *breakLanes = LoadInst(breakLanesPtr, "break_lanes");
+        llvm::Value *continueLanes = LoadInst(continueLanesPtr,
                                               "continue_lanes");
         llvm::Value *breakOrContinueLanes =
             BinaryOperator(llvm::Instruction::Or, breakLanes, continueLanes,
@@ -453,7 +452,7 @@ FunctionEmitContext::restoreMaskGivenReturns(llvm::Value *oldMask) {
     // Restore the mask to the given old mask, but leave off any lanes that
     // executed a return statement.
     // newMask = (oldMask & ~returnedLanes)
-    llvm::Value *returnedLanes = LoadInst(returnedLanesPtr, NULL, NULL,
+    llvm::Value *returnedLanes = LoadInst(returnedLanesPtr,
                                           "returned_lanes");
     llvm::Value *notReturned = NotOperator(returnedLanes, "~returned_lanes");
     llvm::Value *newMask = BinaryOperator(llvm::Instruction::And,
@@ -486,7 +485,7 @@ FunctionEmitContext::Break(bool doCoherenceCheck) {
         // breakLanes = breakLanes | mask
         assert(breakLanesPtr != NULL);
         llvm::Value *mask = GetInternalMask();
-        llvm::Value *breakMask = LoadInst(breakLanesPtr, NULL, NULL,
+        llvm::Value *breakMask = LoadInst(breakLanesPtr,
                                           "break_mask");
         llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or,
                                               mask, breakMask, "mask|break_mask");
@@ -536,7 +535,7 @@ FunctionEmitContext::Continue(bool doCoherenceCheck) {
         assert(continueLanesPtr);
         llvm::Value *mask = GetInternalMask();
         llvm::Value *continueMask =
-            LoadInst(continueLanesPtr, NULL, NULL, "continue_mask");
+            LoadInst(continueLanesPtr, "continue_mask");
         llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or,
                                               mask, continueMask, "mask|continueMask");
         StoreInst(newMask, continueLanesPtr);
@@ -580,12 +579,11 @@ FunctionEmitContext::jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target) {
     // Check to see if (returned lanes | continued lanes | break lanes) is
     // equal to the value of mask at the start of the loop iteration.  If
     // so, everyone is done and we can jump to the given target
-    llvm::Value *returned = LoadInst(returnedLanesPtr, NULL, NULL,
+    llvm::Value *returned = LoadInst(returnedLanesPtr,
                                      "returned_lanes");
-    llvm::Value *continued = LoadInst(continueLanesPtr, NULL, NULL,
+    llvm::Value *continued = LoadInst(continueLanesPtr,
                                       "continue_lanes");
-    llvm::Value *breaked = LoadInst(breakLanesPtr, NULL, NULL,
-                                    "break_lanes");
+    llvm::Value *breaked = LoadInst(breakLanesPtr, "break_lanes");
     llvm::Value *returnedOrContinued = BinaryOperator(llvm::Instruction::Or,
                                                       returned, continued,
                                                       "returned|continued");
@@ -619,7 +617,7 @@ FunctionEmitContext::RestoreContinuedLanes() {
     // mask = mask & continueFlags
     llvm::Value *mask = GetInternalMask();
-    llvm::Value *continueMask = LoadInst(continueLanesPtr, NULL, NULL,
+    llvm::Value *continueMask = LoadInst(continueLanesPtr,
                                          "continue_mask");
     llvm::Value *orMask = BinaryOperator(llvm::Instruction::Or,
                                          mask, continueMask, "mask|continue_mask");
@@ -663,7 +661,8 @@ FunctionEmitContext::CurrentLanesReturned(Expr *expr, bool doCoherenceCheck) {
             // in the return value memory; this preserves the return
             // values from other lanes that may have executed return
             // statements previously.
-            StoreInst(retVal, returnValuePtr, GetInternalMask(), returnType);
+            StoreInst(retVal, returnValuePtr, GetInternalMask(),
+                      PointerType::GetUniform(returnType));
         }
     }
@@ -677,8 +676,8 @@ FunctionEmitContext::CurrentLanesReturned(Expr *expr, bool doCoherenceCheck) {
     else {
         // Otherwise we update the returnedLanes value by ANDing it with
        // the current lane mask.
-        llvm::Value *oldReturnedLanes = LoadInst(returnedLanesPtr, NULL, NULL,
-                                                 "old_returned_lanes");
+        llvm::Value *oldReturnedLanes =
+            LoadInst(returnedLanesPtr, "old_returned_lanes");
         llvm::Value *newReturnedLanes =
             BinaryOperator(llvm::Instruction::Or, oldReturnedLanes,
                            GetInternalMask(), "old_mask|returned_lanes");
@@ -733,7 +732,7 @@ FunctionEmitContext::LaneMask(llvm::Value *v) {
     // There should be one with signed int signature, one unsigned int.
     assert(mm && mm->size() == 2);
     llvm::Function *fmm = (*mm)[0]->function;
-    return CallInst(fmm, AtomicType::UniformInt32, v, "val_movmsk");
+    return CallInst(fmm, NULL, v, "val_movmsk");
 }
@@ -777,6 +776,11 @@ FunctionEmitContext::CreateBasicBlock(const char *name) {

 llvm::Value *
 FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) {
+    if (b == NULL) {
+        assert(m->errorCount > 0);
+        return NULL;
+    }
+
     LLVM_TYPE_CONST llvm::ArrayType *at =
         llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(b->getType());
     if (at) {
@@ -834,7 +838,7 @@ FunctionEmitContext::AddInstrumentationPoint(const char *note) {
     args.push_back(LaneMask(GetFullMask()));

     llvm::Function *finst = m->module->getFunction("ISPCInstrument");
-    CallInst(finst, AtomicType::Void, args, "");
+    CallInst(finst, NULL, args, "");
 }
@@ -952,17 +956,12 @@ lArrayVectorWidth(LLVM_TYPE_CONST llvm::Type *t) {
     if (arrayType == NULL)
         return 0;

-    // We shouldn't be seeing arrays of anything but vectors or pointers
-    // (for == and !=) being passed to things like
-    // FunctionEmitContext::BinaryOperator() as operands
+    // We shouldn't be seeing arrays of anything but vectors being passed
+    // to things like FunctionEmitContext::BinaryOperator() as operands.
     LLVM_TYPE_CONST llvm::VectorType *vectorElementType =
         llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(arrayType->getElementType());
-    LLVM_TYPE_CONST llvm::PointerType *pointerElementType =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(arrayType->getElementType());
     assert((vectorElementType != NULL &&
-            (int)vectorElementType->getNumElements() == g->target.vectorWidth) ||
-           (pointerElementType != NULL &&
-            (int)arrayType->getNumElements() == g->target.vectorWidth));
+            (int)vectorElementType->getNumElements() == g->target.vectorWidth));

     return (int)arrayType->getNumElements();
 }
@@ -1034,9 +1033,9 @@ FunctionEmitContext::NotOperator(llvm::Value *v, const char *name) {
 }


-// Given the llvm Type that represents an ispc VectorType (or array of
-// pointers), return an equally-shaped type with boolean elements.  (This
-// is the type that will be returned from CmpInst with ispc VectorTypes).
+// Given the llvm Type that represents an ispc VectorType, return an
+// equally-shaped type with boolean elements.  (This is the type that will
+// be returned from CmpInst with ispc VectorTypes).
 static LLVM_TYPE_CONST llvm::Type *
 lGetMatchingBoolVectorType(LLVM_TYPE_CONST llvm::Type *type) {
     LLVM_TYPE_CONST llvm::ArrayType *arrayType =
@@ -1045,19 +1044,12 @@ lGetMatchingBoolVectorType(LLVM_TYPE_CONST llvm::Type *type) {

     LLVM_TYPE_CONST llvm::VectorType *vectorElementType =
         llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(arrayType->getElementType());
-    if (vectorElementType != NULL) {
-        assert((int)vectorElementType->getNumElements() == g->target.vectorWidth);
-        LLVM_TYPE_CONST llvm::Type *base =
-            llvm::VectorType::get(LLVMTypes::BoolType, g->target.vectorWidth);
-        return llvm::ArrayType::get(base, arrayType->getNumElements());
-    }
-    else {
-        LLVM_TYPE_CONST llvm::PointerType *pointerElementType =
-            llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(arrayType->getElementType());
-        assert(pointerElementType != NULL);
-        assert((int)arrayType->getNumElements() == g->target.vectorWidth);
-        return llvm::VectorType::get(LLVMTypes::BoolType, g->target.vectorWidth);
-    }
+    assert(vectorElementType != NULL);
+    assert((int)vectorElementType->getNumElements() == g->target.vectorWidth);
+
+    LLVM_TYPE_CONST llvm::Type *base =
+        llvm::VectorType::get(LLVMTypes::BoolType, g->target.vectorWidth);
+    return llvm::ArrayType::get(base, arrayType->getNumElements());
 }
@@ -1096,7 +1088,7 @@ FunctionEmitContext::CmpInst(llvm::Instruction::OtherOps inst,

 llvm::Value *
-FunctionEmitContext::SmearScalar(llvm::Value *value, const char *name) {
+FunctionEmitContext::SmearUniform(llvm::Value *value, const char *name) {
     if (value == NULL) {
         assert(m->errorCount > 0);
         return NULL;
@@ -1104,12 +1096,17 @@ FunctionEmitContext::SmearUniform(llvm::Value *value, const char *name) {

     llvm::Value *ret = NULL;
     LLVM_TYPE_CONST llvm::Type *eltType = value->getType();
+
     LLVM_TYPE_CONST llvm::PointerType *pt =
         llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(eltType);
-    if (pt != NULL)
-        ret = llvm::UndefValue::get(llvm::ArrayType::get(eltType,
-                                                         g->target.vectorWidth));
+    if (pt != NULL) {
+        // Varying pointers are represented as vectors of i32/i64s
+        ret = llvm::UndefValue::get(LLVMTypes::VoidPointerVectorType);
+        value = PtrToIntInst(value);
+    }
     else
+        // All other varying types are represented as vectors of the
+        // underlying type.
         ret = llvm::UndefValue::get(llvm::VectorType::get(eltType,
                                                           g->target.vectorWidth));
@@ -1118,75 +1115,43 @@
                        llvm::Twine(i);
         ret = InsertInst(ret, value, i, n.str().c_str());
     }
+
     return ret;
 }


 llvm::Value *
-FunctionEmitContext::BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
+FunctionEmitContext::BitCastInst(llvm::Value *value,
+                                 LLVM_TYPE_CONST llvm::Type *type,
                                  const char *name) {
     if (value == NULL) {
         assert(m->errorCount > 0);
         return NULL;
     }

-    LLVM_TYPE_CONST llvm::Type *valType = value->getType();
-    LLVM_TYPE_CONST llvm::ArrayType *at =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(valType);
-    if (at && llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(at->getElementType())) {
-        // If we're bitcasting an array of pointers, we have a varying
-        // lvalue; apply the corresponding bitcast to each of the
-        // individual pointers and return the result array.
-        assert((int)at->getNumElements() == g->target.vectorWidth);
-
-        llvm::Value *ret =
-            llvm::UndefValue::get(llvm::ArrayType::get(type, g->target.vectorWidth));
-        for (int i = 0; i < g->target.vectorWidth; ++i) {
-            llvm::Value *elt = ExtractInst(value, i);
-            llvm::Value *bc = BitCastInst(elt, type, name);
-            ret = InsertInst(ret, bc, i);
-        }
-        return ret;
-    }
-    else {
-        llvm::Instruction *inst =
-            new llvm::BitCastInst(value, type, name ? name : "bitcast", bblock);
-        AddDebugPos(inst);
-        return inst;
-    }
+    llvm::Instruction *inst =
+        new llvm::BitCastInst(value, type, name ? name : "bitcast", bblock);
+    AddDebugPos(inst);
+    return inst;
 }


 llvm::Value *
-FunctionEmitContext::PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
-                                  const char *name) {
+FunctionEmitContext::PtrToIntInst(llvm::Value *value, const char *name) {
     if (value == NULL) {
         assert(m->errorCount > 0);
         return NULL;
     }

-    LLVM_TYPE_CONST llvm::Type *valType = value->getType();
-    LLVM_TYPE_CONST llvm::ArrayType *at =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(valType);
-    if (at && llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(at->getElementType())) {
-        // varying lvalue -> apply ptr to int to the individual pointers
-        assert((int)at->getNumElements() == g->target.vectorWidth);
+    if (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(value->getType()))
+        // no-op for varying pointers; they're already vectors of ints
+        return value;

-        llvm::Value *ret =
-            llvm::UndefValue::get(llvm::ArrayType::get(type, g->target.vectorWidth));
-        for (int i = 0; i < g->target.vectorWidth; ++i) {
-            llvm::Value *elt = ExtractInst(value, i);
-            llvm::Value *p2i = PtrToIntInst(elt, type, name);
-            ret = InsertInst(ret, p2i, i);
-        }
-        return ret;
-    }
-    else {
-        llvm::Instruction *inst =
-            new llvm::PtrToIntInst(value, type, name ? name : "ptr2int", bblock);
-        AddDebugPos(inst);
-        return inst;
-    }
+    LLVM_TYPE_CONST llvm::Type *type = LLVMTypes::PointerIntType;
+    llvm::Instruction *inst =
+        new llvm::PtrToIntInst(value, type, name ? name : "ptr2int", bblock);
+    AddDebugPos(inst);
+    return inst;
 }


@@ -1198,28 +1163,14 @@ FunctionEmitContext::IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type
         return NULL;
     }

-    LLVM_TYPE_CONST llvm::Type *valType = value->getType();
-    LLVM_TYPE_CONST llvm::ArrayType *at =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(valType);
-    if (at != NULL) {
-        // varying lvalue -> apply int to ptr to the individual pointers
-        assert((int)at->getNumElements() == g->target.vectorWidth);
+    if (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(value->getType()))
+        // no-op for varying pointers; they're already vectors of ints
+        return value;

-        llvm::Value *ret =
-            llvm::UndefValue::get(llvm::ArrayType::get(type, g->target.vectorWidth));
-        for (int i = 0; i < g->target.vectorWidth; ++i) {
-            llvm::Value *elt = ExtractInst(value, i);
-            llvm::Value *i2p = IntToPtrInst(elt, type, name);
-            ret = InsertInst(ret, i2p, i);
-        }
-        return ret;
-    }
-    else {
-        llvm::Instruction *inst =
-            new llvm::IntToPtrInst(value, type, name ? name : "int2ptr", bblock);
-        AddDebugPos(inst);
-        return inst;
-    }
+    llvm::Instruction *inst =
+        new llvm::IntToPtrInst(value, type, name ? name : "int2ptr", bblock);
+    AddDebugPos(inst);
+    return inst;
 }


@@ -1240,48 +1191,6 @@ FunctionEmitContext::TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *t
 }


-llvm::Value *
-FunctionEmitContext::ArrayToVectorInst(llvm::Value *array) {
-    if (array == NULL) {
-        assert(m->errorCount > 0);
-        return NULL;
-    }
-
-    LLVM_TYPE_CONST llvm::ArrayType *at =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(array->getType());
-    assert(at != NULL);
-
-    uint64_t count = at->getNumElements();
-    LLVM_TYPE_CONST llvm::VectorType *vt =
-        llvm::VectorType::get(at->getElementType(), count);
-    llvm::Value *vec = llvm::UndefValue::get(vt);
-    for (uint64_t i = 0; i < count; ++i)
-        vec = InsertInst(vec, ExtractInst(array, i), i);
-    return vec;
-}
-
-
-llvm::Value *
-FunctionEmitContext::VectorToArrayInst(llvm::Value *vector) {
-    if (vector == NULL) {
-        assert(m->errorCount > 0);
-        return NULL;
-    }
-
-    LLVM_TYPE_CONST llvm::VectorType *vt =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(vector->getType());
-    assert(vt != NULL);
-
-    uint64_t count = vt->getNumElements();
-    LLVM_TYPE_CONST llvm::ArrayType *at =
-        llvm::ArrayType::get(vt->getElementType(), count);
-    llvm::Value *array = llvm::UndefValue::get(at);
-    for (uint64_t i = 0; i < count; ++i)
-        array = InsertInst(array, ExtractInst(vector, i), i);
-    return array;
-}
-
-
 llvm::Instruction *
 FunctionEmitContext::CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
                               LLVM_TYPE_CONST llvm::Type *type, const char *name) {
@@ -1350,27 +1259,132 @@ FunctionEmitContext::ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *ty
 }


+/** Utility routine used by the GetElementPtrInst() methods; given a
+    pointer to some type (either uniform or varying) and an index (also
+    either uniform or varying), this returns the new pointer (varying if
+    appropriate) given by offsetting the base pointer by the index times
+    the size of the object that the pointer points to.
+ */
+llvm::Value *
+FunctionEmitContext::applyVaryingGEP(llvm::Value *basePtr, llvm::Value *index,
+                                     const Type *ptrType) {
+    // Find the scale factor for the index (i.e. the size of the object
+    // that the pointer(s) point(s) to).
+    const Type *scaleType = ptrType->GetBaseType();
+    llvm::Value *scale = g->target.SizeOf(scaleType->LLVMType(g->ctx));
+
+    bool indexIsVarying =
+        llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(index->getType());
+    llvm::Value *offset = NULL;
+    if (indexIsVarying == false) {
+        // Truncate or sign extend the index as appropriate to a 32 or
+        // 64-bit type.
+        if ((g->target.is32Bit || g->opt.force32BitAddressing) &&
+            index->getType() == LLVMTypes::Int64Type)
+            index = TruncInst(index, LLVMTypes::Int32Type, "trunc_index");
+        else if ((!g->target.is32Bit && !g->opt.force32BitAddressing) &&
+                 index->getType() == LLVMTypes::Int32Type)
+            index = SExtInst(index, LLVMTypes::Int64Type, "sext_index");
+
+        // do a scalar multiply to get the offset as index * scale and then
+        // smear the result out to be a vector; this is more efficient than
+        // first promoting both the scale and the index to vectors and then
+        // multiplying.
+        offset = BinaryOperator(llvm::Instruction::Mul, scale, index);
+        offset = SmearUniform(offset, "offset_smear");
+    }
+    else {
+        // Similarly, truncate or sign extend the index to be a 32 or 64
+        // bit vector type
+        if ((g->target.is32Bit || g->opt.force32BitAddressing) &&
+            index->getType() == LLVMTypes::Int64VectorType)
+            index = TruncInst(index, LLVMTypes::Int32VectorType, "trunc_index");
+        else if ((!g->target.is32Bit && !g->opt.force32BitAddressing) &&
+                 index->getType() == LLVMTypes::Int32VectorType)
+            index = SExtInst(index, LLVMTypes::Int64VectorType, "sext_index");
+
+        scale = SmearUniform(scale, "scale_smear");
+
+        // offset = index * scale
+        offset = BinaryOperator(llvm::Instruction::Mul, scale, index, "offset");
+    }
+
+    // For 64-bit targets, if we've been doing our offset calculations in
+    // 32 bits, we still have to convert to a 64-bit value before we
+    // actually add the offset to the pointer.
+    if (g->target.is32Bit == false && g->opt.force32BitAddressing == true)
+        offset = SExtInst(offset, LLVMTypes::Int64VectorType, "offset_to_64");
+
+    // Smear out the pointer to be varying; either the base pointer or the
+    // index must be varying for this method to be called.
+    bool baseIsUniform =
+        (llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(basePtr->getType()));
+    assert(baseIsUniform == false || indexIsVarying == true);
+    llvm::Value *varyingPtr = baseIsUniform ?
+        SmearUniform(basePtr, "ptr_smear") : basePtr;

+    // newPtr = ptr + offset
+    return BinaryOperator(llvm::Instruction::Add, varyingPtr, offset, "new_ptr");
+}
+
+
+llvm::Value *
+FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index,
+                                       const Type *ptrType, const char *name) {
+    if (basePtr == NULL || index == NULL) {
+        assert(m->errorCount > 0);
+        return NULL;
+    }
+
+    if (dynamic_cast<const ReferenceType *>(ptrType) != NULL)
+        ptrType = PointerType::GetUniform(ptrType->GetReferenceTarget());
+    assert(dynamic_cast<const PointerType *>(ptrType) != NULL);
+
+    bool indexIsVaryingType =
+        llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(index->getType());
+
+    if (indexIsVaryingType == false && ptrType->IsUniformType() == true) {
+        // The easy case: both the base pointer and the indices are
+        // uniform, so just emit the regular LLVM GEP instruction
+        llvm::Value *ind[1] = { index };
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
+        llvm::ArrayRef<llvm::Value *> arrayRef(&ind[0], &ind[1]);
+        llvm::Instruction *inst =
+            llvm::GetElementPtrInst::Create(basePtr, arrayRef,
+                                            name ? name : "gep", bblock);
+#else
+        llvm::Instruction *inst =
+            llvm::GetElementPtrInst::Create(basePtr, &ind[0], &ind[1],
                                            name ? name : "gep", bblock);
+#endif
+        AddDebugPos(inst);
+        return inst;
+    }
+    else
+        return applyVaryingGEP(basePtr, index, ptrType);
+}
+
+
 llvm::Value *
 FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0,
-                                       llvm::Value *index1, const char *name) {
+                                       llvm::Value *index1, const Type *ptrType,
+                                       const char *name) {
     if (basePtr == NULL || index0 == NULL || index1 == NULL) {
         assert(m->errorCount > 0);
         return NULL;
     }

-    // FIXME: do we need need to handle the case of the first index being
-    // varying?  It's not currently needed...
-    assert(!llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(index0->getType()));
+    if (dynamic_cast<const ReferenceType *>(ptrType) != NULL)
+        ptrType = PointerType::GetUniform(ptrType->GetReferenceTarget());
+    assert(dynamic_cast<const PointerType *>(ptrType) != NULL);

-    LLVM_TYPE_CONST llvm::Type *basePtrType = basePtr->getType();
-    LLVM_TYPE_CONST llvm::ArrayType *baseArrayType =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(basePtrType);
-    bool baseIsVaryingTypePointer = (baseArrayType != NULL) &&
-        llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(baseArrayType->getElementType());
-    bool indexIsVaryingType =
+    bool index0IsVaryingType =
+        llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(index0->getType());
+    bool index1IsVaryingType =
        llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(index1->getType());

-    if (!indexIsVaryingType && !baseIsVaryingTypePointer) {
+    if (index0IsVaryingType == false && index1IsVaryingType == false &&
+        ptrType->IsUniformType() == true) {
         // The easy case: both the base pointer and the indices are
         // uniform, so just emit the regular LLVM GEP instruction
         llvm::Value *indices[2] = { index0, index1 };
@@ -1388,148 +1402,183 @@ FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0
         return inst;
     }
     else {
-        // We have a varying pointer and/or indices; emit the appropriate
-        // GEP for each of the program instances
-        llvm::Value *lret = NULL;
-        for (int i = 0; i < g->target.vectorWidth; ++i) {
-            // Get the index, either using the same one if it's uniform or
-            // the one for this lane if it's varying
-            llvm::Value *indexElt;
-            if (indexIsVaryingType)
-                indexElt = ExtractInst(index1, i, "get_array_index");
-            else
-                indexElt = index1;
+        // Handle the first dimension with index0
+        llvm::Value *ptr0 = GetElementPtrInst(basePtr, index0, ptrType);

-            // Similarly figure out the appropriate base pointer
-            llvm::Value *aptr;
-            if (baseIsVaryingTypePointer)
-                aptr = ExtractInst(basePtr, i, "get_array_index");
-            else
-                aptr = basePtr;
+        // Now index into the second dimension with index1.  First figure
+        // out the type of ptr0.
+        const Type *baseType = ptrType->GetBaseType();
+        const SequentialType *st = dynamic_cast<const SequentialType *>(baseType);
+        assert(st != NULL);

-            // Do the GEP for this lane
-            llvm::Value *eltPtr = GetElementPtrInst(aptr, index0, indexElt, name);
+        bool ptr0IsUniform =
+            llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(ptr0->getType());
+        const Type *ptr0BaseType = st->GetElementType();
+        const Type *ptr0Type = ptr0IsUniform ?
+            PointerType::GetUniform(ptr0BaseType) :
+            PointerType::GetVarying(ptr0BaseType);

-            if (lret == NULL) {
-                // This is kind of a hack: use the type from the GEP to
-                // figure out the return type and the first time through,
-                // create an undef value of that type here
-                LLVM_TYPE_CONST llvm::PointerType *elementPtrType =
-                    llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(eltPtr->getType());
-                LLVM_TYPE_CONST llvm::Type *elementType =
-                    elementPtrType->getElementType();
-                lret = llvm::UndefValue::get(LLVMPointerVectorType(elementType));
-            }
-
-            // And insert the result of the GEP into the return value
-            lret = InsertInst(lret, eltPtr, i, "elt_ptr_store");
-        }
-        return lret;
+        return applyVaryingGEP(ptr0, index1, ptr0Type);
     }
 }


 llvm::Value *
-FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, int v0, int v1,
-                                       const char *name) {
-    return GetElementPtrInst(basePtr, LLVMInt32(v0), LLVMInt32(v1), name);
+FunctionEmitContext::AddElementOffset(llvm::Value *basePtr, int elementNum,
+                                      const Type *ptrType, const char *name) {
+    if (ptrType == NULL || ptrType->IsUniformType() ||
+        dynamic_cast<const ReferenceType *>(ptrType) != NULL) {
+        // If the pointer is uniform or we have a reference (which is a
+        // uniform pointer in the end), we can use the regular LLVM GEP.
+        llvm::Value *offsets[2] = { LLVMInt32(0), LLVMInt32(elementNum) };
+#if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn)
+        llvm::ArrayRef<llvm::Value *> arrayRef(&offsets[0], &offsets[2]);
+        return llvm::GetElementPtrInst::Create(basePtr, arrayRef,
+                                               name ? name : "struct_offset", bblock);
+#else
+        return llvm::GetElementPtrInst::Create(basePtr, &offsets[0], &offsets[2],
+                                               name ? name : "struct_offset", bblock);
+#endif
+
+    }
+
+    if (dynamic_cast<const ReferenceType *>(ptrType) != NULL)
+        ptrType = PointerType::GetUniform(ptrType->GetReferenceTarget());
+    assert(dynamic_cast<const PointerType *>(ptrType) != NULL);
+
+    // Otherwise do the math to find the offset and add it to the given
+    // varying pointers
+    const StructType *st =
+        dynamic_cast<const StructType *>(ptrType->GetBaseType());
+    llvm::Value *offset = NULL;
+    if (st != NULL)
+        // If the pointer is to a structure, Target::StructOffset() gives
+        // us the offset in bytes to the given element of the structure
+        offset = g->target.StructOffset(st->LLVMType(g->ctx), elementNum);
+    else {
+        // Otherwise we should have a vector here and the offset is given
+        // by the element number times the size of the element type of the
+        // vector.
+        const VectorType *vt =
+            dynamic_cast<const VectorType *>(ptrType->GetBaseType());
+        assert(vt != NULL);
+        llvm::Value *size =
+            g->target.SizeOf(vt->GetElementType()->LLVMType(g->ctx));
+        llvm::Value *scale = (g->target.is32Bit || g->opt.force32BitAddressing) ?
+            LLVMInt32(elementNum) : LLVMInt64(elementNum);
+        offset = BinaryOperator(llvm::Instruction::Mul, size, scale);
+    }
+
+    offset = SmearUniform(offset, "offset_smear");
+
+    if (g->target.is32Bit == false && g->opt.force32BitAddressing == true)
+        // If we're doing 32 bit addressing with a 64 bit target, although
+        // we did the math above in 32 bit, we need to go to 64 bit before
+        // we add the offset to the varying pointers.
+        offset = SExtInst(offset, LLVMTypes::Int64VectorType, "offset_to_64");
+
+    return BinaryOperator(llvm::Instruction::Add, basePtr, offset,
+                          "struct_ptr_offset");
 }


 llvm::Value *
-FunctionEmitContext::LoadInst(llvm::Value *lvalue, llvm::Value *mask,
-                              const Type *type, const char *name) {
-    if (lvalue == NULL) {
+FunctionEmitContext::LoadInst(llvm::Value *ptr, const char *name) {
+    if (ptr == NULL) {
         assert(m->errorCount > 0);
         return NULL;
     }

-    if (llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(lvalue->getType())) {
-        // If the lvalue is a straight up regular pointer, then just issue
+    LLVM_TYPE_CONST llvm::PointerType *pt =
+        llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(ptr->getType());
+    assert(pt != NULL);
+
+    // FIXME: it's not clear to me that we generate unaligned vector loads
+    // of varying stuff out of the front-end any more.  (Only by the
+    // optimization passes that lower gathers to vector loads, I think..)
+    // So remove this??
+    int align = 0;
+    if (llvm::isa<LLVM_TYPE_CONST llvm::VectorType>(pt->getElementType()))
+        align = 1;
+    llvm::Instruction *inst = new llvm::LoadInst(ptr, name ? name : "load",
+                                                 false /* not volatile */,
+                                                 align, bblock);
+    AddDebugPos(inst);
+    return inst;
+}
+
+
+llvm::Value *
+FunctionEmitContext::LoadInst(llvm::Value *ptr, llvm::Value *mask,
+                              const Type *ptrType, const char *name) {
+    if (ptr == NULL) {
+        assert(m->errorCount > 0);
+        return NULL;
+    }
+
+    assert(ptrType != NULL && mask != NULL);
+
+    if (dynamic_cast<const ReferenceType *>(ptrType) != NULL)
+        ptrType = PointerType::GetUniform(ptrType->GetReferenceTarget());
+
+    assert(dynamic_cast<const PointerType *>(ptrType) != NULL);
+
+    if (ptrType->IsUniformType()) {
+        // FIXME: same issue as above load inst regarding alignment...
+        //
+        // If the ptr is a straight up regular pointer, then just issue
+        // a regular load.  First figure out the alignment; in general we
         // can just assume the natural alignment (0 here), but for varying
         // atomic types, we need to make sure that the compiler emits
         // unaligned vector loads, so we specify a reduced alignment here.
         int align = 0;
-        const AtomicType *atomicType = dynamic_cast<const AtomicType *>(type);
+        const AtomicType *atomicType =
+            dynamic_cast<const AtomicType *>(ptrType->GetBaseType());
         if (atomicType != NULL && atomicType->IsVaryingType())
             // We actually just want to align to the vector element
             // alignment, but can't easily get that here, so just tell LLVM
             // it's totally unaligned.  (This shouldn't make any difference
             // vs the proper alignment in practice.)
             align = 1;
-        llvm::Instruction *inst = new llvm::LoadInst(lvalue, name ? name : "load",
+        llvm::Instruction *inst = new llvm::LoadInst(ptr, name ? name : "load",
                                                      false /* not volatile */,
                                                      align, bblock);
         AddDebugPos(inst);
         return inst;
     }
     else {
-        // Otherwise we should have a varying lvalue and it's time for a
-        // gather.  The "type" parameter only has to be non-NULL for the
-        // gather path here (we can't reliably figure out all of the type
-        // information we need from the LLVM::Type, so have to carry the
-        // ispc type in through this path..
-        assert(type != NULL && mask != NULL);
-        assert(llvm::isa<LLVM_TYPE_CONST llvm::ArrayType>(lvalue->getType()));
-        return gather(lvalue, mask, type, name);
+        // Otherwise we should have a varying ptr and it's time for a
+        // gather.
+        return gather(ptr, ptrType, mask, name);
     }
 }


 llvm::Value *
-FunctionEmitContext::gather(llvm::Value *lvalue, llvm::Value *mask,
-                            const Type *type, const char *name) {
+FunctionEmitContext::gather(llvm::Value *ptr, const Type *ptrType,
+                            llvm::Value *mask, const char *name) {
     // We should have a varying lvalue if we get here...
-    assert(llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(lvalue->getType()));
+    assert(ptrType->IsVaryingType() &&
+           ptr->getType() == LLVMTypes::VoidPointerVectorType);

-    LLVM_TYPE_CONST llvm::Type *retType = type->LLVMType(g->ctx);
+    const Type *returnType = ptrType->GetBaseType()->GetAsVaryingType();
+    LLVM_TYPE_CONST llvm::Type *llvmReturnType = returnType->LLVMType(g->ctx);
+
+    const CollectionType *collectionType =
+        dynamic_cast<const CollectionType *>(ptrType->GetBaseType());
+    if (collectionType != NULL) {
+        // For collections, recursively gather element wise to find the
+        // result.
+        llvm::Value *retValue = llvm::UndefValue::get(llvmReturnType);
+        for (int i = 0; i < collectionType->GetElementCount(); ++i) {
+            llvm::Value *eltPtr = AddElementOffset(ptr, i, ptrType);
+            const Type *eltPtrType =
+                PointerType::GetVarying(collectionType->GetElementType(i));
+            eltPtr = addVaryingOffsetsIfNeeded(eltPtr, eltPtrType);

-    const StructType *st = dynamic_cast<const StructType *>(type);
-    if (st) {
-        // If we're gathering structures, do an element-wise gather
-        // recursively.
-        llvm::Value *retValue = llvm::UndefValue::get(retType);
-        for (int i = 0; i < st->GetElementCount(); ++i) {
-            llvm::Value *eltPtrs = GetElementPtrInst(lvalue, 0, i);
             // This in turn will be another gather
-            llvm::Value *eltValues =
-                LoadInst(eltPtrs, mask, st->GetElementType(i), name);
-            retValue = InsertInst(retValue, eltValues, i, "set_value");
-        }
-        return retValue;
-    }
+            llvm::Value *eltValues = LoadInst(eltPtr, mask, eltPtrType, name);

-    const VectorType *vt = dynamic_cast<const VectorType *>(type);
-    if (vt) {
-        // Similarly, if it's a vector type, do a gather for each of the
-        // vector elements
-        llvm::Value *retValue = llvm::UndefValue::get(retType);
-        // FIXME: yuck.  Change lvalues to be pointers to arrays so that
-        // the GEP stuff in the loop below ends up computing pointers based
-        // on elements in the vectors rather than incorrectly advancing to
-        // the next vector...
-        LLVM_TYPE_CONST llvm::Type *eltType =
-            vt->GetBaseType()->GetAsUniformType()->LLVMType(g->ctx);
-        lvalue = BitCastInst(lvalue, llvm::PointerType::get(llvm::ArrayType::get(eltType, 0), 0));
-
-        for (int i = 0; i < vt->GetElementCount(); ++i) {
-            llvm::Value *eltPtrs = GetElementPtrInst(lvalue, 0, i);
-            llvm::Value *eltValues = LoadInst(eltPtrs, mask, vt->GetBaseType(),
-                                              name);
-            retValue = InsertInst(retValue, eltValues, i, "set_value");
-        }
-        return retValue;
-    }
-
-    const ArrayType *at = dynamic_cast<const ArrayType *>(type);
-    if (at) {
-        // Arrays are also handled recursively and element-wise
-        llvm::Value *retValue = llvm::UndefValue::get(retType);
-        for (int i = 0; i < at->GetElementCount(); ++i) {
-            llvm::Value *eltPtrs = GetElementPtrInst(lvalue, 0, i);
-            llvm::Value *eltValues = LoadInst(eltPtrs, mask,
-                                              at->GetElementType(), name);
             retValue = InsertInst(retValue, eltValues, i, "set_value");
         }
         return retValue;
@@ -1539,48 +1588,41 @@ FunctionEmitContext::gather(llvm::Value *lvalue, llvm::Value *mask,
     // can go and do the actual gather
     AddInstrumentationPoint("gather");

-    llvm::Function *gather = NULL;
     // Figure out which gather function to call based on the size of
     // the elements.
-    const PointerType *pt = dynamic_cast<const PointerType *>(type);
-    if (pt != NULL) {
-        if (g->target.is32bit)
-            gather = m->module->getFunction("__pseudo_gather_32");
-        else
-            gather = m->module->getFunction("__pseudo_gather_64");
-    }
-    else if (retType == LLVMTypes::DoubleVectorType ||
-             retType == LLVMTypes::Int64VectorType)
-        gather = m->module->getFunction("__pseudo_gather_64");
-    else if (retType == LLVMTypes::FloatVectorType ||
-             retType == LLVMTypes::Int32VectorType)
-        gather = m->module->getFunction("__pseudo_gather_32");
-    else if (retType == LLVMTypes::Int16VectorType)
-        gather = m->module->getFunction("__pseudo_gather_16");
+    const PointerType *pt = dynamic_cast<const PointerType *>(returnType);
+    const char *funcName = NULL;
+    if (pt != NULL)
+        funcName = g->target.is32Bit ? "__pseudo_gather32_32" :
+                                       "__pseudo_gather64_64";
+    else if (llvmReturnType == LLVMTypes::DoubleVectorType ||
+             llvmReturnType == LLVMTypes::Int64VectorType)
+        funcName = g->target.is32Bit ? "__pseudo_gather32_64" :
+                                       "__pseudo_gather64_64";
+    else if (llvmReturnType == LLVMTypes::FloatVectorType ||
+             llvmReturnType == LLVMTypes::Int32VectorType)
+        funcName = g->target.is32Bit ? "__pseudo_gather32_32" :
+                                       "__pseudo_gather64_32";
+    else if (llvmReturnType == LLVMTypes::Int16VectorType)
+        funcName = g->target.is32Bit ? "__pseudo_gather32_16" :
+                                       "__pseudo_gather64_16";
     else {
-        assert(retType == LLVMTypes::Int8VectorType);
-        gather = m->module->getFunction("__pseudo_gather_8");
+        assert(llvmReturnType == LLVMTypes::Int8VectorType);
+        funcName = g->target.is32Bit ? "__pseudo_gather32_8" :
+                                       "__pseudo_gather64_8";
     }
-    assert(gather != NULL);
-    lvalue = addVaryingOffsetsIfNeeded(lvalue, type);
+    llvm::Function *gatherFunc = m->module->getFunction(funcName);
+    assert(gatherFunc != NULL);

-    llvm::Value *voidlvalue = BitCastInst(lvalue, LLVMTypes::VoidPointerType);
-    llvm::Value *call = CallInst(gather, type, voidlvalue, mask, name);
+    llvm::Value *call = CallInst(gatherFunc, NULL, ptr, mask, name);

     // Add metadata about the source file location so that the
     // optimization passes can print useful performance warnings if we
     // can't optimize out this gather
     addGSMetadata(call, currentPos);

-    if (pt != NULL) {
-        LLVM_TYPE_CONST llvm::Type *ptrType =
-            pt->GetAsUniformType()->LLVMType(g->ctx);
-        return IntToPtrInst(VectorToArrayInst(call), ptrType,
-                            "gather_bitcast");
-    }
-    else
-        return BitCastInst(call, retType, "gather_bitcast");
+    return BitCastInst(call, llvmReturnType, "gather_bitcast");
 }
@@ -1617,8 +1659,9 @@ FunctionEmitContext::addGSMetadata(llvm::Value *v, SourcePos pos) {

 llvm::Value *
-FunctionEmitContext::AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType, const char *name,
-                                int align, bool atEntryBlock) {
+FunctionEmitContext::AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType,
+                                const char *name, int align,
+                                bool atEntryBlock) {
     llvm::AllocaInst *inst = NULL;
     if (atEntryBlock) {
         // We usually insert it right before the jump instruction at the
@@ -1657,91 +1700,89 @@ FunctionEmitContext::AllocaInst(LLVM_TYPE_CONST llvm::Type *llvmType, const char
     instance (that case is handled by scatters).
 */
 void
-FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
-                                 const Type *rvalueType,
-                                 llvm::Value *storeMask) {
-    if (rvalue == NULL || lvalue == NULL) {
+FunctionEmitContext::maskedStore(llvm::Value *value, llvm::Value *ptr,
+                                 const Type *ptrType, llvm::Value *mask) {
+    if (value == NULL || ptr == NULL) {
         assert(m->errorCount > 0);
         return;
     }

-    assert(llvm::isa<LLVM_TYPE_CONST llvm::PointerType>(lvalue->getType()));
-
+    assert(dynamic_cast<const PointerType *>(ptrType) != NULL);
+    assert(ptrType->IsUniformType());
+
+    const Type *valueType = ptrType->GetBaseType();
     const CollectionType *collectionType =
-        dynamic_cast<const CollectionType *>(rvalueType);
+        dynamic_cast<const CollectionType *>(valueType);
     if (collectionType != NULL) {
         // Assigning a structure / array / vector.
Handle each element // individually with what turns into a recursive call to // makedStore() for (int i = 0; i < collectionType->GetElementCount(); ++i) { - llvm::Value *eltValue = ExtractInst(rvalue, i, "rvalue_member"); - llvm::Value *eltLValue = GetElementPtrInst(lvalue, 0, i, - "struct_lvalue_ptr"); - StoreInst(eltValue, eltLValue, storeMask, - collectionType->GetElementType(i)); + llvm::Value *eltValue = ExtractInst(value, i, "value_member"); + llvm::Value *eltPtr = + AddElementOffset(ptr, i, ptrType, "struct_ptr_ptr"); + const Type *eltPtrType = + PointerType::GetUniform(collectionType->GetElementType(i)); + StoreInst(eltValue, eltPtr, mask, eltPtrType); } return; } - const PointerType *pt = dynamic_cast(rvalueType); - if (pt != NULL) { - if (g->target.is32bit) { - rvalue = PtrToIntInst(rvalue, LLVMTypes::Int32Type, "ptr2int"); - rvalueType = AtomicType::VaryingInt32; - } - else { - rvalue = PtrToIntInst(rvalue, LLVMTypes::Int64Type, "ptr2int"); - rvalueType = AtomicType::VaryingInt64; - } - rvalue = ArrayToVectorInst(rvalue); - } - - // We must have a regular atomic or enumerator type at this point - assert(dynamic_cast(rvalueType) != NULL || - dynamic_cast(rvalueType) != NULL); - rvalueType = rvalueType->GetAsNonConstType(); + // We must have a regular atomic, enumerator, or pointer type at this + // point. + assert(dynamic_cast(valueType) != NULL || + dynamic_cast(valueType) != NULL || + dynamic_cast(valueType) != NULL); + valueType = valueType->GetAsNonConstType(); llvm::Function *maskedStoreFunc = NULL; // Figure out if we need a 8, 16, 32 or 64-bit masked store. - if (rvalueType == AtomicType::VaryingDouble || - rvalueType == AtomicType::VaryingInt64 || - rvalueType == AtomicType::VaryingUInt64) { + if (dynamic_cast(valueType) != NULL) { + if (g->target.is32Bit) + maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_32"); + else + maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_64"); + } + else if (valueType == AtomicType::VaryingDouble || + valueType == AtomicType::VaryingInt64 || + valueType == AtomicType::VaryingUInt64) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_64"); - lvalue = BitCastInst(lvalue, LLVMTypes::Int64VectorPointerType, - "lvalue_to_int64vecptr"); - rvalue = BitCastInst(rvalue, LLVMTypes::Int64VectorType, - "rvalue_to_int64"); + ptr = BitCastInst(ptr, LLVMTypes::Int64VectorPointerType, + "ptr_to_int64vecptr"); + value = BitCastInst(value, LLVMTypes::Int64VectorType, + "value_to_int64"); } - else if (rvalueType == AtomicType::VaryingFloat || - rvalueType == AtomicType::VaryingBool || - rvalueType == AtomicType::VaryingInt32 || - rvalueType == AtomicType::VaryingUInt32 || - dynamic_cast(rvalueType) != NULL) { + else if (valueType == AtomicType::VaryingFloat || + valueType == AtomicType::VaryingBool || + valueType == AtomicType::VaryingInt32 || + valueType == AtomicType::VaryingUInt32 || + dynamic_cast(valueType) != NULL) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_32"); - lvalue = BitCastInst(lvalue, LLVMTypes::Int32VectorPointerType, - "lvalue_to_int32vecptr"); - if (rvalueType == AtomicType::VaryingFloat) - rvalue = BitCastInst(rvalue, LLVMTypes::Int32VectorType, - "rvalue_to_int32"); + ptr = BitCastInst(ptr, LLVMTypes::Int32VectorPointerType, + "ptr_to_int32vecptr"); + if (valueType == AtomicType::VaryingFloat) + value = BitCastInst(value, LLVMTypes::Int32VectorType, + "value_to_int32"); } - else if (rvalueType == AtomicType::VaryingInt16 || - rvalueType == AtomicType::VaryingUInt16) { + 
else if (valueType == AtomicType::VaryingInt16 || + valueType == AtomicType::VaryingUInt16) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_16"); - lvalue = BitCastInst(lvalue, LLVMTypes::Int16VectorPointerType, - "lvalue_to_int16vecptr"); + ptr = BitCastInst(ptr, LLVMTypes::Int16VectorPointerType, + "ptr_to_int16vecptr"); } - else if (rvalueType == AtomicType::VaryingInt8 || - rvalueType == AtomicType::VaryingUInt8) { + else if (valueType == AtomicType::VaryingInt8 || + valueType == AtomicType::VaryingUInt8) { maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_8"); - lvalue = BitCastInst(lvalue, LLVMTypes::Int8VectorPointerType, - "lvalue_to_int8vecptr"); + ptr = BitCastInst(ptr, LLVMTypes::Int8VectorPointerType, + "ptr_to_int8vecptr"); } + assert(maskedStoreFunc != NULL); std::vector args; - args.push_back(lvalue); - args.push_back(rvalue); - args.push_back(storeMask); - CallInst(maskedStoreFunc, AtomicType::Void, args); + args.push_back(ptr); + args.push_back(value); + args.push_back(mask); + CallInst(maskedStoreFunc, NULL, args); } @@ -1753,143 +1794,127 @@ FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue, program instance are on. If they're off, don't do anything. */ void -FunctionEmitContext::scatter(llvm::Value *rvalue, llvm::Value *lvalue, - llvm::Value *storeMask, const Type *rvalueType) { - assert(rvalueType->IsVaryingType()); - assert(llvm::isa(lvalue->getType())); +FunctionEmitContext::scatter(llvm::Value *value, llvm::Value *ptr, + const Type *ptrType, llvm::Value *mask) { + assert(dynamic_cast(ptrType) != NULL); + assert(ptrType->IsVaryingType()); - const StructType *structType = dynamic_cast(rvalueType); - if (structType) { - // Scatter the struct elements individually - for (int i = 0; i < structType->GetElementCount(); ++i) { - llvm::Value *lv = GetElementPtrInst(lvalue, 0, i); - llvm::Value *rv = ExtractInst(rvalue, i); - scatter(rv, lv, storeMask, structType->GetElementType(i)); - } - return; - } - - const VectorType *vt = dynamic_cast(rvalueType); - if (vt) { - // FIXME: yuck. Change lvalues to be pointers to arrays so that - // the GEP stuff in the loop below ends up computing pointers based - // on elements in the vectors rather than incorrectly advancing to - // the next vector... 
- LLVM_TYPE_CONST llvm::Type *eltType = - vt->GetBaseType()->GetAsUniformType()->LLVMType(g->ctx); - lvalue = BitCastInst(lvalue, llvm::PointerType::get(llvm::ArrayType::get(eltType, 0), 0)); - - for (int i = 0; i < vt->GetElementCount(); ++i) { - llvm::Value *lv = GetElementPtrInst(lvalue, 0, i); - llvm::Value *rv = ExtractInst(rvalue, i); - scatter(rv, lv, storeMask, vt->GetElementType()); - } - return; - } + const Type *valueType = ptrType->GetBaseType(); // I think this should be impossible - assert(dynamic_cast(rvalueType) == NULL); + assert(dynamic_cast(valueType) == NULL); - const PointerType *pt = dynamic_cast(rvalueType); + const CollectionType *collectionType = dynamic_cast(valueType); + if (collectionType != NULL) { + // Scatter the collection elements individually + for (int i = 0; i < collectionType->GetElementCount(); ++i) { + llvm::Value *eltPtr = AddElementOffset(ptr, i, ptrType); + llvm::Value *eltValue = ExtractInst(value, i); + const Type *eltPtrType = + PointerType::GetVarying(collectionType->GetElementType(i)); + eltPtr = addVaryingOffsetsIfNeeded(eltPtr, eltPtrType); + scatter(eltValue, eltPtr, eltPtrType, mask); + } + return; + } + + const PointerType *pt = dynamic_cast(valueType); // And everything should be a pointer or atomic from here on out... assert(pt != NULL || - dynamic_cast(rvalueType) != NULL); + dynamic_cast(valueType) != NULL); - llvm::Function *func = NULL; - LLVM_TYPE_CONST llvm::Type *type = rvalue->getType(); - if (pt != NULL) { - if (g->target.is32bit) { - rvalue = PtrToIntInst(rvalue, LLVMTypes::Int32Type); - rvalue = ArrayToVectorInst(rvalue); - func = m->module->getFunction("__pseudo_scatter_32"); - } - else { - rvalue = PtrToIntInst(rvalue, LLVMTypes::Int64Type); - rvalue = ArrayToVectorInst(rvalue); - func = m->module->getFunction("__pseudo_scatter_64"); - } - } + LLVM_TYPE_CONST llvm::Type *type = value->getType(); + const char *funcName = NULL; + if (pt != NULL) + funcName = g->target.is32Bit ? "__pseudo_scatter32_32" : + "__pseudo_scatter64_64"; else if (type == LLVMTypes::DoubleVectorType || - type == LLVMTypes::Int64VectorType) { - func = m->module->getFunction("__pseudo_scatter_64"); - rvalue = BitCastInst(rvalue, LLVMTypes::Int64VectorType, "rvalue2int"); + type == LLVMTypes::Int64VectorType) { + funcName = g->target.is32Bit ? "__pseudo_scatter32_64" : + "__pseudo_scatter64_64"; + value = BitCastInst(value, LLVMTypes::Int64VectorType, "value2int"); } else if (type == LLVMTypes::FloatVectorType || type == LLVMTypes::Int32VectorType) { - func = m->module->getFunction("__pseudo_scatter_32"); - rvalue = BitCastInst(rvalue, LLVMTypes::Int32VectorType, "rvalue2int"); + funcName = g->target.is32Bit ? "__pseudo_scatter32_32" : + "__pseudo_scatter64_32"; + value = BitCastInst(value, LLVMTypes::Int32VectorType, "value2int"); } else if (type == LLVMTypes::Int16VectorType) - func = m->module->getFunction("__pseudo_scatter_16"); + funcName = g->target.is32Bit ? "__pseudo_scatter32_16" : + "__pseudo_scatter64_16"; else if (type == LLVMTypes::Int8VectorType) - func = m->module->getFunction("__pseudo_scatter_8"); - assert(func != NULL); + funcName = g->target.is32Bit ? 
"__pseudo_scatter32_8" : + "__pseudo_scatter64_8"; + + llvm::Function *scatterFunc = m->module->getFunction(funcName); + assert(scatterFunc != NULL); AddInstrumentationPoint("scatter"); - lvalue = addVaryingOffsetsIfNeeded(lvalue, rvalueType); - - llvm::Value *voidlvalue = BitCastInst(lvalue, LLVMTypes::VoidPointerType); std::vector args; - args.push_back(voidlvalue); - args.push_back(rvalue); - args.push_back(storeMask); - llvm::Value *inst = CallInst(func, AtomicType::Void, args); + args.push_back(ptr); + args.push_back(value); + args.push_back(mask); + llvm::Value *inst = CallInst(scatterFunc, NULL, args); addGSMetadata(inst, currentPos); } void -FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue, - const char *name) { - if (rvalue == NULL || lvalue == NULL) { +FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr) { + if (value == NULL || ptr == NULL) { // may happen due to error elsewhere assert(m->errorCount > 0); return; } llvm::Instruction *inst; - if (llvm::isa(rvalue->getType())) - // Specify an unaligned store, since we don't know that the lvalue + if (llvm::isa(value->getType())) + // FIXME: same for load--do we still need/want this?? + // Specify an unaligned store, since we don't know that the ptr // will in fact be aligned to a vector width here. (Actually // should be aligned to the alignment of the vector elment type...) - inst = new llvm::StoreInst(rvalue, lvalue, false /* not volatile */, + inst = new llvm::StoreInst(value, ptr, false /* not volatile */, 1, bblock); else - inst = new llvm::StoreInst(rvalue, lvalue, bblock); + inst = new llvm::StoreInst(value, ptr, bblock); AddDebugPos(inst); } void -FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue, - llvm::Value *storeMask, const Type *rvalueType, - const char *name) { - if (rvalue == NULL || lvalue == NULL) { +FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr, + llvm::Value *mask, const Type *ptrType) { + if (value == NULL || ptr == NULL) { // may happen due to error elsewhere assert(m->errorCount > 0); return; } + if (dynamic_cast(ptrType) != NULL) + ptrType = PointerType::GetUniform(ptrType->GetReferenceTarget()); + // Figure out what kind of store we're doing here - if (rvalueType->IsUniformType()) { - // The easy case; a regular store, natural alignment is fine - llvm::Instruction *si = new llvm::StoreInst(rvalue, lvalue, bblock); - AddDebugPos(si); + if (ptrType->IsUniformType()) { + if (ptrType->GetBaseType()->IsUniformType()) + // the easy case + StoreInst(value, ptr); + else if (mask == LLVMMaskAllOn) + // Otherwise it is a masked store unless we can determine that the + // mask is all on... (Unclear if this check is actually useful.) + StoreInst(value, ptr); + else + maskedStore(value, ptr, ptrType, mask); } - else if (llvm::isa(lvalue->getType())) - // We have a varying lvalue (an array of pointers), so it's time to + else { + assert(ptrType->IsVaryingType()); + // We have a varying ptr (an array of pointers), so it's time to // scatter - scatter(rvalue, lvalue, storeMask, rvalueType); - else if (storeMask == LLVMMaskAllOn) { - // Otherwise it is a masked store unless we can determine that the - // mask is all on... 
- StoreInst(rvalue, lvalue, name); + scatter(value, ptr, ptrType, mask); } - else - maskedStore(rvalue, lvalue, rvalueType, storeMask); } @@ -1983,33 +2008,32 @@ FunctionEmitContext::SelectInst(llvm::Value *test, llvm::Value *val0, } -/* Given a value representing a function to be called or possibly-varying - pointer to a function to be called, figure out how many arguments the - function has. */ +/** Given a value representing a function to be called or possibly-varying + pointer to a function to be called, figure out how many arguments the + function has. */ static unsigned int -lCalleeArgCount(llvm::Value *callee) { +lCalleeArgCount(llvm::Value *callee, const FunctionType *funcType) { LLVM_TYPE_CONST llvm::FunctionType *ft = llvm::dyn_cast(callee->getType()); + if (ft == NULL) { LLVM_TYPE_CONST llvm::PointerType *pt = llvm::dyn_cast(callee->getType()); if (pt == NULL) { - // varying... - LLVM_TYPE_CONST llvm::ArrayType *at = - llvm::dyn_cast(callee->getType()); - assert(at != NULL); - pt = llvm::dyn_cast(at->getElementType()); - assert(pt != NULL); + // varying--in this case, it must be the version of the + // function that takes a mask + return funcType->GetNumParameters() + 1; } ft = llvm::dyn_cast(pt->getElementType()); } + assert(ft != NULL); return ft->getNumParams(); } llvm::Value * -FunctionEmitContext::CallInst(llvm::Value *func, const Type *returnType, +FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType, const std::vector &args, const char *name) { if (func == NULL) { @@ -2021,17 +2045,13 @@ FunctionEmitContext::CallInst(llvm::Value *func, const Type *returnType, // Most of the time, the mask is passed as the last argument. this // isn't the case for things like intrinsics, builtins, and extern "C" // functions from the application. Add the mask if it's needed. - unsigned int calleeArgCount = lCalleeArgCount(func); + unsigned int calleeArgCount = lCalleeArgCount(func, funcType); assert(argVals.size() + 1 == calleeArgCount || argVals.size() == calleeArgCount); if (argVals.size() + 1 == calleeArgCount) argVals.push_back(GetFullMask()); - LLVM_TYPE_CONST llvm::Type *funcType = func->getType(); - LLVM_TYPE_CONST llvm::ArrayType *funcArrayType = - llvm::dyn_cast(funcType); - - if (funcArrayType == NULL) { + if (llvm::isa(func->getType()) == false) { // Regular 'uniform' function call--just one function or function // pointer, so just emit the IR directly. #if defined(LLVM_3_0) || defined(LLVM_3_0svn) || defined(LLVM_3_1svn) @@ -2047,10 +2067,10 @@ FunctionEmitContext::CallInst(llvm::Value *func, const Type *returnType, } else { // Emit the code for a varying function call, where we have an - // array of function pointers, one for each program instance. The + // vector of function pointers, one for each program instance. The // basic strategy is that we go through the function pointers, and // for the executing program instances, for each unique function - // pointer that's in the array, call that function with a mask + // pointer that's in the vector, call that function with a mask // equal to the set of active program instances that also have that // function pointer. When all unique function pointers have been // called, we're done. 
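
The strategy the comment above describes can be modeled in scalar C++ like this (an illustrative sketch; WIDTH, FuncPtr, and varyingCall are invented names, and the real implementation operates on IR values and basic blocks):

    #include <cstdint>
    static const int WIDTH = 8;                    // gang size stand-in
    typedef void (*FuncPtr)(uint64_t callMask);    // callee also takes the mask

    static void varyingCall(FuncPtr fptr[WIDTH], uint64_t activeMask) {
        uint64_t pending = activeMask;     // lanes whose call hasn't run yet
        while (pending != 0) {
            // __count_trailing_zeros: index of the first pending lane
            int firstLane = __builtin_ctzll(pending);
            FuncPtr f = fptr[firstLane];
            // The call mask is every pending lane with this same pointer
            uint64_t callMask = 0;
            for (int i = 0; i < WIDTH; ++i)
                if ((pending & (1ull << i)) != 0 && fptr[i] == f)
                    callMask |= (1ull << i);
            f(callMask);                   // one call covers all those lanes
            pending &= ~callMask;          // and they're done
        }
    }

(__builtin_ctzll is the GCC/Clang intrinsic standing in for the __count_trailing_zeros builtin used above.)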
@@ -2059,20 +2079,17 @@ FunctionEmitContext::CallInst(llvm::Value *func, const Type *returnType, llvm::BasicBlock *bbCall = CreateBasicBlock("varying_funcall_call"); llvm::BasicBlock *bbDone = CreateBasicBlock("varying_funcall_done"); + // Get the current mask value so we can restore it later llvm::Value *origMask = GetInternalMask(); - // First allocate memory to accumulate the various lanes' return - // values... + // First allocate memory to accumulate the various program + // instances' return values... + const Type *returnType = funcType->GetReturnType(); LLVM_TYPE_CONST llvm::Type *llvmReturnType = returnType->LLVMType(g->ctx); llvm::Value *resultPtr = NULL; if (llvmReturnType->isVoidTy() == false) resultPtr = AllocaInst(llvmReturnType); - // Store the function pointers into an array so that we can index - // into them.. - llvm::Value *funcPtrArray = AllocaInst(funcType); - StoreInst(func, funcPtrArray); - // The memory pointed to by maskPointer tracks the set of program // instances for which we still need to call the function they are // pointing to. It starts out initialized with the mask of @@ -2087,7 +2104,7 @@ FunctionEmitContext::CallInst(llvm::Value *func, const Type *returnType, // bbTest: are any lanes of the mask still on? If so, jump to // bbCall SetCurrentBasicBlock(bbTest); { - llvm::Value *maskLoad = LoadInst(maskPtr, NULL, NULL); + llvm::Value *maskLoad = LoadInst(maskPtr); llvm::Value *any = Any(maskLoad); BranchInst(bbCall, bbDone, any); } @@ -2097,39 +2114,27 @@ FunctionEmitContext::CallInst(llvm::Value *func, const Type *returnType, SetCurrentBasicBlock(bbCall); { // Figure out the first lane that still needs its function // pointer to be called. - llvm::Value *currentMask = LoadInst(maskPtr, NULL, NULL); + llvm::Value *currentMask = LoadInst(maskPtr); llvm::Function *cttz = m->module->getFunction("__count_trailing_zeros"); assert(cttz != NULL); - llvm::Value *firstLane = CallInst(cttz, AtomicType::UniformInt32, - LaneMask(currentMask), "first_lane"); + llvm::Value *firstLane = CallInst(cttz, NULL, LaneMask(currentMask), + "first_lane"); // Get the pointer to the function we're going to call this time through: - // ftpr = funcPtrArray[firstLane] - llvm::Value *fpOffset = - GetElementPtrInst(funcPtrArray, LLVMInt32(0), firstLane, - "func_offset_ptr"); - llvm::Value *fptr = LoadInst(fpOffset, NULL, NULL); + // ftpr = func[firstLane] + llvm::Value *fptr = + llvm::ExtractElementInst::Create(func, firstLane, + "extract_fptr", bblock); // Smear it out into an array of function pointers - llvm::Value *fptrSmear = SmearScalar(fptr, "func_ptr"); - - // Now convert the smeared array of function pointers and the - // given array of function pointers to vectors of int32s or - // int64s, where the pointer has been cast to an int of the - // appropraite size for the compilation target. - LLVM_TYPE_CONST llvm::Type *ptrIntType = g->target.is32bit ? - LLVMTypes::Int32Type : LLVMTypes::Int64Type; - llvm::Value *fpSmearAsVec = - ArrayToVectorInst(PtrToIntInst(fptrSmear, ptrIntType)); - llvm::Value *fpOrigAsVec = - ArrayToVectorInst(PtrToIntInst(func, ptrIntType)); + llvm::Value *fptrSmear = SmearUniform(fptr, "func_ptr"); // fpOverlap = (fpSmearAsVec == fpOrigAsVec). This gives us a // mask for the set of program instances that have the same // value for their function pointer. 
llvm::Value *fpOverlap = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, - fpSmearAsVec, fpOrigAsVec); + fptrSmear, func); fpOverlap = I1VecToBoolVec(fpOverlap); // Figure out the mask to use when calling the function @@ -2144,14 +2149,23 @@ FunctionEmitContext::CallInst(llvm::Value *func, const Type *returnType, // Set the mask SetInternalMask(callMask); + // bitcast the i32/64 function pointer to the actual function + // pointer type (the variant that includes a mask). + LLVM_TYPE_CONST llvm::Type *llvmFuncType = + funcType->LLVMFunctionType(g->ctx, true); + LLVM_TYPE_CONST llvm::Type *llvmFPtrType = + llvm::PointerType::get(llvmFuncType, 0); + llvm::Value *fptrCast = IntToPtrInst(fptr, llvmFPtrType); + // Call the function: callResult = call ftpr(args, args, call mask) - llvm::Value *callResult = CallInst(fptr, returnType, args, name); + llvm::Value *callResult = CallInst(fptrCast, funcType, args, name); // Now, do a masked store into the memory allocated to // accumulate the result using the call mask. if (callResult != NULL) { assert(resultPtr != NULL); - StoreInst(callResult, resultPtr, callMask, returnType); + StoreInst(callResult, resultPtr, callMask, + PointerType::GetUniform(returnType)); } else assert(resultPtr == NULL); @@ -2175,28 +2189,28 @@ FunctionEmitContext::CallInst(llvm::Value *func, const Type *returnType, // accumulated in the result memory. SetCurrentBasicBlock(bbDone); SetInternalMask(origMask); - return LoadInst(resultPtr, NULL, NULL); + return resultPtr ? LoadInst(resultPtr) : NULL; } } llvm::Value * -FunctionEmitContext::CallInst(llvm::Value *func, const Type *returnType, +FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType, llvm::Value *arg, const char *name) { std::vector args; args.push_back(arg); - return CallInst(func, returnType, args, name); + return CallInst(func, funcType, args, name); } llvm::Value * -FunctionEmitContext::CallInst(llvm::Value *func, const Type *returnType, +FunctionEmitContext::CallInst(llvm::Value *func, const FunctionType *funcType, llvm::Value *arg0, llvm::Value *arg1, const char *name) { std::vector args; args.push_back(arg0); args.push_back(arg1); - return CallInst(func, returnType, args, name); + return CallInst(func, funcType, args, name); } @@ -2210,8 +2224,7 @@ FunctionEmitContext::ReturnInst() { if (returnValuePtr != NULL) { // We have value(s) to return; load them from their storage // location - llvm::Value *retVal = LoadInst(returnValuePtr, NULL, NULL, - "return_value"); + llvm::Value *retVal = LoadInst(returnValuePtr, "return_value"); rinst = llvm::ReturnInst::Create(*g->ctx, retVal, bblock); } else { @@ -2249,10 +2262,17 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee, llvm::Function *falloc = m->module->getFunction("ISPCAlloc"); assert(falloc != NULL); + llvm::Value *structSize = g->target.SizeOf(argStructType); + if (structSize->getType() != LLVMTypes::Int64Type) + // ISPCAlloc expects the size as an uint64_t, but on 32-bit + // targets, SizeOf returns a 32-bit value + structSize = ZExtInst(structSize, LLVMTypes::Int64Type, + "struct_size_to_64"); int align = 4 * RoundUpPow2(g->target.nativeVectorWidth); + std::vector allocArgs; allocArgs.push_back(launchGroupHandlePtr); - allocArgs.push_back(g->target.SizeOf(argStructType)); + allocArgs.push_back(structSize); allocArgs.push_back(LLVMInt32(align)); llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr"); llvm::Value *argmem = BitCastInst(voidmem, pt); @@ -2260,15 +2280,15 @@ 
FunctionEmitContext::LaunchInst(llvm::Value *callee,
     // Copy the values of the parameters into the appropriate place in
     // the argument block
     for (unsigned int i = 0; i < argVals.size(); ++i) {
-        llvm::Value *ptr = GetElementPtrInst(argmem, 0, i, "funarg");
+        llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg");
         // don't need to do masked store here, I think
         StoreInst(argVals[i], ptr);
     }
 
     // copy in the mask
     llvm::Value *mask = GetFullMask();
-    llvm::Value *ptr = GetElementPtrInst(argmem, 0, argVals.size(),
-                                         "funarg_mask");
+    llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL,
+                                        "funarg_mask");
     StoreInst(mask, ptr);
 
     // And emit the call to the user-supplied task launch function, passing
@@ -2282,13 +2302,13 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
     args.push_back(fptr);
     args.push_back(voidmem);
     args.push_back(launchCount);
-    return CallInst(flaunch, AtomicType::Void, args, "");
+    return CallInst(flaunch, NULL, args, "");
 }
 
 
 void
 FunctionEmitContext::SyncInst() {
-    llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr, NULL, NULL);
+    llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr);
     llvm::Value *nullPtrValue = llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
     llvm::Value *nonNull = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE,
@@ -2301,7 +2321,7 @@ FunctionEmitContext::SyncInst() {
     llvm::Function *fsync = m->module->getFunction("ISPCSync");
     if (fsync == NULL)
         FATAL("Couldn't find ISPCSync declaration?!");
-    CallInst(fsync, AtomicType::Void, launchGroupHandle, "");
+    CallInst(fsync, NULL, launchGroupHandle, "");
     BranchInst(bPostSync);
 
     SetCurrentBasicBlock(bPostSync);
@@ -2309,43 +2329,46 @@ FunctionEmitContext::SyncInst() {
 
 /** When we're gathering from or scattering to a varying atomic type, we need
-    to add an appropraite toffset to the final address for each lane right
+    to add an appropriate offset to the final address for each lane right
     before we use it.  Given a varying pointer we're about to use and its
     type, this function determines whether these offsets are needed and
     returns an updated pointer that incorporates these offsets if needed.
  */
 llvm::Value *
-FunctionEmitContext::addVaryingOffsetsIfNeeded(llvm::Value *ptr, const Type *type) {
-    // We should only have varying pointers here, which are represented as
-    // arrays of pointers in ispc.
-    LLVM_TYPE_CONST llvm::ArrayType *at =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(ptr->getType());
-    assert(at != NULL);
-    LLVM_TYPE_CONST llvm::PointerType *pt =
-        llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(at->getElementType());
-    assert(pt != NULL);
+FunctionEmitContext::addVaryingOffsetsIfNeeded(llvm::Value *ptr,
+                                               const Type *ptrType) {
+    // This should only be called for varying pointers
+    const PointerType *pt = dynamic_cast<const PointerType *>(ptrType);
+    assert(pt && pt->IsVaryingType());
 
-    // If we have pointers to vector types, e.g. [8 x <8 x float> *], then
-    // the data we're gathering from/scattering to is varying in memory.
-    // If we have pointers to scalar types, e.g. [8 x float *], then the
-    // data is uniform in memory and doesn't need any additional offsets.
-    if (pt->getElementType()->isIntegerTy() ||
-        pt->getElementType()->isFloatingPointTy() ||
-        pt->getElementType()->isPointerTy())
+    const Type *baseType = ptrType->GetBaseType();
+    assert(dynamic_cast<const AtomicType *>(baseType) != NULL ||
+           dynamic_cast<const EnumType *>(baseType) != NULL ||
+           dynamic_cast<const PointerType *>(baseType));
+    if (baseType->IsUniformType())
         return ptr;
+
+    // Find the size of a uniform element of the varying type
+    LLVM_TYPE_CONST llvm::Type *llvmBaseUniformType =
+        baseType->GetAsUniformType()->LLVMType(g->ctx);
+    llvm::Value *unifSize = g->target.SizeOf(llvmBaseUniformType);
+    unifSize = SmearUniform(unifSize);
 
-    llvm::Value *varyingOffsets = llvm::UndefValue::get(LLVMTypes::Int32VectorType);
-    for (int i = 0; i < g->target.vectorWidth; ++i)
-        varyingOffsets = InsertInst(varyingOffsets, LLVMInt32(i), i,
-                                    "varying_delta");
+    // Compute offset = <0, 1, .. > * unifSize
+    llvm::Value *varyingOffsets = llvm::UndefValue::get(unifSize->getType());
+    for (int i = 0; i < g->target.vectorWidth; ++i) {
+        llvm::Value *iValue = (g->target.is32Bit || g->opt.force32BitAddressing) ?
+            LLVMInt32(i) : LLVMInt64(i);
+        varyingOffsets = InsertInst(varyingOffsets, iValue, i, "varying_delta");
+    }
+    llvm::Value *offset = BinaryOperator(llvm::Instruction::Mul, unifSize,
+                                         varyingOffsets);
+
+    if (g->opt.force32BitAddressing == true && g->target.is32Bit == false)
+        // On 64-bit targets where we're doing 32-bit addressing
+        // calculations, we need to convert to an i64 vector before adding
+        // to the pointer
+        offset = SExtInst(offset, LLVMTypes::Int64VectorType, "offset_to_64");
 
-    // Cast the pointer type to the corresponding uniform type--e.g. cast
-    // <8 x float> * to float *s.
-    LLVM_TYPE_CONST llvm::Type *unifType = type->GetAsUniformType()->LLVMType(g->ctx);
-    LLVM_TYPE_CONST llvm::PointerType *ptrCastType =
-        llvm::PointerType::get(llvm::ArrayType::get(unifType, 0), 0);
-    ptr = BitCastInst(ptr, ptrCastType, "ptr2unif");
-
-    // And now we can do the per-lane offsets...
-    return GetElementPtrInst(ptr, LLVMInt32(0), varyingOffsets);
+    return BinaryOperator(llvm::Instruction::Add, ptr, offset);
 }
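
A scalar model of the offsets addVaryingOffsetsIfNeeded() adds (illustrative; the names are invented): when each lane's pointer addresses a whole varying object in memory, lane i really wants the i'th scalar element of it, so the pointers get per-lane deltas of i * sizeof(uniform element):

    #include <cstddef>
    #include <cstdint>
    static const int WIDTH = 8;   // gang size stand-in
    static void addVaryingOffsets(uint64_t ptr[WIDTH], size_t uniformEltSize) {
        // offset = <0, 1, ...> * sizeof(element), as in the code above
        for (int lane = 0; lane < WIDTH; ++lane)
            ptr[lane] += lane * uniformEltSize;
    }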
diff --git a/ctx.h b/ctx.h
index b4677acb..b4472f1c 100644
--- a/ctx.h
+++ b/ctx.h
@@ -311,20 +311,13 @@ public:
     /** Given a scalar value, return a vector of the same type (or an
         array, for pointer types). */
-    llvm::Value *SmearScalar(llvm::Value *value, const char *name = NULL);
+    llvm::Value *SmearUniform(llvm::Value *value, const char *name = NULL);
 
     llvm::Value *BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
                              const char *name = NULL);
-    llvm::Value *PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
-                              const char *name = NULL);
+    llvm::Value *PtrToIntInst(llvm::Value *value, const char *name = NULL);
     llvm::Value *IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
                               const char *name = NULL);
-    /** Given a value of some array type, return the corresponding value of
-        vector type. */
-    llvm::Value *ArrayToVectorInst(llvm::Value *value);
-    /** Given a value of some vector type, return the corresponding value of
-        array type. */
-    llvm::Value *VectorToArrayInst(llvm::Value *value);
 
     llvm::Instruction *TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
                                  const char *name = NULL);
@@ -337,26 +330,37 @@ public:
     llvm::Instruction *ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
                                 const char *name = NULL);
 
-    /** This GEP method is a generalization of the standard one in LLVM; it
-        supports both uniform and varying basePtr values (an array of
-        pointers) as well as uniform and varying index values (arrays of
-        indices). */
+    /** These GEP methods are generalizations of the standard ones in LLVM;
+        they support both uniform and varying basePtr values as well as
+        uniform and varying index values (arrays of indices).  Varying base
+        pointers are expected to come in as vectors of i32/i64 (depending
+        on the target), since LLVM doesn't currently support vectors of
+        pointers.  The underlying type of the base pointer must be provided
+        via the ptrType parameter. */
+    llvm::Value *GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index,
+                                   const Type *ptrType, const char *name = NULL);
     llvm::Value *GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0,
-                                   llvm::Value *index1, const char *name = NULL);
-
-    /** This is a convenience method to generate a GEP instruction with
-        indices with values with known constant values as the ispc program
-        is being compiled. */
-    llvm::Value *GetElementPtrInst(llvm::Value *basePtr, int v0, int v1,
+                                   llvm::Value *index1, const Type *ptrType,
                                    const char *name = NULL);
 
+    /** This method returns a new pointer that represents offsetting the
+        given base pointer to point at the given element number of the
+        structure type that the base pointer points to.  (The provided
+        pointer must be a pointer to a structure type.  The ptrType gives
+        the type of the pointer, though it may be NULL if the base pointer
+        is uniform.) */
+    llvm::Value *AddElementOffset(llvm::Value *basePtr, int elementNum,
+                                  const Type *ptrType, const char *name = NULL);
+
     /** Load from the memory location(s) given by lvalue, using the given
         mask.  The lvalue may be varying, in which case this corresponds to
         a gather from the multiple memory locations given by the array of
         pointer values given by the lvalue.  If the lvalue is not varying,
         then both the mask pointer and the type pointer may be NULL. */
-    llvm::Value *LoadInst(llvm::Value *lvalue, llvm::Value *mask,
-                          const Type *type, const char *name = NULL);
+    llvm::Value *LoadInst(llvm::Value *ptr, llvm::Value *mask,
+                          const Type *ptrType, const char *name = NULL);
+
+    llvm::Value *LoadInst(llvm::Value *ptr, const char *name = NULL);
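
The masked LoadInst() overload just declared may lower to a gather, and the varying StoreInst() declared below to a scatter; their per-lane semantics can be sketched in scalar terms like this (illustrative only; the names are invented, and the real work happens in the __pseudo_gather*/__pseudo_scatter* builtins):

    #include <cstdint>
    static const int WIDTH = 8;   // gang size stand-in
    static void gather32(const uint32_t *ptrs[WIDTH], const bool mask[WIDTH],
                         uint32_t result[WIDTH]) {
        for (int lane = 0; lane < WIDTH; ++lane)
            if (mask[lane])               // inactive lanes load nothing
                result[lane] = *ptrs[lane];
    }
    static void scatter32(uint32_t *ptrs[WIDTH], const bool mask[WIDTH],
                          const uint32_t value[WIDTH]) {
        for (int lane = 0; lane < WIDTH; ++lane)
            if (mask[lane])               // inactive lanes store nothing
                *ptrs[lane] = value[lane];
    }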
     /** Emits an alloca instruction to allocate stack storage for the given
         type.  If a non-zero alignment is specified, the object is also
@@ -370,16 +374,14 @@ public:
 
     /** Standard store instruction; for this variant, the lvalue must be
         a single pointer, not a varying lvalue. */
-    void StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
-                   const char *name = NULL);
+    void StoreInst(llvm::Value *value, llvm::Value *ptr);
 
     /** In this variant of StoreInst(), the lvalue may be varying.  If so,
         this corresponds to a scatter.  Whether the lvalue is uniform or
         varying, the given storeMask is used to mask the stores so that
         they only execute for the active program instances. */
-    void StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
-                   llvm::Value *storeMask, const Type *rvalueType,
-                   const char *name = NULL);
+    void StoreInst(llvm::Value *value, llvm::Value *ptr,
+                   llvm::Value *storeMask, const Type *ptrType);
 
     void BranchInst(llvm::BasicBlock *block);
     void BranchInst(llvm::BasicBlock *trueBlock, llvm::BasicBlock *falseBlock,
@@ -401,20 +403,22 @@ public:
     llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0,
                                   llvm::Value *val1, const char *name = NULL);
 
-    /** Emits IR to do a function call with the given arguments.  The
-        function return type must be provided in returnType. */
-    llvm::Value *CallInst(llvm::Value *func, const Type *returnType,
+    /** Emits IR to do a function call with the given arguments.  If the
+        function type is a varying function pointer type, its full type
+        must be provided in funcType.  funcType can be NULL if func is a
+        uniform function pointer. */
+    llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
                           const std::vector<llvm::Value *> &args,
                           const char *name = NULL);
 
     /** This is a convenience method that issues a call instruction to a
         function that takes just a single argument. */
-    llvm::Value *CallInst(llvm::Value *func, const Type *returnType,
+    llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
                           llvm::Value *arg, const char *name = NULL);
 
     /** This is a convenience method that issues a call instruction to a
         function that takes two arguments. */
-    llvm::Value *CallInst(llvm::Value *func, const Type *returnType,
+    llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
                           llvm::Value *arg0, llvm::Value *arg1,
                           const char *name = NULL);
 
@@ -530,15 +534,18 @@ private:
     void jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target);
     llvm::Value *emitGatherCallback(llvm::Value *lvalue, llvm::Value *retPtr);
 
+    llvm::Value *applyVaryingGEP(llvm::Value *basePtr, llvm::Value *index,
+                                 const Type *ptrType);
+
     void restoreMaskGivenReturns(llvm::Value *oldMask);
 
-    void scatter(llvm::Value *rvalue, llvm::Value *lvalue,
-                 llvm::Value *maskPtr, const Type *rvalueType);
-    llvm::Value *gather(llvm::Value *lvalue, llvm::Value *mask,
-                        const Type *type, const char *name);
-    void maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
-                     const Type *rvalueType, llvm::Value *maskPtr);
-    llvm::Value *addVaryingOffsetsIfNeeded(llvm::Value *value, const Type *type);
+    void scatter(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
+                 llvm::Value *mask);
+    void maskedStore(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
+                     llvm::Value *mask);
+    llvm::Value *gather(llvm::Value *ptr, const Type *ptrType, llvm::Value *mask,
+                        const char *name);
+    llvm::Value *addVaryingOffsetsIfNeeded(llvm::Value *ptr, const Type *ptrType);
 };
 
 #endif // ISPC_CTX_H
diff --git a/decl.cpp b/decl.cpp
index 1dad7571..e17ed88c 100644
--- a/decl.cpp
+++ b/decl.cpp
@@ -46,12 +46,14 @@
 #include
 #include
 
+/** Given a Type and a set of type qualifiers, apply the type qualifiers to
+    the type, returning the type that is the result.
+*/
 static const Type *
 lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
     if (type == NULL)
         return NULL;
 
-    // Account for 'unsigned' and 'const' qualifiers in the type
     if ((typeQualifiers & TYPEQUAL_UNSIGNED) != 0) {
         const Type *unsignedType = type->GetAsUnsignedType();
         if (unsignedType != NULL)
@@ -60,23 +62,21 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
             Error(pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
                   type->GetString().c_str());
     }
+
     if ((typeQualifiers & TYPEQUAL_CONST) != 0)
         type = type->GetAsConstType();
 
-    // if uniform/varying is specified explicitly, then go with that
-    if (dynamic_cast<const ReferenceType *>(type) == NULL) {
-        if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
+    if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
+        type = type->GetAsUniformType();
+    else if ((typeQualifiers & TYPEQUAL_VARYING) != 0)
+        type = type->GetAsVaryingType();
+    else {
+        // otherwise, structs are uniform by default and everything
+        // else is varying by default
+        if (dynamic_cast<const StructType *>(type->GetBaseType()) != NULL)
             type = type->GetAsUniformType();
-        else if ((typeQualifiers & TYPEQUAL_VARYING) != 0)
+        else
             type = type->GetAsVaryingType();
-        else {
-            // otherwise, structs are uniform by default and everything
-            // else is varying by default
-            if (dynamic_cast<const StructType *>(type->GetBaseType()) != NULL)
-                type = type->GetAsUniformType();
-            else
-                type = type->GetAsVaryingType();
-        }
     }
 
     return type;
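
For instance, under the defaulting rules implemented above (illustrative declarations; Foo is an invented name):

    // float x;          -> varying float   (non-structs default to varying)
    // struct Foo f;     -> uniform Foo     (structs default to uniform)
    // uniform float y;  -> uniform float   (explicit qualifiers always win)
    // varying Foo g;    -> varying Foo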
@@ -127,7 +127,6 @@ DeclSpecs::Print() const {
     if (typeQualifiers & TYPEQUAL_UNIFORM) printf("uniform ");
     if (typeQualifiers & TYPEQUAL_VARYING) printf("varying ");
     if (typeQualifiers & TYPEQUAL_TASK) printf("task ");
-    if (typeQualifiers & TYPEQUAL_REFERENCE) printf("reference ");
     if (typeQualifiers & TYPEQUAL_UNSIGNED) printf("unsigned ");
 
     printf("%s", baseType->GetString().c_str());
@@ -161,8 +160,10 @@ Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
 
 Symbol *
-Declarator::GetSymbol() {
-    Declarator *d = this;
+Declarator::GetSymbol() const {
+    // The symbol lives at the last child in the chain, so walk down there
+    // and return the one there.
+    const Declarator *d = this;
     while (d->child != NULL)
         d = d->child;
     return d->sym;
@@ -171,7 +172,12 @@ Declarator::GetSymbol() {
 
 void
 Declarator::Print() const {
-    printf("%s", sym->name.c_str());
+    Symbol *sym = GetSymbol();
+    if (sym != NULL)
+        printf("%s", sym->name.c_str());
+    else
+        printf("(null symbol)");
+
     if (initExpr != NULL) {
         printf(" = (");
         initExpr->Print();
@@ -181,28 +187,39 @@ Declarator::Print() const {
 }
 
 
-void
-Declarator::GetFunctionInfo(DeclSpecs *ds, Symbol **funSym,
-                            std::vector<Symbol *> *funArgs) {
-    // Get the symbol for the function from the symbol table.  (It should
-    // already have been added to the symbol table by AddGlobal() by the
-    // time we get here.)
+Symbol *
+Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
     const FunctionType *type =
         dynamic_cast<const FunctionType *>(GetType(ds));
     if (type == NULL)
-        return;
+        return NULL;
 
+    Symbol *declSym = GetSymbol();
     assert(declSym != NULL);
-    *funSym = m->symbolTable->LookupFunction(declSym->name.c_str(), type);
-    if (*funSym != NULL)
-        // May be NULL due to error earlier in compilation
-        (*funSym)->pos = pos;
 
-    for (unsigned int i = 0; i < functionArgs.size(); ++i) {
-        Declaration *pdecl = functionArgs[i];
+    // Get the symbol for the function from the symbol table.  (It should
+    // already have been added to the symbol table by AddGlobal() by the
+    // time we get here.)
+    Symbol *funSym = m->symbolTable->LookupFunction(declSym->name.c_str(), type);
+    if (funSym != NULL)
+        // May be NULL due to error earlier in compilation
+        funSym->pos = pos;
+
+    // Walk down to the declarator for the function.  (We have to get past
+    // the stuff that specifies the function's return type before we get to
+    // the function's declarator.)
+    Declarator *d = this;
+    while (d != NULL && d->kind != DK_FUNCTION)
+        d = d->child;
+    assert(d != NULL);
+
+    for (unsigned int i = 0; i < d->functionParams.size(); ++i) {
+        Declaration *pdecl = d->functionParams[i];
         assert(pdecl->declarators.size() == 1);
         funArgs->push_back(pdecl->declarators[0]->GetSymbol());
     }
+
+    return funSym;
 }
 
 
@@ -211,7 +228,6 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
     bool hasUniformQual = ((typeQualifiers & TYPEQUAL_UNIFORM) != 0);
     bool hasVaryingQual = ((typeQualifiers & TYPEQUAL_VARYING) != 0);
     bool isTask = ((typeQualifiers & TYPEQUAL_TASK) != 0);
-    bool isReference = ((typeQualifiers & TYPEQUAL_REFERENCE) != 0);
     bool isConst = ((typeQualifiers & TYPEQUAL_CONST) != 0);
 
     if (hasUniformQual && hasVaryingQual) {
@@ -224,13 +240,36 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
     const Type *type = base;
     switch (kind) {
     case DK_BASE:
+        // All of the type qualifiers should be in the DeclSpecs for the
+        // base declarator
         assert(typeQualifiers == 0);
         assert(child == NULL);
         return type;
 
     case DK_POINTER:
         type = new PointerType(type, hasUniformQual, isConst);
-        if (child)
+        if (child != NULL)
+            return child->GetType(type, ds);
+        else
+            return type;
+        break;
+
+    case DK_REFERENCE:
+        if (hasUniformQual)
+            Error(pos, "\"uniform\" qualifier is illegal to apply to references.");
+        if (hasVaryingQual)
+            Error(pos, "\"varying\" qualifier is illegal to apply to references.");
+        if (isConst)
+            Error(pos, "\"const\" qualifier is illegal to apply to references.");
+
+        // The parser should disallow this already, but double check.
+        if (dynamic_cast<const ReferenceType *>(type) != NULL) {
+            Error(pos, "References to references are illegal.");
+            return NULL;
+        }
+
+        type = new ReferenceType(type);
+        if (child != NULL)
            return child->GetType(type, ds);
        else
            return type;
@@ -250,10 +289,12 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
         std::vector<ConstExpr *> argDefaults;
         std::vector<SourcePos> argPos;
 
-        // Loop over the function arguments and get names and types for
-        // each one in the args and argNames arrays
-        for (unsigned int i = 0; i < functionArgs.size(); ++i) {
-            Declaration *d = functionArgs[i];
+        // Loop over the function arguments and store the names, types,
+        // default values (if any), and source file positions of each one
+        // in the corresponding vector.
+        for (unsigned int i = 0; i < functionParams.size(); ++i) {
+            Declaration *d = functionParams[i];
+
             char buf[32];
             Symbol *sym;
             if (d->declarators.size() == 0) {
@@ -266,6 +307,8 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
             else {
                 sym = d->declarators[0]->GetSymbol();
                 if (sym == NULL) {
+                    // Handle more complex anonymous declarations like
+                    // float (float **).
                     sprintf(buf, "__anon_parameter_%d", i);
                     sym = new Symbol(buf, pos);
                     sym->type = d->declarators[0]->GetType(d->declSpecs);
@@ -274,9 +317,15 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
 
             const ArrayType *at = dynamic_cast<const ArrayType *>(sym->type);
             if (at != NULL) {
-                // Arrays are passed by reference, so convert array
-                // parameters to be references here.
- sym->type = new ReferenceType(sym->type, sym->type->IsConstType()); + // As in C, arrays are passed to functions as pointers to + // their element type. We'll just immediately make this + // change now. (One shortcoming of losing the fact that + // the it was originally an array is that any warnings or + // errors later issued that print the function type will + // report this differently than it was originally declared + // in the function, but it's not clear that this is a + // significant problem.) + sym->type = PointerType::GetUniform(at->GetElementType()); // Make sure there are no unsized arrays (other than the // first dimension) in function parameter lists. @@ -296,6 +345,8 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const { ConstExpr *init = NULL; if (d->declarators.size()) { + // Try to find an initializer expression; if there is one, + // it lives down to the base declarator. Declarator *decl = d->declarators[0]; while (decl->child != NULL) { assert(decl->initExpr == NULL); @@ -314,11 +365,6 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const { argDefaults.push_back(init); } - if (isReference) { - Error(pos, "Function return types can't be reference types."); - return NULL; - } - const Type *returnType = type; if (returnType == NULL) { Error(pos, "No return type provided in function declaration."); @@ -328,6 +374,23 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const { bool isExported = ds && (ds->storageClass == SC_EXPORT); bool isExternC = ds && (ds->storageClass == SC_EXTERN_C); bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0); + + if (isExported && isTask) { + Error(pos, "Function can't have both \"task\" and \"export\" " + "qualifiers"); + return NULL; + } + if (isExternC && isTask) { + Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" " + "qualifiers"); + return NULL; + } + if (isExternC && isExported) { + Error(pos, "Function can't have both \"extern \"C\"\" and \"export\" " + "qualifiers"); + return NULL; + } + Type *functionType = new FunctionType(returnType, args, pos, argNames, argDefaults, argPos, isTask, isExported, isExternC); @@ -367,12 +430,6 @@ const Type * Declarator::GetType(DeclSpecs *ds) const { const Type *baseType = ds->GetBaseType(pos); const Type *type = GetType(baseType, ds); - - if ((ds->typeQualifiers & TYPEQUAL_REFERENCE) != 0) { - bool hasConstQual = ((ds->typeQualifiers & TYPEQUAL_CONST) != 0); - type = new ReferenceType(type, hasConstQual); - } - return type; } @@ -392,7 +449,7 @@ Declaration::Declaration(DeclSpecs *ds, std::vector *dlist) { Declaration::Declaration(DeclSpecs *ds, Declarator *d) { declSpecs = ds; - if (d) { + if (d != NULL) { d->InitFromDeclSpecs(ds); declarators.push_back(d); } @@ -409,6 +466,8 @@ Declaration::GetVariableDeclarations() const { continue; Declarator *decl = declarators[i]; if (decl == NULL || decl->kind == DK_FUNCTION) + // Ignore earlier errors or external function declarations + // inside other functions. continue; Symbol *sym = decl->GetSymbol(); @@ -452,14 +511,18 @@ GetStructTypesNamesPositions(const std::vector &sd, Declarator *d = (*sd[i]->declarators)[j]; d->InitFromDeclSpecs(&ds); - // if it's an unsized array, make it a reference to an unsized - // array, so the caller can pass a pointer... 
             Symbol *sym = d->GetSymbol();
-            const ArrayType *at = dynamic_cast<const ArrayType *>(sym->type);
-            if (at && at->GetElementCount() == 0)
-                sym->type = new ReferenceType(sym->type, type->IsConstType());
-            elementTypes->push_back(sym->type);
+
+            const ArrayType *arrayType =
+                dynamic_cast<const ArrayType *>(sym->type);
+            if (arrayType != NULL && arrayType->GetElementCount() == 0) {
+                Error(d->pos, "Unsized arrays aren't allowed in struct "
+                      "definitions.");
+                elementTypes->push_back(NULL);
+            }
+            else
+                elementTypes->push_back(sym->type);
+
             elementNames->push_back(sym->name);
             elementPositions->push_back(sym->pos);
         }
diff --git a/decl.h b/decl.h
index 019b251e..de966f41 100644
--- a/decl.h
+++ b/decl.h
@@ -79,9 +79,8 @@ enum StorageClass {
 #define TYPEQUAL_UNIFORM (1<<1)
 #define TYPEQUAL_VARYING (1<<2)
 #define TYPEQUAL_TASK (1<<3)
-#define TYPEQUAL_REFERENCE (1<<4)
-#define TYPEQUAL_UNSIGNED (1<<5)
-#define TYPEQUAL_INLINE (1<<6)
+#define TYPEQUAL_UNSIGNED (1<<4)
+#define TYPEQUAL_INLINE (1<<5)
 
 /** @brief Representation of the declaration specifiers in a declaration.
 
@@ -100,7 +99,7 @@ public:
     int typeQualifiers;
 
     /** The basic type provided in the declaration; this should be an
-        AtomicType, a StructType, or a VectorType; other types (like
+        AtomicType, EnumType, StructType, or VectorType; other types (like
         ArrayTypes) will end up being created if a particular declaration
         has an array size, etc. */
 
@@ -123,6 +122,7 @@ public:
 enum DeclaratorKind {
     DK_BASE,
     DK_POINTER,
+    DK_REFERENCE,
     DK_ARRAY,
     DK_FUNCTION
 };
 
@@ -142,33 +142,51 @@ public:
     void InitFromDeclSpecs(DeclSpecs *ds);
 
     /** Get the actual type of the combination of Declarator and the given
-        DeclSpecs */
+        DeclSpecs.  If an explicit base type is provided, the declarator is
+        applied to that type; otherwise the base type from the DeclSpecs is
+        used. */
     const Type *GetType(DeclSpecs *ds) const;
     const Type *GetType(const Type *base, DeclSpecs *ds) const;
 
-    void GetFunctionInfo(DeclSpecs *ds, Symbol **sym,
-                         std::vector<Symbol *> *args);
+    /** Returns the symbol corresponding to the function declared by this
+        declarator and symbols for its arguments in *args. */
+    Symbol *GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *args);
 
-    Symbol *GetSymbol();
+    /** Returns the symbol associated with the declarator. */
+    Symbol *GetSymbol() const;
 
     void Print() const;
 
+    /** Position of the declarator in the source program. */
     const SourcePos pos;
 
+    /** The kind of this declarator; complex declarations are assembled as
+        a hierarchy of Declarators.  (For example, a pointer to an int
+        would have a root declarator with kind DK_POINTER and with the
+        Declarator::child member pointing to a DK_BASE declarator for the
+        int). */
     const DeclaratorKind kind;
 
+    /** Child pointer if needed; this can only be non-NULL if the
+        declarator's kind isn't DK_BASE. */
     Declarator *child;
 
+    /** Type qualifiers provided with the declarator. */
     int typeQualifiers;
 
+    /** For array declarators, this gives the declared size of the array.
+        Unsized arrays have arraySize == 0. */
     int arraySize;
 
+    /** Symbol associated with the declarator. */
     Symbol *sym;
 
     /** Initialization expression for the variable.  May be NULL. */
     Expr *initExpr;
 
-    std::vector<Declaration *> functionArgs;
+    /** For function declarations, this holds the Declaration *s for the
+        function's parameters. */
+    std::vector<Declaration *> functionParams;
 };
 
 
@@ -182,6 +200,11 @@ public:
 
     void Print() const;
 
+    /** This method walks through all of the Declarators in a declaration
+        and returns a fully-initialized Symbol and (possibly) an
+        initialization expression for each one.
(This allows the rest of + the system to not have to worry about the mess of the general + Declarator representation.) */ std::vector GetVariableDeclarations() const; DeclSpecs *declSpecs; diff --git a/examples/aobench/ao.ispc b/examples/aobench/ao.ispc index e48a544e..3deaa340 100644 --- a/examples/aobench/ao.ispc +++ b/examples/aobench/ao.ispc @@ -75,7 +75,7 @@ static inline vec vcross(vec v0, vec v1) { return ret; } -static inline void vnormalize(reference vec v) { +static inline void vnormalize(vec &v) { float len2 = dot(v, v); float invlen = rsqrt(len2); v *= invlen; @@ -83,8 +83,7 @@ static inline void vnormalize(reference vec v) { static inline void -ray_plane_intersect(reference Isect isect, reference Ray ray, - reference Plane plane) { +ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) { float d = -dot(plane.p, plane.n); float v = dot(ray.dir, plane.n); @@ -104,8 +103,7 @@ ray_plane_intersect(reference Isect isect, reference Ray ray, static inline void -ray_sphere_intersect(reference Isect isect, reference Ray ray, - reference Sphere sphere) { +ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) { vec rs = ray.org - sphere.center; float B = dot(rs, ray.dir); @@ -127,7 +125,7 @@ ray_sphere_intersect(reference Isect isect, reference Ray ray, static inline void -orthoBasis(reference vec basis[3], vec n) { +orthoBasis(vec basis[3], vec n) { basis[2] = n; basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0; @@ -150,8 +148,8 @@ orthoBasis(reference vec basis[3], vec n) { static inline float -ambient_occlusion(reference Isect isect, reference Plane plane, - reference Sphere spheres[3], reference RNGState rngstate) { +ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3], + RNGState &rngstate) { float eps = 0.0001f; vec p, n; vec basis[3]; @@ -168,8 +166,8 @@ ambient_occlusion(reference Isect isect, reference Plane plane, Ray ray; Isect occIsect; - float theta = sqrt(frandom(rngstate)); - float phi = 2.0f * M_PI * frandom(rngstate); + float theta = sqrt(frandom(&rngstate)); + float phi = 2.0f * M_PI * frandom(&rngstate); float x = cos(phi) * theta; float y = sin(phi) * theta; float z = sqrt(1.0 - theta * theta); @@ -205,7 +203,7 @@ ambient_occlusion(reference Isect isect, reference Plane plane, */ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h, uniform int nsubsamples, - reference uniform float image[]) { + uniform float image[]) { static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } }; static Sphere spheres[3] = { { { -2.0f, 0.0f, -3.5f }, 0.5f }, @@ -213,7 +211,7 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w, { { 1.0f, 0.0f, -2.2f }, 0.5f } }; RNGState rngstate; - seed_rng(rngstate, y0); + seed_rng(&rngstate, y0); // Compute the mapping between the 'programCount'-wide program // instances running in parallel and samples in the image. 
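
The edits in this and the remaining example files track the language change earlier in this patch: the removed "reference" qualifier is replaced by C++-style reference syntax on parameters, for example (illustrative, mirroring the diffs above):

    // before:  static inline void vnormalize(reference vec v)
    // after:   static inline void vnormalize(vec &v)

and RNG state is now passed to seed_rng()/frandom() by pointer rather than by reference.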
diff --git a/examples/aobench_instrumented/ao.ispc b/examples/aobench_instrumented/ao.ispc index 192e0666..3deaa340 100644 --- a/examples/aobench_instrumented/ao.ispc +++ b/examples/aobench_instrumented/ao.ispc @@ -75,7 +75,7 @@ static inline vec vcross(vec v0, vec v1) { return ret; } -static inline void vnormalize(reference vec v) { +static inline void vnormalize(vec &v) { float len2 = dot(v, v); float invlen = rsqrt(len2); v *= invlen; @@ -83,8 +83,7 @@ static inline void vnormalize(reference vec v) { static inline void -ray_plane_intersect(reference Isect isect, reference Ray ray, - reference Plane plane) { +ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) { float d = -dot(plane.p, plane.n); float v = dot(ray.dir, plane.n); @@ -104,8 +103,7 @@ ray_plane_intersect(reference Isect isect, reference Ray ray, static inline void -ray_sphere_intersect(reference Isect isect, reference Ray ray, - reference Sphere sphere) { +ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) { vec rs = ray.org - sphere.center; float B = dot(rs, ray.dir); @@ -127,7 +125,7 @@ ray_sphere_intersect(reference Isect isect, reference Ray ray, static inline void -orthoBasis(reference vec basis[3], vec n) { +orthoBasis(vec basis[3], vec n) { basis[2] = n; basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0; @@ -150,8 +148,8 @@ orthoBasis(reference vec basis[3], vec n) { static inline float -ambient_occlusion(reference Isect isect, reference Plane plane, - reference Sphere spheres[3], reference RNGState rngstate) { +ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3], + RNGState &rngstate) { float eps = 0.0001f; vec p, n; vec basis[3]; @@ -168,8 +166,8 @@ ambient_occlusion(reference Isect isect, reference Plane plane, Ray ray; Isect occIsect; - float theta = sqrt(frandom(rngstate)); - float phi = 2.0f * M_PI * frandom(rngstate); + float theta = sqrt(frandom(&rngstate)); + float phi = 2.0f * M_PI * frandom(&rngstate); float x = cos(phi) * theta; float y = sin(phi) * theta; float z = sqrt(1.0 - theta * theta); @@ -203,8 +201,9 @@ ambient_occlusion(reference Isect isect, reference Plane plane, /* Compute the image for the scanlines from [y0,y1), for an overall image of width w and height h. */ -void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h, - uniform int nsubsamples, reference uniform float image[]) { +static void ao_scanlines(uniform int y0, uniform int y1, uniform int w, + uniform int h, uniform int nsubsamples, + uniform float image[]) { static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } }; static Sphere spheres[3] = { { { -2.0f, 0.0f, -3.5f }, 0.5f }, @@ -212,7 +211,7 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h, { { 1.0f, 0.0f, -2.2f }, 0.5f } }; RNGState rngstate; - seed_rng(rngstate, y0); + seed_rng(&rngstate, y0); // Compute the mapping between the 'programCount'-wide program // instances running in parallel and samples in the image. @@ -231,6 +230,9 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h, // direction we do per iteration and ny the number in y. uniform int nx = 1, ny = 1; + // FIXME: We actually need ny to be 1 regardless of the decomposition, + // since the task decomposition is one scanline high. 
+ if (programCount == 8) { // Do two pixels at once in the x direction nx = 2; @@ -239,19 +241,21 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h, ++du; } else if (programCount == 16) { - // Two at once in both x and y - nx = ny = 2; - if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12) + nx = 4; + ny = 1; + if (programIndex >= 4 && programIndex < 8) ++du; - if (programIndex >= 8) - ++dv; + if (programIndex >= 8 && programIndex < 12) + du += 2; + if (programIndex >= 12) + du += 3; } // Now loop over all of the pixels, stepping in x and y as calculated // above. (Assumes that ny divides y and nx divides x...) for (uniform int y = y0; y < y1; y += ny) { for (uniform int x = 0; x < w; x += nx) { - // Figur out x,y pixel in NDC + // Figure out x,y pixel in NDC float px = (x + du - (w / 2.0f)) / (w / 2.0f); float py = -(y + dv - (h / 2.0f)) / (h / 2.0f); float ret = 0.f; @@ -293,7 +297,7 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h, // offset to the first pixel in the image uniform int offset = 3 * (y * w + x); - for (uniform int p = 0; p < programCount; p += 4, ++offset) { + for (uniform int p = 0; p < programCount; p += 4, offset += 3) { // Get the four sample values for this pixel uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] + retArray[p+3]; @@ -315,3 +319,15 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples, uniform float image[]) { ao_scanlines(0, h, w, h, nsubsamples, image); } + + +static void task ao_task(uniform int width, uniform int height, + uniform int nsubsamples, uniform float image[]) { + ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image); +} + + +export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, + uniform float image[]) { + launch[h] < ao_task(w, h, nsubsamples, image) >; +} diff --git a/examples/deferred/kernels.ispc b/examples/deferred/kernels.ispc index 65fa1547..6ade1d82 100644 --- a/examples/deferred/kernels.ispc +++ b/examples/deferred/kernels.ispc @@ -35,22 +35,22 @@ struct InputDataArrays { - uniform float zBuffer[]; - uniform unsigned int16 normalEncoded_x[]; // half float - uniform unsigned int16 normalEncoded_y[]; // half float - uniform unsigned int16 specularAmount[]; // half float - uniform unsigned int16 specularPower[]; // half float - uniform unsigned int8 albedo_x[]; // unorm8 - uniform unsigned int8 albedo_y[]; // unorm8 - uniform unsigned int8 albedo_z[]; // unorm8 - uniform float lightPositionView_x[]; - uniform float lightPositionView_y[]; - uniform float lightPositionView_z[]; - uniform float lightAttenuationBegin[]; - uniform float lightColor_x[]; - uniform float lightColor_y[]; - uniform float lightColor_z[]; - uniform float lightAttenuationEnd[]; + uniform float * uniform zBuffer; + uniform unsigned int16 * uniform normalEncoded_x; // half float + uniform unsigned int16 * uniform normalEncoded_y; // half float + uniform unsigned int16 * uniform specularAmount; // half float + uniform unsigned int16 * uniform specularPower; // half float + uniform unsigned int8 * uniform albedo_x; // unorm8 + uniform unsigned int8 * uniform albedo_y; // unorm8 + uniform unsigned int8 * uniform albedo_z; // unorm8 + uniform float * uniform lightPositionView_x; + uniform float * uniform lightPositionView_y; + uniform float * uniform lightPositionView_z; + uniform float * uniform lightAttenuationBegin; + uniform float * uniform lightColor_x; + uniform float * uniform lightColor_y; + 
uniform float * uniform lightColor_z; + uniform float * uniform lightAttenuationEnd; }; struct InputHeader @@ -77,8 +77,7 @@ dot3(float x, float y, float z, float a, float b, float c) { static inline void -normalize3(float x, float y, float z, reference float ox, - reference float oy, reference float oz) { +normalize3(float x, float y, float z, float &ox, float &oy, float &oz) { float n = rsqrt(x*x + y*y + z*z); ox = x * n; oy = y * n; @@ -110,8 +109,8 @@ ComputeZBounds( uniform float cameraProj_33, uniform float cameraProj_43, uniform float cameraNear, uniform float cameraFar, // Output - reference uniform float minZ, - reference uniform float maxZ + uniform float &minZ, + uniform float &maxZ ) { // Find Z bounds @@ -156,7 +155,7 @@ IntersectLightsWithTileMinMax( uniform float light_positionView_z_array[], uniform float light_attenuationEnd_array[], // Output - reference uniform int32 tileLightIndices[] + uniform int32 tileLightIndices[] ) { uniform float gBufferScale_x = 0.5f * (float)gBufferWidth; @@ -268,7 +267,7 @@ IntersectLightsWithTile( uniform float light_positionView_z_array[], uniform float light_attenuationEnd_array[], // Output - reference uniform int32 tileLightIndices[] + uniform int32 tileLightIndices[] ) { uniform float minZ, maxZ; @@ -293,19 +292,19 @@ ShadeTile( uniform int32 tileStartX, uniform int32 tileEndX, uniform int32 tileStartY, uniform int32 tileEndY, uniform int32 gBufferWidth, uniform int32 gBufferHeight, - reference uniform InputDataArrays inputData, + uniform InputDataArrays &inputData, // Camera data uniform float cameraProj_11, uniform float cameraProj_22, uniform float cameraProj_33, uniform float cameraProj_43, // Light list - reference uniform int32 tileLightIndices[], + uniform int32 tileLightIndices[], uniform int32 tileNumLights, // UI uniform bool visualizeLightCount, // Output - reference uniform unsigned int8 framebuffer_r[], - reference uniform unsigned int8 framebuffer_g[], - reference uniform unsigned int8 framebuffer_b[] + uniform unsigned int8 framebuffer_r[], + uniform unsigned int8 framebuffer_g[], + uniform unsigned int8 framebuffer_b[] ) { if (tileNumLights == 0 || visualizeLightCount) { @@ -478,13 +477,13 @@ ShadeTile( task void RenderTile(uniform int num_groups_x, uniform int num_groups_y, - reference uniform InputHeader inputHeader, - reference uniform InputDataArrays inputData, + uniform InputHeader &inputHeader, + uniform InputDataArrays &inputData, uniform int visualizeLightCount, // Output - reference uniform unsigned int8 framebuffer_r[], - reference uniform unsigned int8 framebuffer_g[], - reference uniform unsigned int8 framebuffer_b[]) { + uniform unsigned int8 framebuffer_r[], + uniform unsigned int8 framebuffer_g[], + uniform unsigned int8 framebuffer_b[]) { uniform int32 group_y = taskIndex / num_groups_x; uniform int32 group_x = taskIndex % num_groups_x; uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH; @@ -526,13 +525,13 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y, export void -RenderStatic(reference uniform InputHeader inputHeader, - reference uniform InputDataArrays inputData, +RenderStatic(uniform InputHeader &inputHeader, + uniform InputDataArrays &inputData, uniform int visualizeLightCount, // Output - reference uniform unsigned int8 framebuffer_r[], - reference uniform unsigned int8 framebuffer_g[], - reference uniform unsigned int8 framebuffer_b[]) { + uniform unsigned int8 framebuffer_r[], + uniform unsigned int8 framebuffer_g[], + uniform unsigned int8 framebuffer_b[]) { uniform int 
num_groups_x = (inputHeader.framebufferWidth + MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH; uniform int num_groups_y = (inputHeader.framebufferHeight + @@ -564,8 +563,8 @@ ComputeZBoundsRow( uniform float cameraProj_33, uniform float cameraProj_43, uniform float cameraNear, uniform float cameraFar, // Output - reference uniform float minZArray[], - reference uniform float maxZArray[] + uniform float minZArray[], + uniform float maxZArray[] ) { for (uniform int32 tileX = 0; tileX < numTilesX; ++tileX) { @@ -596,7 +595,7 @@ SplitTileMinMax( // Camera data uniform float cameraProj_11, uniform float cameraProj_22, // Light Data - reference uniform int32 lightIndices[], + uniform int32 lightIndices[], uniform int32 numLights, uniform float light_positionView_x_array[], uniform float light_positionView_y_array[], @@ -605,9 +604,9 @@ SplitTileMinMax( // Outputs // TODO: ISPC doesn't currently like multidimensionsal arrays so we'll do the // indexing math ourselves - reference uniform int32 subtileIndices[], + uniform int32 subtileIndices[], uniform int32 subtileIndicesPitch, - reference uniform int32 subtileNumLights[] + uniform int32 subtileNumLights[] ) { uniform float gBufferScale_x = 0.5f * (float)gBufferWidth; diff --git a/examples/mandelbrot/mandelbrot.ispc b/examples/mandelbrot/mandelbrot.ispc index ecbb4fc1..9243b52a 100644 --- a/examples/mandelbrot/mandelbrot.ispc +++ b/examples/mandelbrot/mandelbrot.ispc @@ -51,7 +51,7 @@ export void mandelbrot_ispc(uniform float x0, uniform float y0, uniform float x1, uniform float y1, uniform int width, uniform int height, uniform int maxIterations, - reference uniform int output[]) + uniform int output[]) { float dx = (x1 - x0) / width; float dy = (y1 - y0) / height; diff --git a/examples/mandelbrot_tasks/mandelbrot.ispc b/examples/mandelbrot_tasks/mandelbrot.ispc index e52725df..d4ffeff5 100644 --- a/examples/mandelbrot_tasks/mandelbrot.ispc +++ b/examples/mandelbrot_tasks/mandelbrot.ispc @@ -57,7 +57,7 @@ mandelbrot_scanlines(uniform int ybase, uniform int span, uniform float x0, uniform float dx, uniform float y0, uniform float dy, uniform int width, uniform int maxIterations, - reference uniform int output[]) { + uniform int output[]) { uniform int ystart = ybase + taskIndex * span; uniform int yend = ystart + span; @@ -77,7 +77,7 @@ task void mandelbrot_chunk(uniform float x0, uniform float dx, uniform float y0, uniform float dy, uniform int width, uniform int height, - uniform int maxIterations, reference uniform int output[]) { + uniform int maxIterations, uniform int output[]) { uniform int ystart = taskIndex * (height/taskCount); uniform int yend = (taskIndex+1) * (height/taskCount); uniform int span = 1; @@ -91,7 +91,7 @@ export void mandelbrot_ispc(uniform float x0, uniform float y0, uniform float x1, uniform float y1, uniform int width, uniform int height, - uniform int maxIterations, reference uniform int output[]) { + uniform int maxIterations, uniform int output[]) { uniform float dx = (x1 - x0) / width; uniform float dy = (y1 - y0) / height; diff --git a/examples/rt/rt.ispc b/examples/rt/rt.ispc index 88a4a7f6..47abee80 100644 --- a/examples/rt/rt.ispc +++ b/examples/rt/rt.ispc @@ -73,7 +73,7 @@ static inline float Dot(const float3 a, const float3 b) { static void generateRay(uniform const float raster2camera[4][4], uniform const float camera2world[4][4], - float x, float y, reference Ray ray) { + float x, float y, Ray &ray) { ray.mint = 0.f; ray.maxt = 1e30f; @@ -105,7 +105,7 @@ static void generateRay(uniform const float 
raster2camera[4][4], static inline bool BBoxIntersect(const uniform float bounds[2][3], - const reference Ray ray) { + const Ray &ray) { uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] }; uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] }; float t0 = ray.mint, t1 = ray.maxt; @@ -143,7 +143,7 @@ static inline bool BBoxIntersect(const uniform float bounds[2][3], -static inline bool TriIntersect(const reference Triangle tri, reference Ray ray) { +static inline bool TriIntersect(const Triangle &tri, Ray &ray) { uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] }; uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] }; uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] }; @@ -184,7 +184,7 @@ static inline bool TriIntersect(const reference Triangle tri, reference Ray ray) bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[], - reference Ray r) { + Ray &r) { Ray ray = r; bool hit = false; // Follow ray through BVH nodes to find primitive intersections diff --git a/examples/volume_rendering/Makefile b/examples/volume_rendering/Makefile index fa8ff753..0f3f83b2 100644 --- a/examples/volume_rendering/Makefile +++ b/examples/volume_rendering/Makefile @@ -8,7 +8,7 @@ TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o))) CXX=g++ CXXFLAGS=-Iobjs/ -O3 -Wall -m64 ISPC=ispc -ISPCFLAGS=-O2 --target=sse2,sse4-x2 --arch=x86-64 +ISPCFLAGS=-O2 --target=sse2,sse4-x2 --arch=x86-64 --opt=32-bit-addressing OBJS=objs/volume.o objs/volume_serial.o $(TASK_OBJ) objs/volume_ispc.o \ objs/volume_ispc_sse2.o objs/volume_ispc_sse4.o diff --git a/examples/volume_rendering/volume.ispc b/examples/volume_rendering/volume.ispc index 39a5a734..c4bc0c1a 100644 --- a/examples/volume_rendering/volume.ispc +++ b/examples/volume_rendering/volume.ispc @@ -41,7 +41,7 @@ struct Ray { static void generateRay(const uniform float raster2camera[4][4], const uniform float camera2world[4][4], - float x, float y, reference Ray ray) { + float x, float y, Ray &ray) { // transform raster coordinate (x, y, 0) to camera space float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3]; float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3]; @@ -70,7 +70,7 @@ Inside(float3 p, float3 pMin, float3 pMax) { static bool -IntersectP(Ray ray, float3 pMin, float3 pMax, reference float hit0, reference float hit1) { +IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) { float t0 = -1e30, t1 = 1e30; float3 tNear = (pMin - ray.origin) / ray.dir; @@ -141,7 +141,7 @@ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) { static inline float Density(float3 Pobj, float3 pMin, float3 pMax, uniform float density[], uniform int nVoxels[3], - reference uniform bool checkForSameVoxel) { + uniform bool &checkForSameVoxel) { if (!Inside(Pobj, pMin, pMax)) return 0; // Compute voxel coordinates and offsets for _Pobj_ @@ -155,8 +155,8 @@ static inline float Density(float3 Pobj, float3 pMin, float3 pMax, // Trilinearly interpolate density values to compute local density float d00, d10, d01, d11; uniform int uvx, uvy, uvz; - if (checkForSameVoxel && reduce_equal(vx, uvx) && reduce_equal(vy, uvy) && - reduce_equal(vz, uvz)) { + if (checkForSameVoxel && reduce_equal(vx, &uvx) && reduce_equal(vy, &uvy) && + reduce_equal(vz, &uvz)) { // If all of the program instances are inside the same voxel, then // we'll call the 'uniform' variant of the voxel density lookup // function, thus doing a single load for 
each value rather than a diff --git a/examples/volume_rendering/volume.vcxproj b/examples/volume_rendering/volume.vcxproj index 12298017..423c26fc 100644 --- a/examples/volume_rendering/volume.vcxproj +++ b/examples/volume_rendering/volume.vcxproj @@ -1,4 +1,4 @@ - + @@ -158,13 +158,13 @@ Document ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2 - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2 + ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2 --opt=32-bit-addressing $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2 - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2 + ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2 --opt=32-bit-addressing $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h diff --git a/expr.cpp b/expr.cpp index efa202f7..1ddabb44 100644 --- a/expr.cpp +++ b/expr.cpp @@ -68,6 +68,14 @@ Expr::GetLValue(FunctionEmitContext *ctx) const { } +const Type * +Expr::GetLValueType() const { + // This also only needs to be overridden by Exprs that implement the + // GetLValue() method. + return NULL; +} + + llvm::Constant * Expr::GetConstant(const Type *type) const { // The default is failure; just return NULL @@ -78,7 +86,7 @@ Expr::GetConstant(const Type *type) const { Symbol * Expr::GetBaseSymbol() const { // Not all expressions can do this, so provide a generally-useful - // default + // default implementation.
return NULL; } @@ -155,14 +163,6 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr, return false; } - if (toType->IsUniformType() && fromType->IsVaryingType()) { - if (!failureOk) - Error(pos, "Can't convert from varying type \"%s\" to uniform " - "type \"%s\" for %s.", fromType->GetString().c_str(), - toType->GetString().c_str(), errorMsgBase); - return false; - } - const ArrayType *toArrayType = dynamic_cast(toType); const ArrayType *fromArrayType = dynamic_cast(fromType); const VectorType *toVectorType = dynamic_cast(toType); @@ -173,34 +173,78 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr, const EnumType *fromEnumType = dynamic_cast(fromType); const AtomicType *toAtomicType = dynamic_cast(toType); const AtomicType *fromAtomicType = dynamic_cast(fromType); - const PointerType *fromPointerType = dynamic_cast(fromType); const PointerType *toPointerType = dynamic_cast(toType); + + // Do this early, since for the case of a conversion like + // "float foo[10]" -> "float * uniform foo", we have what's seemingly + // a varying to uniform conversion (but not really) + if (fromArrayType != NULL && toPointerType != NULL) { + // array to pointer to array element type + const Type *eltType = fromArrayType->GetElementType(); + if (toPointerType->GetBaseType()->IsConstType()) + eltType = eltType->GetAsConstType(); + if (Type::Equal(toPointerType, + new PointerType(eltType, + toPointerType->IsUniformType(), + toPointerType->IsConstType()))) + goto typecast_ok; + else { + if (!failureOk) + Error(pos, "Can't convert from incompatible array type \"%s\" " + "to pointer type \"%s\" for %s.", + fromType->GetString().c_str(), + toType->GetString().c_str(), errorMsgBase); + return false; + } + } + + if (toType->IsUniformType() && fromType->IsVaryingType()) { + if (!failureOk) + Error(pos, "Can't convert from varying type \"%s\" to uniform " + "type \"%s\" for %s.", fromType->GetString().c_str(), + toType->GetString().c_str(), errorMsgBase); + return false; + } + if (fromPointerType != NULL) { if (dynamic_cast(toType) != NULL && toType->IsBoolType()) // Allow implicit conversion of pointers to bools goto typecast_ok; + if (toArrayType != NULL && + Type::Equal(fromType->GetBaseType(), toArrayType->GetElementType())) { + // Can convert pointers to arrays of the same type + goto typecast_ok; + } if (toPointerType == NULL) { if (!failureOk) Error(pos, "Can't convert between from pointer type " - "\"%s\" to non-pointer type \"%s\".", + "\"%s\" to non-pointer type \"%s\" for %s.", fromType->GetString().c_str(), - toType->GetString().c_str()); + toType->GetString().c_str(), errorMsgBase); return false; } - else if (Type::Equal(fromPointerType->GetAsUniformType()->GetAsConstType(), - PointerType::Void)) { - // void *s can be converted to any other pointer type + else if (PointerType::IsVoidPointer(toPointerType)) { + // any pointer type can be converted to a void * + goto typecast_ok; + } + else if (PointerType::IsVoidPointer(fromPointerType) && + expr != NULL && + dynamic_cast(*expr) != NULL) { + // and a NULL convert to any other pointer type goto typecast_ok; } else if (!Type::Equal(fromPointerType->GetBaseType(), + toPointerType->GetBaseType()) && + !Type::Equal(fromPointerType->GetBaseType()->GetAsConstType(), toPointerType->GetBaseType())) { if (!failureOk) Error(pos, "Can't convert between incompatible pointer types " - "\"%s\" and \"%s\".", fromPointerType->GetString().c_str(), - toPointerType->GetString().c_str()); + "\"%s\" and \"%s\" for %s.", + 
fromPointerType->GetString().c_str(), + toPointerType->GetString().c_str(), errorMsgBase); return false; } @@ -269,8 +313,8 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr, return false; } else - return lDoTypeConv(new ReferenceType(fromType, toType->IsConstType()), - toType, NULL, failureOk, errorMsgBase, pos); + return lDoTypeConv(new ReferenceType(fromType), toType, NULL, + failureOk, errorMsgBase, pos); } else if (Type::Equal(toType, fromType->GetAsNonConstType())) // convert: const T -> T (as long as T isn't a reference) @@ -278,25 +322,18 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr, fromType = fromType->GetReferenceTarget(); toType = toType->GetReferenceTarget(); - if (toArrayType && fromArrayType) { if (Type::Equal(toArrayType->GetElementType(), fromArrayType->GetElementType())) { // the case of different element counts should have returned - // out earlier, yes?? + // successfully earlier, yes?? assert(toArrayType->GetElementCount() != fromArrayType->GetElementCount()); - if (expr != NULL) - *expr = new TypeCastExpr(new ReferenceType(toType, false), - *expr, false, pos); - return true; + goto typecast_ok; } else if (Type::Equal(toArrayType->GetElementType(), fromArrayType->GetElementType()->GetAsConstType())) { // T[x] -> const T[x] - if (expr != NULL) - *expr = new TypeCastExpr(new ReferenceType(toType, false), - *expr, false, pos); - return true; + goto typecast_ok; } else { if (!failureOk) @@ -324,8 +361,8 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr, fromStructType->GetAsUniformType()->GetAsConstType())) { if (!failureOk) Error(pos, "Can't convert between different struct types " - "\"%s\" -> \"%s\".", fromStructType->GetString().c_str(), - toStructType->GetString().c_str()); + "\"%s\" and \"%s\" for %s.", fromStructType->GetString().c_str(), + toStructType->GetString().c_str(), errorMsgBase); return false; } goto typecast_ok; @@ -333,12 +370,12 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr, if (toEnumType != NULL && fromEnumType != NULL) { // No implicit conversions between different enum types - if (!Type::Equal(toEnumType->GetAsUniformType()->GetAsConstType(), - fromEnumType->GetAsUniformType()->GetAsConstType())) { + if (!Type::EqualIgnoringConst(toEnumType->GetAsUniformType(), + fromEnumType->GetAsUniformType())) { if (!failureOk) Error(pos, "Can't convert between different enum types " - "\"%s\" -> \"%s\".", fromEnumType->GetString().c_str(), - toEnumType->GetString().c_str()); + "\"%s\" and \"%s\" for %s", fromEnumType->GetString().c_str(), + toEnumType->GetString().c_str(), errorMsgBase); return false; } goto typecast_ok; @@ -382,8 +419,10 @@ lDoTypeConv(const Type *fromType, const Type *toType, Expr **expr, bool -CanConvertTypes(const Type *fromType, const Type *toType) { - return lDoTypeConv(fromType, toType, NULL, true, NULL, SourcePos()); +CanConvertTypes(const Type *fromType, const Type *toType, + const char *errorMsgBase, SourcePos pos) { + return lDoTypeConv(fromType, toType, NULL, errorMsgBase == NULL, + errorMsgBase, pos); } @@ -431,10 +470,12 @@ lLLVMConstantValue(const Type *type, llvm::LLVMContext *ctx, double value) { const AtomicType *atomicType = dynamic_cast(type); const EnumType *enumType = dynamic_cast(type); const VectorType *vectorType = dynamic_cast(type); + const PointerType *pointerType = dynamic_cast(type); // This function is only called with, and only works for atomic, enum, // and vector types. 
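The conversion rules added to lDoTypeConv() above are easiest to see from the ispc side: an array name now decays to a pointer to its element type, any pointer converts implicitly to void *, and a NULL literal converts to any pointer type. A sketch under those rules (pointerConversions is an illustrative name, not part of the patch):

static void pointerConversions() {
    uniform float a[16];
    uniform float * uniform p = a;     // array decays to pointer-to-element
    void * uniform v = p;              // any pointer converts to void *
    uniform float * uniform q = NULL;  // NULL converts to any pointer type
}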
- assert(atomicType != NULL || enumType != NULL || vectorType != NULL); + assert(atomicType != NULL || enumType != NULL || vectorType != NULL || + pointerType != NULL); if (atomicType != NULL || enumType != NULL) { // If it's an atomic or enuemrator type, then figure out which of @@ -500,42 +541,56 @@ lLLVMConstantValue(const Type *type, llvm::LLVMContext *ctx, double value) { return NULL; } } - - // For vector types, first get the LLVM constant for the basetype with - // a recursive call to lLLVMConstantValue(). - const Type *baseType = vectorType->GetBaseType(); - llvm::Constant *constElement = lLLVMConstantValue(baseType, ctx, value); - LLVM_TYPE_CONST llvm::Type *llvmVectorType = vectorType->LLVMType(ctx); - - // Now create a constant version of the corresponding LLVM type that we - // use to represent the VectorType. - // FIXME: this is a little ugly in that the fact that ispc represents - // uniform VectorTypes as LLVM VectorTypes and varying VectorTypes as - // LLVM ArrayTypes leaks into the code here; it feels like this detail - // should be better encapsulated? - if (baseType->IsUniformType()) { - LLVM_TYPE_CONST llvm::VectorType *lvt = - llvm::dyn_cast(llvmVectorType); - assert(lvt != NULL); - std::vector vals; - for (unsigned int i = 0; i < lvt->getNumElements(); ++i) - vals.push_back(constElement); - return llvm::ConstantVector::get(vals); + else if (pointerType != NULL) { + assert(value == 0); + if (pointerType->IsUniformType()) + return llvm::Constant::getNullValue(LLVMTypes::VoidPointerType); + else + return llvm::Constant::getNullValue(LLVMTypes::VoidPointerVectorType); } else { - LLVM_TYPE_CONST llvm::ArrayType *lat = - llvm::dyn_cast(llvmVectorType); - assert(lat != NULL); - std::vector vals; - for (unsigned int i = 0; i < lat->getNumElements(); ++i) - vals.push_back(constElement); - return llvm::ConstantArray::get(lat, vals); + // For vector types, first get the LLVM constant for the basetype with + // a recursive call to lLLVMConstantValue(). + const Type *baseType = vectorType->GetBaseType(); + llvm::Constant *constElement = lLLVMConstantValue(baseType, ctx, value); + LLVM_TYPE_CONST llvm::Type *llvmVectorType = vectorType->LLVMType(ctx); + + // Now create a constant version of the corresponding LLVM type that we + // use to represent the VectorType. + // FIXME: this is a little ugly in that the fact that ispc represents + // uniform VectorTypes as LLVM VectorTypes and varying VectorTypes as + // LLVM ArrayTypes leaks into the code here; it feels like this detail + // should be better encapsulated? + if (baseType->IsUniformType()) { + LLVM_TYPE_CONST llvm::VectorType *lvt = + llvm::dyn_cast(llvmVectorType); + assert(lvt != NULL); + std::vector vals; + for (unsigned int i = 0; i < lvt->getNumElements(); ++i) + vals.push_back(constElement); + return llvm::ConstantVector::get(vals); + } + else { + LLVM_TYPE_CONST llvm::ArrayType *lat = + llvm::dyn_cast(llvmVectorType); + assert(lat != NULL); + std::vector vals; + for (unsigned int i = 0; i < lat->getNumElements(); ++i) + vals.push_back(constElement); + return llvm::ConstantArray::get(lat, vals); + } } } static llvm::Value * lMaskForSymbol(Symbol *baseSym, FunctionEmitContext *ctx) { + if (dynamic_cast(baseSym->type) != NULL) + // FIXME: we really only want to do this for dereferencing the + // pointer, not for things like pointer arithmetic, when we may be + // able to use the internal mask, depending on context... 
+ return ctx->GetFullMask(); + llvm::Value *mask = (baseSym->parentFunction == ctx->GetFunction() && baseSym->storageClass != SC_STATIC) ? ctx->GetInternalMask() : ctx->GetFullMask(); @@ -546,14 +601,15 @@ lMaskForSymbol(Symbol *baseSym, FunctionEmitContext *ctx) { /** Store the result of an assignment to the given location. */ static void -lStoreAssignResult(llvm::Value *rv, llvm::Value *lv, const Type *type, +lStoreAssignResult(llvm::Value *value, llvm::Value *ptr, const Type *ptrType, FunctionEmitContext *ctx, Symbol *baseSym) { assert(baseSym != NULL && baseSym->varyingCFDepth <= ctx->VaryingCFDepth()); if (!g->opt.disableMaskedStoreToStore && baseSym->varyingCFDepth == ctx->VaryingCFDepth() && baseSym->storageClass != SC_STATIC && - dynamic_cast(baseSym->type) == NULL) { + dynamic_cast(baseSym->type) == NULL && + dynamic_cast(baseSym->type) == NULL) { // If the variable is declared at the same varying control flow // depth as where it's being assigned, then we don't need to do any // masking but can just do the assignment as if all the lanes were @@ -562,10 +618,10 @@ lStoreAssignResult(llvm::Value *rv, llvm::Value *lv, const Type *type, // never be accessed, since those lanes aren't executing, and won't // be executing at this scope or any other one before the variable // goes out of scope. - ctx->StoreInst(rv, lv, LLVMMaskAllOn, type); + ctx->StoreInst(value, ptr, LLVMMaskAllOn, ptrType); } else { - ctx->StoreInst(rv, lv, lMaskForSymbol(baseSym, ctx), type); + ctx->StoreInst(value, ptr, lMaskForSymbol(baseSym, ctx), ptrType); } } @@ -577,10 +633,14 @@ static llvm::Value * lEmitPrePostIncDec(UnaryExpr::Op op, Expr *expr, SourcePos pos, FunctionEmitContext *ctx) { const Type *type = expr->GetType(); + if (type == NULL) + return NULL; // Get both the lvalue and the rvalue of the given expression llvm::Value *lvalue = NULL, *rvalue = NULL; + const Type *lvalueType = NULL; if (dynamic_cast(type) != NULL) { + lvalueType = type; type = type->GetReferenceTarget(); lvalue = expr->GetValue(ctx); @@ -589,14 +649,17 @@ lEmitPrePostIncDec(UnaryExpr::Op op, Expr *expr, SourcePos pos, } else { lvalue = expr->GetLValue(ctx); + lvalueType = expr->GetLValueType(); rvalue = expr->GetValue(ctx); } if (lvalue == NULL) { // If we can't get a lvalue, then we have an error here - Error(pos, "Can't %s-%s non-lvalues.", - (op == UnaryExpr::PreInc || op == UnaryExpr::PreDec) ? "pre" : "post", - (op == UnaryExpr::PreInc || op == UnaryExpr::PostInc) ? "increment" : "decrement"); + const char *prepost = (op == UnaryExpr::PreInc || + op == UnaryExpr::PreDec) ? "pre" : "post"; + const char *incdec = (op == UnaryExpr::PreInc || + op == UnaryExpr::PostInc) ? "increment" : "decrement"; + Error(pos, "Can't %s-%s non-lvalues.", prepost, incdec); return NULL; } @@ -605,25 +668,35 @@ lEmitPrePostIncDec(UnaryExpr::Op op, Expr *expr, SourcePos pos, ctx->SetDebugPos(pos); llvm::Value *binop = NULL; int delta = (op == UnaryExpr::PreInc || op == UnaryExpr::PostInc) ? 1 : -1; - llvm::Constant *dval = lLLVMConstantValue(type, g->ctx, delta); - if (!type->IsFloatType()) - binop = ctx->BinaryOperator(llvm::Instruction::Add, rvalue, - dval, "val_inc_or_dec"); - else - binop = ctx->BinaryOperator(llvm::Instruction::FAdd, rvalue, - dval, "val_inc_or_dec"); + + if (dynamic_cast(type) != NULL) { + const Type *incType = type->IsUniformType() ? 
AtomicType::UniformInt32 : + AtomicType::VaryingInt32; + llvm::Constant *dval = lLLVMConstantValue(incType, g->ctx, delta); + binop = ctx->GetElementPtrInst(rvalue, dval, type, "ptr_inc_or_dec"); + } + else { + llvm::Constant *dval = lLLVMConstantValue(type, g->ctx, delta); + if (type->IsFloatType()) + binop = ctx->BinaryOperator(llvm::Instruction::FAdd, rvalue, + dval, "val_inc_or_dec"); + else + binop = ctx->BinaryOperator(llvm::Instruction::Add, rvalue, + dval, "val_inc_or_dec"); + } #if 0 if (type->IsUniformType()) { if (ctx->VaryingCFDepth() > 0) Warning(expr->pos, - "Modifying \"uniform\" value under \"varying\" control flow. Beware."); + "Modifying \"uniform\" value under \"varying\" control " + "flow."); } #endif // And store the result out to the lvalue Symbol *baseSym = expr->GetBaseSymbol(); - lStoreAssignResult(binop, lvalue, type, ctx, baseSym); + lStoreAssignResult(binop, lvalue, lvalueType, ctx, baseSym); // And then if it's a pre increment/decrement, return the final // computed result; otherwise return the previously-grabbed expression @@ -646,10 +719,12 @@ lEmitNegate(Expr *arg, SourcePos pos, FunctionEmitContext *ctx) { llvm::Value *zero = lLLVMConstantValue(type, g->ctx, 0.); ctx->SetDebugPos(pos); if (type->IsFloatType()) - return ctx->BinaryOperator(llvm::Instruction::FSub, zero, argVal, "fnegate"); + return ctx->BinaryOperator(llvm::Instruction::FSub, zero, argVal, + "fnegate"); else { assert(type->IsIntType()); - return ctx->BinaryOperator(llvm::Instruction::Sub, zero, argVal, "inegate"); + return ctx->BinaryOperator(llvm::Instruction::Sub, zero, argVal, + "inegate"); } } @@ -755,10 +830,10 @@ UnaryExpr::Optimize() { // An error will be issued elsewhere... return this; case Negate: { - // Since we currently only handle int32 and floats here, it's safe - // to stuff whatever we have into a double, do the negate as a - // double, and then return a ConstExpr with the same type as the - // original... + // Since we currently only handle int32, floats, and doubles here, + // it's safe to stuff whatever we have into a double, do the negate + // as a double, and then return a ConstExpr with the same type as + // the original... double v[ISPC_MAX_NVEC]; int count = constExpr->AsDouble(v); for (int i = 0; i < count; ++i) @@ -821,11 +896,27 @@ UnaryExpr::TypeCheck() { return NULL; if (op == PreInc || op == PreDec || op == PostInc || op == PostDec) { - if (!type->IsNumericType()) { - Error(expr->pos, "Can only pre/post increment float and integer " - "types, not \"%s\".", type->GetString().c_str()); + if (type->IsConstType()) { + Error(pos, "Can't assign to type \"%s\" on left-hand side of " + "expression.", type->GetString().c_str()); return NULL; } + + if (type->IsNumericType()) + return this; + + if (dynamic_cast(type) == NULL) { + Error(expr->pos, "Can only pre/post increment numeric and " + "pointer types, not \"%s\".", type->GetString().c_str()); + return NULL; + } + + if (PointerType::IsVoidPointer(type)) { + Error(expr->pos, "Illegal to pre/post increment \"%s\" type.", + type->GetString().c_str()); + return NULL; + } + return this; } @@ -950,42 +1041,110 @@ lEmitBinaryBitOp(BinaryExpr::Op op, llvm::Value *arg0Val, BinaryExpr::Op. 
*/ static llvm::Value * -lEmitBinaryArith(BinaryExpr::Op op, llvm::Value *e0Val, llvm::Value *e1Val, - const Type *type, FunctionEmitContext *ctx, SourcePos pos) { - llvm::Instruction::BinaryOps inst; - bool isFloatOp = type->IsFloatType(); - bool isUnsignedOp = type->IsUnsignedType(); +lEmitBinaryArith(BinaryExpr::Op op, llvm::Value *value0, llvm::Value *value1, + const Type *type0, const Type *type1, + FunctionEmitContext *ctx, SourcePos pos) { + const PointerType *ptrType = dynamic_cast(type0); - switch (op) { - case BinaryExpr::Add: - inst = isFloatOp ? llvm::Instruction::FAdd : llvm::Instruction::Add; - break; - case BinaryExpr::Sub: - inst = isFloatOp ? llvm::Instruction::FSub : llvm::Instruction::Sub; - break; - case BinaryExpr::Mul: - inst = isFloatOp ? llvm::Instruction::FMul : llvm::Instruction::Mul; - break; - case BinaryExpr::Div: - if (type->IsVaryingType() && !isFloatOp) - PerformanceWarning(pos, "Division with varying integer types is " - "very inefficient."); - inst = isFloatOp ? llvm::Instruction::FDiv : - (isUnsignedOp ? llvm::Instruction::UDiv : llvm::Instruction::SDiv); - break; - case BinaryExpr::Mod: - if (type->IsVaryingType() && !isFloatOp) - PerformanceWarning(pos, "Modulus operator with varying types is " - "very inefficient."); - inst = isFloatOp ? llvm::Instruction::FRem : - (isUnsignedOp ? llvm::Instruction::URem : llvm::Instruction::SRem); - break; - default: - FATAL("Invalid op type passed to lEmitBinaryArith()"); - return NULL; + if (ptrType != NULL) { + switch (op) { + case BinaryExpr::Add: + // ptr + integer + return ctx->GetElementPtrInst(value0, value1, ptrType, "ptrmath"); + break; + case BinaryExpr::Sub: { + if (dynamic_cast(type1) != NULL) { + // ptr - ptr + if (ptrType->IsUniformType()) { + value0 = ctx->PtrToIntInst(value0); + value1 = ctx->PtrToIntInst(value1); + } + + // Compute the difference in bytes + llvm::Value *delta = + ctx->BinaryOperator(llvm::Instruction::Sub, value0, value1, + "ptr_diff"); + + // Now divide by the size of the type that the pointer + // points to in order to return the difference in elements. + LLVM_TYPE_CONST llvm::Type *llvmElementType = + ptrType->GetBaseType()->LLVMType(g->ctx); + llvm::Value *size = g->target.SizeOf(llvmElementType); + if (ptrType->IsVaryingType()) + size = ctx->SmearUniform(size); + + if (g->target.is32Bit == false && + g->opt.force32BitAddressing == true) { + // If we're doing 32-bit addressing math on a 64-bit + // target, then trunc the delta down to a 32-bit value. + // (Thus also matching what will be a 32-bit value + // returned from SizeOf above.) 
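At the language level, the subtraction lowered here behaves like C pointer subtraction: the byte difference is divided by the size of the pointed-to type, so the result counts elements. A sketch (elementsBetween is an illustrative name); with --opt=32-bit-addressing on a 64-bit target, the division happens on the truncated 32-bit delta as described above:

static uniform int elementsBetween(uniform float * uniform begin,
                                   uniform float * uniform end) {
    // Computed internally as (byte delta) / sizeof(float).
    return (uniform int)(end - begin);
}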
+ if (ptrType->IsUniformType()) + delta = ctx->TruncInst(delta, LLVMTypes::Int32Type, + "trunc_ptr_delta"); + else + delta = ctx->TruncInst(delta, LLVMTypes::Int32VectorType, + "trunc_ptr_delta"); + } + + // And now do the actual division + return ctx->BinaryOperator(llvm::Instruction::SDiv, delta, size, + "element_diff"); + } + else { + // ptr - integer + llvm::Value *zero = lLLVMConstantValue(type1, g->ctx, 0.); + llvm::Value *negOffset = + ctx->BinaryOperator(llvm::Instruction::Sub, zero, value1, + "negate"); + // Do a GEP as ptr + -integer + return ctx->GetElementPtrInst(value0, negOffset, ptrType, + "ptrmath"); + } + } + default: + FATAL("Logic error in lEmitBinaryArith() for pointer type case"); + return NULL; + } } + else { + assert(Type::EqualIgnoringConst(type0, type1)); - return ctx->BinaryOperator(inst, e0Val, e1Val, "binop"); + llvm::Instruction::BinaryOps inst; + bool isFloatOp = type0->IsFloatType(); + bool isUnsignedOp = type0->IsUnsignedType(); + + switch (op) { + case BinaryExpr::Add: + inst = isFloatOp ? llvm::Instruction::FAdd : llvm::Instruction::Add; + break; + case BinaryExpr::Sub: + inst = isFloatOp ? llvm::Instruction::FSub : llvm::Instruction::Sub; + break; + case BinaryExpr::Mul: + inst = isFloatOp ? llvm::Instruction::FMul : llvm::Instruction::Mul; + break; + case BinaryExpr::Div: + if (type0->IsVaryingType() && !isFloatOp) + PerformanceWarning(pos, "Division with varying integer types is " + "very inefficient."); + inst = isFloatOp ? llvm::Instruction::FDiv : + (isUnsignedOp ? llvm::Instruction::UDiv : llvm::Instruction::SDiv); + break; + case BinaryExpr::Mod: + if (type0->IsVaryingType() && !isFloatOp) + PerformanceWarning(pos, "Modulus operator with varying types is " + "very inefficient."); + inst = isFloatOp ? llvm::Instruction::FRem : + (isUnsignedOp ? 
llvm::Instruction::URem : llvm::Instruction::SRem); + break; + default: + FATAL("Invalid op type passed to lEmitBinaryArith()"); + return NULL; + } + + return ctx->BinaryOperator(inst, value0, value1, "binop"); + } } @@ -1052,8 +1211,8 @@ BinaryExpr::GetValue(FunctionEmitContext *ctx) const { if (!arg0 || !arg1) return NULL; - llvm::Value *e0Val = arg0->GetValue(ctx); - llvm::Value *e1Val = arg1->GetValue(ctx); + llvm::Value *value0 = arg0->GetValue(ctx); + llvm::Value *value1 = arg1->GetValue(ctx); ctx->SetDebugPos(pos); switch (op) { @@ -1062,14 +1221,15 @@ BinaryExpr::GetValue(FunctionEmitContext *ctx) const { case Mul: case Div: case Mod: - return lEmitBinaryArith(op, e0Val, e1Val, arg0->GetType(), ctx, pos); + return lEmitBinaryArith(op, value0, value1, arg0->GetType(), arg1->GetType(), + ctx, pos); case Lt: case Gt: case Le: case Ge: case Equal: case NotEqual: - return lEmitBinaryCmp(op, e0Val, e1Val, arg0->GetType(), ctx, pos); + return lEmitBinaryCmp(op, value0, value1, arg0->GetType(), ctx, pos); case Shl: case Shr: case BitAnd: @@ -1079,17 +1239,17 @@ BinaryExpr::GetValue(FunctionEmitContext *ctx) const { dynamic_cast(arg1) == NULL) PerformanceWarning(pos, "Shift right is extremely inefficient for " "varying shift amounts."); - return lEmitBinaryBitOp(op, e0Val, e1Val, + return lEmitBinaryBitOp(op, value0, value1, arg0->GetType()->IsUnsignedType(), ctx); } case LogicalAnd: - return ctx->BinaryOperator(llvm::Instruction::And, e0Val, e1Val, + return ctx->BinaryOperator(llvm::Instruction::And, value0, value1, "logical_and"); case LogicalOr: - return ctx->BinaryOperator(llvm::Instruction::Or, e0Val, e1Val, + return ctx->BinaryOperator(llvm::Instruction::Or, value0, value1, "logical_or"); case Comma: - return e1Val; + return value1; default: FATAL("logic error"); return NULL; @@ -1106,28 +1266,42 @@ BinaryExpr::GetType() const { if (type0 == NULL || type1 == NULL) return NULL; -#if 0 - // FIXME: I think these are redundant given the checks in - // BinaryExpr::TypeCheck(). They should either be removed or updated - // to handle the cases where pointer == and != tests are ok. - if (!type0->IsBoolType() && !type0->IsNumericType()) { - Error(arg0->pos, "First operand to binary operator \"%s\" is of invalid " - "type \"%s\".", lOpString(op), type0->GetString().c_str()); - return NULL; - } - if (!type1->IsBoolType() && !type1->IsNumericType()) { - Error(arg1->pos, - "Second operand to binary operator \"%s\" is of invalid " - "type \"%s\".", lOpString(op), type1->GetString().c_str()); - return NULL; - } -#endif + // If this hits, it means that our TypeCheck() method hasn't been + // called before GetType() was called; adding two pointers is illegal + // and will fail type checking and (int + ptr) should be canonicalized + // into (ptr + int) by type checking. + if (op == Add) + assert(dynamic_cast(type1) == NULL); - const Type *promotedType = Type::MoreGeneralType(type0, type1, pos, - lOpString(op)); + if (op == Comma) + return arg1->GetType(); + + if (dynamic_cast(type0) != NULL) { + if (op == Add) + // ptr + int -> ptr + return type0; + else if (op == Sub) { + if (dynamic_cast(type1) != NULL) { + // ptr - ptr -> ~ptrdiff_t + const Type *diffType = (g->target.is32Bit || + g->opt.force32BitAddressing) ? + AtomicType::UniformInt32 : AtomicType::UniformInt64; + if (type0->IsVaryingType() || type1->IsVaryingType()) + diffType = diffType->GetAsVaryingType(); + return diffType; + } + else + // ptr - int -> ptr + return type0; + } + // otherwise fall through for these two... 
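The fall-through noted above is for == and !=, the only other binary operators that accept pointer operands; they compare addresses and yield a bool of matching variability. An illustrative sketch (samePlace is not part of the patch):

static inline uniform bool samePlace(uniform float * uniform a,
                                     uniform float * uniform b) {
    return a == b;   // uniform operands give a uniform bool result
}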
+ assert(op == Equal || op == NotEqual); + } + + const Type *exprType = Type::MoreGeneralType(type0, type1, pos, lOpString(op)); // I don't think that MoreGeneralType should be able to fail after the - // type checks above. - assert(promotedType != NULL); + // checks done in BinaryExpr::TypeCheck(). + assert(exprType != NULL); switch (op) { case Add: @@ -1135,7 +1309,7 @@ BinaryExpr::GetType() const { case Mul: case Div: case Mod: - return promotedType; + return exprType; case Lt: case Gt: case Le: @@ -1144,16 +1318,16 @@ BinaryExpr::GetType() const { case NotEqual: case LogicalAnd: case LogicalOr: - return lMatchingBoolType(promotedType); + return lMatchingBoolType(exprType); case Shl: case Shr: return type1->IsVaryingType() ? type0->GetAsVaryingType() : type0; case BitAnd: case BitXor: case BitOr: - return promotedType; + return exprType; case Comma: - return arg1->GetType(); + // handled above, so fall through here just in case default: FATAL("logic error in BinaryExpr::GetType()"); return NULL; @@ -1341,8 +1515,7 @@ BinaryExpr::Optimize() { if (constArg0 == NULL || constArg1 == NULL) return this; - assert(Type::Equal(arg0->GetType()->GetAsNonConstType(), - arg1->GetType()->GetAsNonConstType())); + assert(Type::EqualIgnoringConst(arg0->GetType(), arg1->GetType())); const Type *type = arg0->GetType()->GetAsNonConstType(); if (type == AtomicType::UniformFloat || type == AtomicType::VaryingFloat) { float v0[ISPC_MAX_NVEC], v1[ISPC_MAX_NVEC]; @@ -1439,6 +1612,70 @@ BinaryExpr::TypeCheck() { assert(type1 != NULL); } + const PointerType *pt0 = dynamic_cast(type0); + const PointerType *pt1 = dynamic_cast(type1); + if (pt0 != NULL && pt1 != NULL && op == Sub) { + if (PointerType::IsVoidPointer(type0)) { + Error(pos, "Illegal to perform pointer arithmetic " + "on \"%s\" type.", type0->GetString().c_str()); + return NULL; + } + if (PointerType::IsVoidPointer(type1)) { + Error(pos, "Illegal to perform pointer arithmetic " + "on \"%s\" type.", type1->GetString().c_str()); + return NULL; + } + + const Type *t = Type::MoreGeneralType(type0, type1, pos, "-"); + if (t == NULL) + return NULL; + arg0 = TypeConvertExpr(arg0, t, "pointer subtraction"); + arg1 = TypeConvertExpr(arg1, t, "pointer subtraction"); + if (arg0 == NULL || arg1 == NULL) + return NULL; + + return this; + } + else if (((pt0 != NULL || pt1 != NULL) && op == Add) || + (pt0 != NULL && op == Sub)) { + // Handle ptr + int, int + ptr, ptr - int + if (pt0 != NULL && pt1 != NULL) { + Error(pos, "Illegal to add two pointer types \"%s\" and \"%s\".", + pt0->GetString().c_str(), pt1->GetString().c_str()); + return NULL; + } + else if (pt1 != NULL) { + // put in canonical order with the pointer as the first operand + // for GetValue() + std::swap(arg0, arg1); + std::swap(pt0, pt1); + } + + assert(pt0 != NULL); + + if (PointerType::IsVoidPointer(pt0)) { + Error(pos, "Illegal to perform pointer arithmetic " + "on \"%s\" type.", pt0->GetString().c_str()); + return NULL; + } + + const Type *offsetType = g->target.is32Bit ? 
+ AtomicType::UniformInt32 : AtomicType::UniformInt64; + if (pt0->IsVaryingType()) + offsetType = offsetType->GetAsVaryingType(); + if (type1->IsVaryingType()) { + arg0 = TypeConvertExpr(arg0, type0->GetAsVaryingType(), + "pointer addition"); + assert(arg0 != NULL); + } + + arg1 = TypeConvertExpr(arg1, offsetType, lOpString(op)); + if (arg1 == NULL) + return NULL; + + return this; + } + switch (op) { case Shl: case Shr: @@ -1618,6 +1855,26 @@ BinaryExpr::Print() const { /////////////////////////////////////////////////////////////////////////// // AssignExpr +static const char * +lOpString(AssignExpr::Op op) { + switch (op) { + case AssignExpr::Assign: return "="; + case AssignExpr::MulAssign: return "*="; + case AssignExpr::DivAssign: return "/="; + case AssignExpr::ModAssign: return "%%="; + case AssignExpr::AddAssign: return "+="; + case AssignExpr::SubAssign: return "-="; + case AssignExpr::ShlAssign: return "<<="; + case AssignExpr::ShrAssign: return ">>="; + case AssignExpr::AndAssign: return "&="; + case AssignExpr::XorAssign: return "^="; + case AssignExpr::OrAssign: return "|="; + default: + FATAL("Missing op in lOpstring"); + return ""; + } +} + /** Emit code to do an "assignment + operation" operator, e.g. "+=". */ static llvm::Value * @@ -1630,13 +1887,16 @@ lEmitOpAssign(AssignExpr::Op op, Expr *arg0, Expr *arg1, const Type *type, Error(pos, "Can't assign to left-hand side of expression."); return NULL; } + const Type *lvalueType = arg0->GetLValueType(); + if (lvalueType == NULL) + return NULL; // Get the value on the right-hand side of the assignment+operation // operator and load the current value on the left-hand side. llvm::Value *rvalue = arg1->GetValue(ctx); ctx->SetDebugPos(pos); llvm::Value *mask = lMaskForSymbol(baseSym, ctx); - llvm::Value *oldLHS = ctx->LoadInst(lv, mask, type, "opassign_load"); + llvm::Value *oldLHS = ctx->LoadInst(lv, mask, lvalueType, "opassign_load"); // Map the operator to the corresponding BinaryExpr::Op operator BinaryExpr::Op basicop; @@ -1664,7 +1924,8 @@ lEmitOpAssign(AssignExpr::Op op, Expr *arg0, Expr *arg1, const Type *type, case AssignExpr::ModAssign: case AssignExpr::AddAssign: case AssignExpr::SubAssign: - newValue = lEmitBinaryArith(basicop, oldLHS, rvalue, type, ctx, pos); + newValue = lEmitBinaryArith(basicop, oldLHS, rvalue, type, + arg1->GetType(), ctx, pos); break; case AssignExpr::ShlAssign: case AssignExpr::ShrAssign: @@ -1680,7 +1941,7 @@ lEmitOpAssign(AssignExpr::Op op, Expr *arg0, Expr *arg1, const Type *type, } // And store the result back to the lvalue. - lStoreAssignResult(newValue, lv, type, ctx, baseSym); + lStoreAssignResult(newValue, lv, lvalueType, ctx, baseSym); return newValue; } @@ -1703,7 +1964,7 @@ AssignExpr::GetValue(FunctionEmitContext *ctx) const { #if 0 if (ctx->VaryingCFDepth() > 0 && type->IsUniformType()) - Warning(pos, "Modifying \"uniform\" value under \"varying\" control flow. Beware."); + Warning(pos, "Modifying \"uniform\" value under \"varying\" control flow."); #endif Symbol *baseSym = lvalue->GetBaseSymbol(); @@ -1717,15 +1978,21 @@ AssignExpr::GetValue(FunctionEmitContext *ctx) const { switch (op) { case Assign: { llvm::Value *lv = lvalue->GetLValue(ctx); - if (!lv) { - // FIXME: another, I believe, now unnecessary test? 
- Error(lvalue->pos, "Can't assign to left-hand side of expression."); + if (lv == NULL) { + assert(m->errorCount > 0); + return NULL; + } + const Type *lvalueType = lvalue->GetLValueType(); + if (lvalueType == NULL) { + assert(m->errorCount > 0); return NULL; } llvm::Value *rv = rvalue->GetValue(ctx); - if (rv == NULL) + if (rv == NULL) { + assert(m->errorCount > 0); return NULL; + } ctx->SetDebugPos(pos); @@ -1750,7 +2017,7 @@ AssignExpr::GetValue(FunctionEmitContext *ctx) const { } #endif - lStoreAssignResult(rv, lv, type, ctx, baseSym); + lStoreAssignResult(rv, lv, lvalueType, ctx, baseSym); return rv; } @@ -1795,6 +2062,37 @@ AssignExpr::GetType() const { } +/** Recursively checks a structure type to see if it (or any struct type + that it holds) has a const-qualified member. */ +static bool +lCheckForConstStructMember(SourcePos pos, const StructType *structType, + const StructType *initialType) { + for (int i = 0; i < structType->GetElementCount(); ++i) { + const Type *t = structType->GetElementType(i); + if (t->IsConstType()) { + if (structType == initialType) + Error(pos, "Illegal to assign to type \"%s\" due to element " + "\"%s\" with type \"%s\".", structType->GetString().c_str(), + structType->GetElementName(i).c_str(), + t->GetString().c_str()); + else + Error(pos, "Illegal to assign to type \"%s\" in type \"%s\" " + "due to element \"%s\" with type \"%s\".", + structType->GetString().c_str(), + initialType->GetString().c_str(), + structType->GetElementName(i).c_str(), + t->GetString().c_str()); + return true; + } + + const StructType *st = dynamic_cast(t); + if (st != NULL && lCheckForConstStructMember(pos, st, initialType)) + return true; + } + return false; +} + + Expr * AssignExpr::TypeCheck() { if (lvalue != NULL) @@ -1806,13 +2104,7 @@ AssignExpr::TypeCheck() { bool lvalueIsReference = dynamic_cast(lvalue->GetType()) != NULL; - bool rvalueIsReference = - dynamic_cast(rvalue->GetType()) != NULL; - - // hack to allow asigning array references e.g. in a struct... - if (lvalueIsReference && - !(rvalueIsReference && - dynamic_cast(rvalue->GetType()->GetReferenceTarget()))) + if (lvalueIsReference) lvalue = new DereferenceExpr(lvalue, lvalue->pos); FunctionSymbolExpr *fse; @@ -1828,23 +2120,58 @@ AssignExpr::TypeCheck() { lvalue->GetType()->GetString().c_str()); return NULL; } - if (!fse->ResolveOverloads(ftype->GetArgumentTypes())) { + + std::vector paramTypes; + for (int i = 0; i < ftype->GetNumParameters(); ++i) + paramTypes.push_back(ftype->GetParameterType(i)); + + if (!fse->ResolveOverloads(paramTypes)) { Error(pos, "Unable to find overloaded function for function " "pointer assignment."); return NULL; } } - rvalue = TypeConvertExpr(rvalue, lvalue->GetType(), "assignment"); + const Type *lhsType = lvalue->GetType(); + if (dynamic_cast(lhsType) != NULL) { + if (op == AddAssign || op == SubAssign) { + if (PointerType::IsVoidPointer(lhsType)) { + Error(pos, "Illegal to perform pointer arithmetic on \"%s\" " + "type.", lhsType->GetString().c_str()); + return NULL; + } + + const Type *deltaType = g->target.is32Bit ? 
AtomicType::UniformInt32 : + AtomicType::UniformInt64; + if (lhsType->IsVaryingType()) + deltaType = deltaType->GetAsVaryingType(); + rvalue = TypeConvertExpr(rvalue, deltaType, lOpString(op)); + } + else if (op == Assign) + rvalue = TypeConvertExpr(rvalue, lhsType, "assignment"); + else { + Error(pos, "Assignment operator \"%s\" is illegal with pointer types.", + lOpString(op)); + return NULL; + } + } + else + rvalue = TypeConvertExpr(rvalue, lhsType, lOpString(op)); + if (rvalue == NULL) return NULL; - if (lvalue->GetType()->IsConstType()) { - Error(pos, "Can't assign to type \"%s\" on left-hand size of " - "expression.", lvalue->GetType()->GetString().c_str()); + if (lhsType->IsConstType()) { + Error(pos, "Can't assign to type \"%s\" on left-hand side of " + "expression.", lhsType->GetString().c_str()); return NULL; } + // Make sure we're not assigning to a struct that has a constant member + const StructType *st = dynamic_cast(lhsType); + if (st != NULL && lCheckForConstStructMember(pos, st, st)) + return NULL; + return this; } @@ -1870,19 +2197,7 @@ AssignExpr::Print() const { printf("[%s] assign (", GetType()->GetString().c_str()); lvalue->Print(); - printf(" "); - if (op == Assign) printf("="); - if (op == MulAssign) printf("*="); - if (op == DivAssign) printf("/="); - if (op == ModAssign) printf("%%="); - if (op == AddAssign) printf("+="); - if (op == SubAssign) printf("-="); - if (op == ShlAssign) printf("<<="); - if (op == ShrAssign) printf(">>="); - if (op == AndAssign) printf("&="); - if (op == XorAssign) printf("^="); - if (op == OrAssign) printf("|="); - printf(" "); + printf(" %s ", lOpString(op)); rvalue->Print(); printf(")"); pos.Print(); @@ -1911,8 +2226,10 @@ lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, // Don't need to worry about masking here ctx->StoreInst(expr2, resultPtr); // Use masking to conditionally store the expr1 values - ctx->StoreInst(expr1, resultPtr, test, type); - return ctx->LoadInst(resultPtr, LLVMMaskAllOn, type, "selectexpr_final"); + assert(resultPtr->getType() == + PointerType::GetUniform(type)->LLVMType(g->ctx)); + ctx->StoreInst(expr1, resultPtr, test, PointerType::GetUniform(type)); + return ctx->LoadInst(resultPtr, "selectexpr_final"); } @@ -1996,15 +2313,15 @@ SelectExpr::GetValue(FunctionEmitContext *ctx) const { // Do an element-wise select llvm::Value *result = llvm::UndefValue::get(type->LLVMType(g->ctx)); for (int i = 0; i < vt->GetElementCount(); ++i) { - llvm::Value *ti = ctx->ExtractInst(testVal, i, ""); - llvm::Value *e1i = ctx->ExtractInst(expr1Val, i, ""); - llvm::Value *e2i = ctx->ExtractInst(expr2Val, i, ""); + llvm::Value *ti = ctx->ExtractInst(testVal, i); + llvm::Value *e1i = ctx->ExtractInst(expr1Val, i); + llvm::Value *e2i = ctx->ExtractInst(expr2Val, i); llvm::Value *sel = NULL; if (testType->IsUniformType()) sel = ctx->SelectInst(ti, e1i, e2i); else sel = lEmitVaryingSelect(ctx, ti, e1i, e2i, vt->GetElementType()); - result = ctx->InsertInst(result, sel, i, ""); + result = ctx->InsertInst(result, sel, i); } return result; } @@ -2181,41 +2498,27 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const { // GetLValue call below needs a FunctionEmitContext, which is // problematic...) std::vector callargs = args->exprs; - const std::vector &argTypes = ft->GetArgumentTypes(); bool err = false; + + // Specifically, this can happen if there's an error earlier during + // overload resolution. 
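Rounding out the AssignExpr changes above: on a pointer left-hand side only =, +=, and -= are accepted, with the integer right-hand side converted to the 32- or 64-bit delta type chosen for the target; other compound assignments on pointers are rejected at type-check time. A sketch (advance is an illustrative name):

static uniform float * uniform advance(uniform float * uniform p,
                                       uniform int n) {
    p += n;    // pointer += integer: legal
    p -= 1;    // pointer -= integer: legal
    return p;  // by contrast, p *= n would be a type-check error
}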
+ if ((int)callargs.size() > ft->GetNumParameters()) { + assert(m->errorCount > 0); + return NULL; + } + for (unsigned int i = 0; i < callargs.size(); ++i) { Expr *argExpr = callargs[i]; if (argExpr == NULL) continue; - // All arrays should already have been converted to reference types - assert(dynamic_cast(argTypes[i]) == NULL); - - if (dynamic_cast(argTypes[i])) { - if (!dynamic_cast(argExpr->GetType())) { - // The function wants a reference type but the argument - // being passed isn't already a reference. - if (argExpr->GetLValue(ctx) == NULL) { - // If it doesn't have an lvalue, then we can't make it - // a reference, so issue an error. - // FIXME: for const reference parameters, we could - // store the expr's value to alloca'ed memory and then - // pass a reference to that... - Error(pos, "Can't pass non-lvalue as \"reference\" parameter \"%s\" " - "to function.", ft->GetArgumentName(i).c_str()); - err = true; - } - else - argExpr = new ReferenceExpr(argExpr, argExpr->pos); - } - } + const Type *paramType = ft->GetParameterType(i); // Do whatever type conversion is needed - argExpr = TypeConvertExpr(argExpr, argTypes[i], + argExpr = TypeConvertExpr(argExpr, paramType, "function call argument"); - // The function overload resolution code should have ensured that - // we can successfully do any type conversions needed here. - assert(argExpr != NULL); + if (argExpr == NULL) + return NULL; callargs[i] = argExpr; } if (err) @@ -2223,61 +2526,33 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const { // Fill in any default argument values needed. // FIXME: should we do this during type checking? - const std::vector &argumentDefaults = ft->GetArgumentDefaults(); - for (unsigned int i = callargs.size(); i < argumentDefaults.size(); ++i) { - Expr * d = TypeConvertExpr(argumentDefaults[i], argTypes[i], - "function call default argument"); + for (int i = callargs.size(); i < ft->GetNumParameters(); ++i) { + Expr *paramDefault = ft->GetParameterDefault(i); + const Type *paramType = ft->GetParameterType(i); + // FIXME: this type conv should happen when we create the function + // type! + Expr *d = TypeConvertExpr(paramDefault, paramType, + "function call default argument"); if (d == NULL) return NULL; callargs.push_back(d); } - // Now evaluate the values of all of the parameters being passed. We - // need to evaluate these first here, since their GetValue() calls may - // change the current basic block (e.g. if one of these is itself a - // function call expr...); we need to basic blocks to stay consistent - // below when we emit the code that does the actual funciton call. + // Now evaluate the values of all of the parameters being passed. std::vector argVals; - std::vector storedArgValPtrs, argValLValues; for (unsigned int i = 0; i < callargs.size(); ++i) { Expr *argExpr = callargs[i]; - if (!argExpr) + if (argExpr == NULL) // give up; we hit an error earlier return NULL; llvm::Value *argValue = argExpr->GetValue(ctx); - if (!argValue) + if (argValue == NULL) // something went wrong in evaluating the argument's // expression, so give up on this return NULL; - if (dynamic_cast(argTypes[i]) && - !llvm::isa(argValue->getType())) { - assert(llvm::isa(argValue->getType())); - // if the parameter is a reference and the lvalue needs a - // gather to pull it together, then do the gather here and - // store the result to local memory, so that we can pass the - // single pointer to the local memory that is needed for the - // reference. 
Below, we'll copy the result back to the varying - lvalue pointer we have here. (== pass by value/result) - const ReferenceType *rt = - dynamic_cast(argExpr->GetType()); - assert(rt != NULL); - const Type *type = rt->GetReferenceTarget(); - - llvm::Value *ptr = ctx->AllocaInst(type->LLVMType(g->ctx), "arg"); - llvm::Value *mask = lMaskForSymbol(argExpr->GetBaseSymbol(), ctx); - llvm::Value *val = ctx->LoadInst(argValue, mask, type); - ctx->StoreInst(val, ptr); - storedArgValPtrs.push_back(ptr); - argValLValues.push_back(argValue); - argVals.push_back(ptr); - } - else { - argVals.push_back(argValue); - storedArgValPtrs.push_back(NULL); - argValLValues.push_back(NULL); - } + argVals.push_back(argValue); } @@ -2290,25 +2565,9 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const { ctx->LaunchInst(callee, argVals, launchCount); } else - retVal = ctx->CallInst(callee, ft->GetReturnType(), argVals, + retVal = ctx->CallInst(callee, ft, argVals, isVoidFunc ? "" : "calltmp"); - // For anything we had to do as pass by value/result, copy the - // corresponding reference values back out - for (unsigned int i = 0; i < storedArgValPtrs.size(); ++i) { - llvm::Value *ptr = storedArgValPtrs[i]; - if (ptr != NULL) { - const ReferenceType *rt = - dynamic_cast(callargs[i]->GetType()); - assert(rt != NULL); - llvm::Value *load = ctx->LoadInst(ptr, NULL, rt->GetReferenceTarget(), - "load_ref"); - Symbol *baseSym = callargs[i]->GetBaseSymbol(); - lStoreAssignResult(load, argValLValues[i], rt->GetReferenceTarget(), - ctx, baseSym); - } - } - if (isVoidFunc) return NULL; else @@ -2583,39 +2842,14 @@ ExprList::Print() const { IndexExpr::IndexExpr(Expr *a, Expr *i, SourcePos p) : Expr(p) { - arrayOrVector = a; + baseExpr = a; index = i; } -// FIXME: This is an ugly hack--if we're indexing into a uniform ispc -// VectorType, then this bitcasts the corresponding llvm::VectorType value -// to be a pointer to the vector's element type, so that a GEP to index -// from the pointer indices elements of the llvm::VectorType and doesn't -// incorrectly try to index into an array of llvm::VectorType instances. - -static llvm::Value * -lCastUniformVectorBasePtr(llvm::Value *ptr, FunctionEmitContext *ctx) { - LLVM_TYPE_CONST llvm::PointerType *baseType = - llvm::dyn_cast(ptr->getType()); - if (!baseType) - return ptr; - - LLVM_TYPE_CONST llvm::VectorType *baseEltVecType = - llvm::dyn_cast(baseType->getElementType()); - if (!baseEltVecType) - return ptr; - - LLVM_TYPE_CONST llvm::Type *vecEltType = baseEltVecType->getElementType(); - int numElts = baseEltVecType->getNumElements(); - LLVM_TYPE_CONST llvm::Type *castType = - llvm::PointerType::get(llvm::ArrayType::get(vecEltType, numElts), 0); - return ctx->BitCastInst(ptr, castType); -} - - /** When computing pointer values, we need to apply a per-lane offset when - we're indexing into varying data. Consdier the following ispc code: + we have a varying pointer that is itself indexing into varying data. + Consider the following ispc code: uniform float u[] = ...; float v[] = ...; @@ -2635,38 +2869,23 @@ lCastUniformVectorBasePtr(llvm::Value *ptr, FunctionEmitContext *ctx) { */ static llvm::Value * lAddVaryingOffsetsIfNeeded(FunctionEmitContext *ctx, llvm::Value *ptr, - const Type *returnType, const Type *indexedType) { - // If the result of the indexing isn't a varying atomic type, then - nothing to do here.
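// (Hedged illustration of the offsetting, assuming an 8-wide target: a varying pointer into "float v[]" carries one address per program instance, each pointing at the start of a whole <8 x float> block, so lane i must additionally step forward by i uniform elements -- conceptually finalPtr[i] = basePtr[i] + i, in units of sizeof(uniform float) -- which is what the varyingOffsets GEP below computes.)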
- if (returnType->IsVaryingType() == false || - (dynamic_cast(returnType) == NULL && - dynamic_cast(returnType) == NULL)) + const Type *ptrType) { + if (dynamic_cast(ptrType) != NULL) + // References are uniform pointers, so no offsetting is needed return ptr; - // We should now have an array of pointer values, represing in a - // varying pointer. - LLVM_TYPE_CONST llvm::ArrayType *at = - llvm::dyn_cast(ptr->getType()); - if (at == NULL) - return ptr; - LLVM_TYPE_CONST llvm::PointerType *pt = - llvm::dyn_cast(at->getElementType()); - assert(pt != NULL); - - // If the pointers are to uniform types (e.g. ptr->getType() == - // [8 x float *]), then we have the u[index] situation from the comment - // above, and no additional offset is needed. Otherwise we have - // pointers to varying atomic types--e.g. ptr->getType() == - // [8 x <8 x float> *] - if (pt->getElementType()->isIntegerTy() || - pt->getElementType()->isFloatingPointTy() || - pt->getElementType()->isPointerTy()) + assert(dynamic_cast(ptrType) != NULL); + if (ptrType->IsUniformType()) return ptr; - // But not so fast: if the reason we have a vector of pointers is that - // we're indexing into an array of uniform short-vector types, then we - // don't need the offsets. - if (dynamic_cast(indexedType) != NULL) + const Type *baseType = ptrType->GetBaseType(); + if (baseType->IsUniformType()) + return ptr; + + // must be indexing into varying atomic, enum, or pointer types + if (dynamic_cast(baseType) == NULL && + dynamic_cast(baseType) == NULL && + dynamic_cast(baseType) == NULL) return ptr; // Onward: compute the per lane offsets. @@ -2676,46 +2895,52 @@ lAddVaryingOffsetsIfNeeded(FunctionEmitContext *ctx, llvm::Value *ptr, varyingOffsets = ctx->InsertInst(varyingOffsets, LLVMInt32(i), i, "varying_delta"); - // Cast the pointer to the corresponding uniform pointer - // type--e.g. from [8 x <8 x float> *] to [8 x float *]. - LLVM_TYPE_CONST llvm::Type *unifType = - returnType->GetAsUniformType()->LLVMType(g->ctx); - LLVM_TYPE_CONST llvm::PointerType *ptrCastType = - llvm::PointerType::get(llvm::ArrayType::get(unifType, 0), 0); - ptr = ctx->BitCastInst(ptr, ptrCastType, "ptr2unif"); - - // And finally add the per-lane offsets. - return ctx->GetElementPtrInst(ptr, LLVMInt32(0), varyingOffsets); + // And finally add the per-lane offsets. Note that we lie to the GEP + // call and tell it that the pointers are to uniform elements and not + // varying elements, so that the offsets in terms of (0,1,2,...) will + // end up turning into the correct step in bytes... + const Type *uniformElementType = baseType->GetAsUniformType(); + const Type *ptrUnifType = PointerType::GetVarying(uniformElementType); + return ctx->GetElementPtrInst(ptr, varyingOffsets, ptrUnifType); } llvm::Value * IndexExpr::GetValue(FunctionEmitContext *ctx) const { - const Type *arrayOrVectorType; - if (arrayOrVector == NULL || index == NULL || - ((arrayOrVectorType = arrayOrVector->GetType()) == NULL)) + const Type *baseExprType; + if (baseExpr == NULL || index == NULL || + ((baseExprType = baseExpr->GetType()) == NULL)) return NULL; ctx->SetDebugPos(pos); + llvm::Value *lvalue = GetLValue(ctx); llvm::Value *mask = NULL; + const Type *lvalueType = GetLValueType(); if (lvalue == NULL) { // We may be indexing into a temporary that hasn't hit memory, so // get the full value and stuff it into temporary alloca'd space so // that we can index from there... 
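// (Hedged example of when this no-lvalue path triggers: the indexed expression is a temporary with no storage of its own, e.g. the short-vector result of an arithmetic expression, so its value is first spilled to a stack slot that the GEP below can address.)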
- llvm::Value *val = arrayOrVector->GetValue(ctx); + llvm::Value *val = baseExpr->GetValue(ctx); if (val == NULL) { assert(m->errorCount > 0); return NULL; } ctx->SetDebugPos(pos); - llvm::Value *ptr = ctx->AllocaInst(arrayOrVectorType->LLVMType(g->ctx), + llvm::Value *ptr = ctx->AllocaInst(baseExprType->LLVMType(g->ctx), "array_tmp"); ctx->StoreInst(val, ptr); - ptr = lCastUniformVectorBasePtr(ptr, ctx); - lvalue = ctx->GetElementPtrInst(ptr, LLVMInt32(0), index->GetValue(ctx)); - lvalue = lAddVaryingOffsetsIfNeeded(ctx, lvalue, GetType(), - arrayOrVectorType); + + lvalue = ctx->GetElementPtrInst(ptr, LLVMInt32(0), index->GetValue(ctx), + PointerType::GetUniform(baseExprType)); + + const SequentialType *st = + dynamic_cast(baseExprType); + assert(st != NULL); + lvalueType = PointerType::GetUniform(st->GetElementType()); + + lvalue = lAddVaryingOffsetsIfNeeded(ctx, lvalue, lvalueType); + mask = LLVMMaskAllOn; } else { @@ -2725,58 +2950,82 @@ IndexExpr::GetValue(FunctionEmitContext *ctx) const { } ctx->SetDebugPos(pos); - return ctx->LoadInst(lvalue, mask, GetType(), "index"); + return ctx->LoadInst(lvalue, mask, lvalueType, "index"); } const Type * IndexExpr::GetType() const { - const Type *arrayOrVectorType, *indexType; - if (!arrayOrVector || !index || - ((arrayOrVectorType = arrayOrVector->GetType()) == NULL) || + const Type *baseExprType, *indexType; + if (!baseExpr || !index || + ((baseExprType = baseExpr->GetType()) == NULL) || ((indexType = index->GetType()) == NULL)) return NULL; - const SequentialType *sequentialType = - dynamic_cast(arrayOrVectorType->GetReferenceTarget()); - // Typechecking should have caught this... - assert(sequentialType != NULL); + const Type *elementType = NULL; + const PointerType *pointerType = + dynamic_cast(baseExprType); + if (pointerType != NULL) + // ptr[index] -> type that the pointer points to + elementType = pointerType->GetBaseType(); + else { + // sequential type[index] -> element type of the sequential type + const SequentialType *sequentialType = + dynamic_cast(baseExprType->GetReferenceTarget()); + // Typechecking should have caught this... + assert(sequentialType != NULL); + elementType = sequentialType->GetElementType(); + } - const Type *elementType = sequentialType->GetElementType(); if (indexType->IsUniformType()) // If the index is uniform, the resulting type is just whatever the // element type is return elementType; else - // A varying index into uniform array/vector -> varying type (and - // same for varying array of course...) + // A varying index into even a uniform base type -> varying type return elementType->GetAsVaryingType(); } Symbol * IndexExpr::GetBaseSymbol() const { - return arrayOrVector ? arrayOrVector->GetBaseSymbol() : NULL; + return baseExpr ? 
baseExpr->GetBaseSymbol() : NULL; } llvm::Value * IndexExpr::GetLValue(FunctionEmitContext *ctx) const { - const Type *arrayOrVectorType; - if (arrayOrVector == NULL || index == NULL || - ((arrayOrVectorType = arrayOrVector->GetType()) == NULL)) + const Type *baseExprType; + if (baseExpr == NULL || index == NULL || + ((baseExprType = baseExpr->GetType()) == NULL)) return NULL; ctx->SetDebugPos(pos); + if (dynamic_cast(baseExprType) != NULL) { + // We're indexing off of a base pointer + llvm::Value *baseValue = baseExpr->GetValue(ctx); + llvm::Value *indexValue = index->GetValue(ctx); + if (baseValue == NULL || indexValue == NULL) + return NULL; + ctx->SetDebugPos(pos); + return ctx->GetElementPtrInst(baseValue, indexValue, + baseExprType, "ptr_offset"); + } + + // Otherwise it's an array or vector llvm::Value *basePtr = NULL; - if (dynamic_cast(arrayOrVectorType) || - dynamic_cast(arrayOrVectorType)) - basePtr = arrayOrVector->GetLValue(ctx); + const Type *basePtrType = NULL; + if (dynamic_cast(baseExprType) || + dynamic_cast(baseExprType)) { + basePtr = baseExpr->GetLValue(ctx); + basePtrType = baseExpr->GetLValueType(); + } else { - arrayOrVectorType = arrayOrVectorType->GetReferenceTarget(); - assert(dynamic_cast(arrayOrVectorType) || - dynamic_cast(arrayOrVectorType)); - basePtr = arrayOrVector->GetValue(ctx); + baseExprType = baseExprType->GetReferenceTarget(); + assert(dynamic_cast(baseExprType) || + dynamic_cast(baseExprType)); + basePtr = baseExpr->GetValue(ctx); + basePtrType = baseExpr->GetType(); } if (!basePtr) return NULL; @@ -2785,37 +3034,71 @@ IndexExpr::GetLValue(FunctionEmitContext *ctx) const { // may lead to an out-of-bounds access. ConstExpr *ce = dynamic_cast(index); const SequentialType *seqType = - dynamic_cast(arrayOrVectorType); - assert(seqType != NULL); - int nElements = seqType->GetElementCount(); - if (ce != NULL && nElements > 0) { - int32_t indices[ISPC_MAX_NVEC]; - int count = ce->AsInt32(indices); - for (int i = 0; i < count; ++i) { - if (indices[i] < 0 || indices[i] >= nElements) - Warning(index->pos, "Array index \"%d\" may be out of bounds for " - "\"%d\" element array.", indices[i], nElements); + dynamic_cast(baseExprType); + if (seqType != NULL) { + int nElements = seqType->GetElementCount(); + if (ce != NULL && nElements > 0) { + int32_t indices[ISPC_MAX_NVEC]; + int count = ce->AsInt32(indices); + for (int i = 0; i < count; ++i) { + if (indices[i] < 0 || indices[i] >= nElements) + Warning(index->pos, "Array index \"%d\" may be out of bounds for " + "\"%d\" element array.", indices[i], nElements); + } } } - basePtr = lCastUniformVectorBasePtr(basePtr, ctx); - ctx->SetDebugPos(pos); - llvm::Value *ptr = ctx->GetElementPtrInst(basePtr, LLVMInt32(0), - index->GetValue(ctx)); - ptr = lAddVaryingOffsetsIfNeeded(ctx, ptr, GetType(), arrayOrVectorType); - + llvm::Value *ptr = + ctx->GetElementPtrInst(basePtr, LLVMInt32(0), index->GetValue(ctx), + basePtrType); + ptr = lAddVaryingOffsetsIfNeeded(ctx, ptr, GetLValueType()); return ptr; } +const Type * +IndexExpr::GetLValueType() const { + const Type *baseExprLValueType, *indexType; + if (baseExpr == NULL || index == NULL || + ((baseExprLValueType = baseExpr->GetLValueType()) == NULL) || + ((indexType = index->GetType()) == NULL)) + return NULL; + + if (dynamic_cast(baseExprLValueType) != NULL) + baseExprLValueType = PointerType::GetUniform(baseExprLValueType->GetReferenceTarget()); + assert(dynamic_cast(baseExprLValueType) != NULL); + + // FIXME: can we do something in the type system that unifies the + // 
concept of a sequential type's element type and a pointer type's + // base type? The code below is identical but for handling that + // difference. IndexableType? + const SequentialType *st = + dynamic_cast(baseExprLValueType->GetBaseType()); + if (st != NULL) { + if (baseExprLValueType->IsUniformType() && indexType->IsUniformType()) + return PointerType::GetUniform(st->GetElementType()); + else + return PointerType::GetVarying(st->GetElementType()); + } + + const PointerType *pt = + dynamic_cast(baseExprLValueType->GetBaseType()); + assert(pt != NULL); + if (baseExprLValueType->IsUniformType() && indexType->IsUniformType()) + return PointerType::GetUniform(pt->GetBaseType()); + else + return PointerType::GetVarying(pt->GetBaseType()); +} + + Expr * IndexExpr::Optimize() { - if (arrayOrVector) - arrayOrVector = arrayOrVector->Optimize(); + if (baseExpr) + baseExpr = baseExpr->Optimize(); if (index) index = index->Optimize(); - if (arrayOrVector == NULL || index == NULL) + if (baseExpr == NULL || index == NULL) return NULL; return this; @@ -2824,21 +3107,22 @@ IndexExpr::Optimize() { Expr * IndexExpr::TypeCheck() { - if (arrayOrVector) - arrayOrVector = arrayOrVector->TypeCheck(); + if (baseExpr) + baseExpr = baseExpr->TypeCheck(); if (index) index = index->TypeCheck(); - if (!arrayOrVector || !index || !index->GetType()) + if (!baseExpr || !index || !index->GetType()) return NULL; - const Type *arrayOrVectorType = arrayOrVector->GetType(); - if (!arrayOrVectorType) + const Type *baseExprType = baseExpr->GetType(); + if (!baseExprType) return NULL; - if (dynamic_cast(arrayOrVectorType->GetReferenceTarget()) == NULL) { - Error(pos, "Trying to index into non-array or vector type \"%s\".", - arrayOrVectorType->GetString().c_str()); + if (!dynamic_cast(baseExprType->GetReferenceTarget()) && + !dynamic_cast(baseExprType)) { + Error(pos, "Trying to index into non-array, vector, or pointer " + "type \"%s\".", baseExprType->GetString().c_str()); return NULL; } @@ -2856,8 +3140,17 @@ IndexExpr::TypeCheck() { int IndexExpr::EstimateCost() const { - // be pessimistic - if (index && index->GetType()->IsVaryingType()) + if (index == NULL || baseExpr == NULL) + return 0; + + const Type *indexType = index->GetType(); + const Type *baseExprType = baseExpr->GetType(); + + if ((indexType != NULL && indexType->IsVaryingType()) || + (dynamic_cast(baseExprType) != NULL && + baseExprType->IsVaryingType())) + // be pessimistic; some of these will later turn out to be vector + // loads/stores, but it's too early for us to know that here. 
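// (For example, a sketch: "uniform float a[16]; float x = a[programIndex];" indexes with a varying index and is costed as a gather here, even though later optimization may recognize the consecutive indices and emit a single vector load instead.)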
return COST_GATHER; else return COST_LOAD; @@ -2866,11 +3159,11 @@ IndexExpr::EstimateCost() const { void IndexExpr::Print() const { - if (!arrayOrVector || !index || !GetType()) + if (!baseExpr || !index || !GetType()) return; printf("[%s] index ", GetType()->GetString().c_str()); - arrayOrVector->Print(); + baseExpr->Print(); printf("["); index->Print(); printf("]"); @@ -2913,49 +3206,58 @@ class StructMemberExpr : public MemberExpr { public: StructMemberExpr(Expr *e, const char *id, SourcePos p, - SourcePos idpos, const StructType *structType); + SourcePos idpos, bool derefLValue); const Type *GetType() const; int getElementNumber() const; const Type *getElementType() const; private: - const StructType *exprStructType; + const StructType *getStructType() const; }; StructMemberExpr::StructMemberExpr(Expr *e, const char *id, SourcePos p, - SourcePos idpos, - const StructType *structType) - : MemberExpr(e, id, p, idpos), exprStructType(structType) { + SourcePos idpos, bool derefLValue) + : MemberExpr(e, id, p, idpos, derefLValue) { } const Type * StructMemberExpr::GetType() const { - // It's a struct, and the result type is the element - // type, possibly promoted to varying if the struct type / lvalue - // is varying. - const Type *elementType = exprStructType->GetElementType(identifier); - if (!elementType) + // It's a struct, and the result type is the element type, possibly + // promoted to varying if the struct type / lvalue is varying. + const StructType *structType = getStructType(); + if (structType == NULL) + return NULL; + + const Type *elementType = structType->GetElementType(identifier); + if (elementType == NULL) Error(identifierPos, "Element name \"%s\" not present in struct type \"%s\".%s", - identifier.c_str(), exprStructType->GetString().c_str(), + identifier.c_str(), structType->GetString().c_str(), getCandidateNearMatches().c_str()); - if (exprStructType->IsVaryingType()) + const PointerType *pt = dynamic_cast(expr->GetType()); + if (structType->IsVaryingType() || + (pt != NULL && pt->IsVaryingType())) return elementType->GetAsVaryingType(); else return elementType; } + int StructMemberExpr::getElementNumber() const { - int elementNumber = exprStructType->GetElementNumber(identifier); + const StructType *structType = getStructType(); + if (structType == NULL) + return -1; + + int elementNumber = structType->GetElementNumber(identifier); if (elementNumber == -1) Error(identifierPos, "Element name \"%s\" not present in struct type \"%s\".%s", - identifier.c_str(), exprStructType->GetString().c_str(), + identifier.c_str(), structType->GetString().c_str(), getCandidateNearMatches().c_str()); return elementNumber; } @@ -2963,7 +3265,34 @@ StructMemberExpr::getElementNumber() const { const Type * StructMemberExpr::getElementType() const { - return exprStructType->GetAsUniformType()->GetElementType(identifier); + const StructType *structType = getStructType(); + if (structType == NULL) + return NULL; + + return structType->GetAsUniformType()->GetElementType(identifier); +} + + +const StructType * +StructMemberExpr::getStructType() const { + const Type *exprType = expr->GetType(); + if (exprType == NULL) + return NULL; + + const StructType *structType = dynamic_cast(exprType); + if (structType == NULL) { + const PointerType *pt = dynamic_cast(exprType); + if (pt != NULL) + structType = dynamic_cast(pt->GetBaseType()); + else { + const ReferenceType *rt = + dynamic_cast(exprType); + assert(rt != NULL); + structType = dynamic_cast(rt->GetReferenceTarget()); + } + assert(structType 
!= NULL); + } + return structType; } @@ -2974,11 +3303,12 @@ class VectorMemberExpr : public MemberExpr { public: VectorMemberExpr(Expr *e, const char *id, SourcePos p, - SourcePos idpos, const VectorType* vectorType); + SourcePos idpos, bool derefLValue); - const Type *GetType() const; - llvm::Value *GetLValue(FunctionEmitContext* ctx) const; llvm::Value *GetValue(FunctionEmitContext* ctx) const; + llvm::Value *GetLValue(FunctionEmitContext* ctx) const; + const Type *GetType() const; + const Type *GetLValueType() const; int getElementNumber() const; const Type *getElementType() const; @@ -2990,9 +3320,21 @@ private: VectorMemberExpr::VectorMemberExpr(Expr *e, const char *id, SourcePos p, - SourcePos idpos, - const VectorType* vectorType) - : MemberExpr(e, id, p, idpos), exprVectorType(vectorType) { + SourcePos idpos, bool derefLValue) + : MemberExpr(e, id, p, idpos, derefLValue) { + const Type *exprType = e->GetType(); + exprVectorType = dynamic_cast(exprType); + if (exprVectorType == NULL) { + const PointerType *pt = dynamic_cast(exprType); + if (pt != NULL) + exprVectorType = dynamic_cast(pt->GetBaseType()); + else { + assert(dynamic_cast(exprType) != NULL); + exprVectorType = + dynamic_cast(exprType->GetReferenceTarget()); + } + assert(exprVectorType != NULL); + } memberType = new VectorType(exprVectorType->GetElementType(), identifier.length()); } @@ -3004,11 +3346,14 @@ VectorMemberExpr::GetType() const { // type. For n-element expressions, we have a shortvec type // with n > 1 elements. This can be changed when we get // type<1> -> type conversions. - if (identifier.length() == 1) { - return exprVectorType->GetElementType(); - } else { - return memberType; - } + const Type *type = (identifier.length() == 1) ? + (const Type *)exprVectorType->GetElementType() : + (const Type *)memberType; + + const Type *lvalueType = GetLValueType(); + if (lvalueType != NULL && lvalueType->IsVaryingType()) + type = type->GetAsVaryingType(); + return type; } @@ -3022,8 +3367,41 @@ VectorMemberExpr::GetLValue(FunctionEmitContext* ctx) const { } +const Type * +VectorMemberExpr::GetLValueType() const { + if (identifier.length() == 1) { + if (expr == NULL) + return NULL; + + const Type *exprLValueType = dereferenceExpr ? expr->GetType() : + expr->GetLValueType(); + if (exprLValueType == NULL) + return NULL; + + const VectorType *vt = NULL; + if (dynamic_cast(exprLValueType) != NULL) + vt = dynamic_cast(exprLValueType->GetReferenceTarget()); + else + vt = dynamic_cast(exprLValueType->GetBaseType()); + assert(vt != NULL); + + // we don't want to report that it's e.g. a pointer to a float<1>, + // but ta pointer to a float, etc. + const Type *elementType = vt->GetElementType(); + if (dynamic_cast(exprLValueType) != NULL) + return new ReferenceType(elementType); + else + return exprLValueType->IsUniformType() ? 
+ PointerType::GetUniform(elementType) : + PointerType::GetVarying(elementType); + } + else + return NULL; +} + + llvm::Value * -VectorMemberExpr::GetValue(FunctionEmitContext* ctx) const { +VectorMemberExpr::GetValue(FunctionEmitContext *ctx) const { if (identifier.length() == 1) { return MemberExpr::GetValue(ctx); } @@ -3040,27 +3418,47 @@ VectorMemberExpr::GetValue(FunctionEmitContext* ctx) const { indices.push_back(idx); } - llvm::Value *basePtr = expr->GetLValue(ctx); - if (basePtr == NULL) { + llvm::Value *basePtr = NULL; + const Type *basePtrType = NULL; + if (dereferenceExpr) { + basePtr = expr->GetValue(ctx); + basePtrType = expr->GetType(); + } + else { + basePtr = expr->GetLValue(ctx); + basePtrType = expr->GetLValueType(); + } + + if (basePtr == NULL || basePtrType == NULL) { assert(m->errorCount > 0); return NULL; } - llvm::Value *ltmp = ctx->AllocaInst(memberType->LLVMType(g->ctx), + + // Allocate temporary memory to store the result + llvm::Value *resultPtr = ctx->AllocaInst(memberType->LLVMType(g->ctx), "vector_tmp"); + // FIXME: we should be able to use the internal mask here according + // to the same logic where it's used elsewhere + llvm::Value *elementMask = ctx->GetFullMask(); + + const Type *elementPtrType = basePtrType->IsUniformType() ? + PointerType::GetUniform(exprVectorType->GetElementType()) : + PointerType::GetVarying(exprVectorType->GetElementType()); + ctx->SetDebugPos(pos); for (size_t i = 0; i < identifier.size(); ++i) { - llvm::Value *ptmp = - ctx->GetElementPtrInst(ltmp, 0, i, "new_offset"); - llvm::Value *initLValue = - ctx->GetElementPtrInst(basePtr, 0, indices[i], "orig_offset"); - llvm::Value *initValue = - ctx->LoadInst(initLValue, NULL, memberType->GetElementType(), + llvm::Value *elementPtr = ctx->AddElementOffset(basePtr, indices[i], + basePtrType); + llvm::Value *elementValue = + ctx->LoadInst(elementPtr, elementMask, elementPtrType, "vec_element"); - ctx->StoreInst(initValue, ptmp); + + llvm::Value *ptmp = ctx->AddElementOffset(resultPtr, i, NULL); + ctx->StoreInst(elementValue, ptmp); } - return ctx->LoadInst(ltmp, NULL, memberType, "swizzle_vec"); + return ctx->LoadInst(resultPtr, "swizzle_vec"); } } @@ -3081,130 +3479,63 @@ VectorMemberExpr::getElementType() const { } -/////////////////////////////////////////////////////////////////////////// -// ReferenceMemberExpr - -class ReferenceMemberExpr : public MemberExpr -{ -public: - ReferenceMemberExpr(Expr *e, const char *id, SourcePos p, - SourcePos idpos, const ReferenceType* referenceType); - - const Type *GetType() const; - llvm::Value *GetLValue(FunctionEmitContext* ctx) const; - - int getElementNumber() const; - const Type *getElementType() const; - -private: - const ReferenceType *exprReferenceType; - MemberExpr *dereferencedExpr; -}; - -ReferenceMemberExpr::ReferenceMemberExpr(Expr *e, const char *id, SourcePos p, - SourcePos idpos, - const ReferenceType *referenceType) - : MemberExpr(e, id, p, idpos), exprReferenceType(referenceType) { - const Type *refTarget = exprReferenceType->GetReferenceTarget(); - const StructType *structType = dynamic_cast(refTarget); - const VectorType *vectorType = dynamic_cast(refTarget); - - if (structType != NULL) - dereferencedExpr = new StructMemberExpr(e, id, p, idpos, structType); - else if (vectorType != NULL) - dereferencedExpr = new VectorMemberExpr(e, id, p, idpos, vectorType); - else - dereferencedExpr = NULL; -} - - -const Type * -ReferenceMemberExpr::GetType() const { - if (dereferencedExpr == NULL) { - Error(pos, "Can't access member of
non-struct/vector type \"%s\".", - exprReferenceType->GetString().c_str()); - return NULL; - } else { - return dereferencedExpr->GetType(); - } -} - - -int -ReferenceMemberExpr::getElementNumber() const { - if (dereferencedExpr == NULL) { - // FIXME: I think we shouldn't ever get here and that - // typechecking should have caught this case - return -1; - } else { - return dereferencedExpr->getElementNumber(); - } -} - - -const Type * -ReferenceMemberExpr::getElementType() const { - assert(dereferencedExpr != NULL); - return dereferencedExpr->getElementType(); -} - - -llvm::Value * -ReferenceMemberExpr::GetLValue(FunctionEmitContext* ctx) const { - if (dereferencedExpr == NULL) { - // FIXME: again I think typechecking should have caught this - Error(pos, "Can't access member of non-struct/vector type \"%s\".", - exprReferenceType->GetString().c_str()); - return NULL; - } - - //FIXME: Minor Code-dup...this is the same as the base, except - // llvm::Value *basePtr = expr->GetLValue instead of expr->getValue - llvm::Value *basePtr = expr->GetValue(ctx); - if (!basePtr) - return NULL; - - int elementNumber = getElementNumber(); - if (elementNumber == -1) - return NULL; - - ctx->SetDebugPos(pos); - llvm::Value *ptr = ctx->GetElementPtrInst(basePtr, 0, elementNumber); - - const Type *elementType = getElementType(); - ptr = lAddVaryingOffsetsIfNeeded(ctx, ptr, GetType(), elementType); - - return ptr; -} - MemberExpr * -MemberExpr::create(Expr *e, const char *id, SourcePos p, SourcePos idpos) { +MemberExpr::create(Expr *e, const char *id, SourcePos p, SourcePos idpos, + bool derefLValue) { const Type *exprType; if (e == NULL || (exprType = e->GetType()) == NULL) return NULL; - const StructType *structType = dynamic_cast(exprType); - if (structType != NULL) - return new StructMemberExpr(e, id, p, idpos, structType); + const ReferenceType *referenceType = + dynamic_cast(exprType); + if (referenceType != NULL) { + e = new DereferenceExpr(e, e->pos); + exprType = e->GetType(); + assert(exprType != NULL); + } - const VectorType *vectorType = dynamic_cast(exprType); - if (vectorType != NULL) - return new VectorMemberExpr(e, id, p, idpos, vectorType); + const PointerType *pointerType = dynamic_cast(exprType); + if (pointerType != NULL) + exprType = pointerType->GetBaseType(); - const ReferenceType *referenceType = dynamic_cast(exprType); - if (referenceType != NULL) - return new ReferenceMemberExpr(e, id, p, idpos, referenceType); + if (derefLValue == true && pointerType == NULL) { + if (dynamic_cast(exprType->GetReferenceTarget()) != NULL) + Error(p, "Dereference operator \"->\" can't be applied to non-pointer " + "type \"%s\". Did you mean to use \".\"?", + exprType->GetString().c_str()); + else + Error(p, "Dereference operator \"->\" can't be applied to non-struct " + "pointer type \"%s\".", exprType->GetString().c_str()); + return NULL; + } + if (derefLValue == false && pointerType != NULL && + dynamic_cast(pointerType->GetBaseType()) != NULL) { + Error(p, "Member operator \".\" can't be applied to pointer " + "type \"%s\". Did you mean to use \"->\"?", + exprType->GetString().c_str()); + return NULL; + } - FATAL("Unexpected case in MemberExpr::create()"); - return NULL; + if (dynamic_cast(exprType) != NULL) + return new StructMemberExpr(e, id, p, idpos, derefLValue); + else if (dynamic_cast(exprType) != NULL) + return new VectorMemberExpr(e, id, p, idpos, derefLValue); + else { + Error(p, "Member operator \"%s\" can't be used with expression of " + "\"%s\" type.", derefLValue ? 
"->" : ".", + exprType->GetString().c_str()); + return NULL; + } } -MemberExpr::MemberExpr(Expr *e, const char *id, SourcePos p, SourcePos idpos) +MemberExpr::MemberExpr(Expr *e, const char *id, SourcePos p, SourcePos idpos, + bool derefLValue) : Expr(p), identifierPos(idpos) { expr = e; identifier = id; + dereferenceExpr = derefLValue; } @@ -3214,6 +3545,8 @@ MemberExpr::GetValue(FunctionEmitContext *ctx) const { return NULL; llvm::Value *lvalue = GetLValue(ctx); + const Type *lvalueType = GetLValueType(); + llvm::Value *mask = NULL; if (lvalue == NULL) { // As in the array case, this may be a temporary that hasn't hit @@ -3233,9 +3566,10 @@ MemberExpr::GetValue(FunctionEmitContext *ctx) const { int elementNumber = getElementNumber(); if (elementNumber == -1) return NULL; - lvalue = ctx->GetElementPtrInst(ptr, 0, elementNumber); - lvalue = lAddVaryingOffsetsIfNeeded(ctx, lvalue, GetType(), getElementType()); + lvalue = ctx->AddElementOffset(ptr, elementNumber, + PointerType::GetUniform(exprType)); + lvalueType = PointerType::GetUniform(GetType()); mask = LLVMMaskAllOn; } else { @@ -3245,7 +3579,7 @@ MemberExpr::GetValue(FunctionEmitContext *ctx) const { } ctx->SetDebugPos(pos); - return ctx->LoadInst(lvalue, mask, GetType(), "structelement"); + return ctx->LoadInst(lvalue, mask, lvalueType, "structelement"); } @@ -3269,15 +3603,13 @@ MemberExpr::getElementNumber() const { llvm::Value * MemberExpr::GetLValue(FunctionEmitContext *ctx) const { - //This kindof feels like magic, but this functionality - // will have to be overridden in VectorMemberExpr when - // we support multi-swizzle. const Type *exprType; if (!expr || ((exprType = expr->GetType()) == NULL)) return NULL; ctx->SetDebugPos(pos); - llvm::Value *basePtr = expr->GetLValue(ctx); + llvm::Value *basePtr = dereferenceExpr ? expr->GetValue(ctx) : + expr->GetLValue(ctx); if (!basePtr) return NULL; @@ -3285,14 +3617,34 @@ MemberExpr::GetLValue(FunctionEmitContext *ctx) const { if (elementNumber == -1) return NULL; + const Type *exprLValueType = dereferenceExpr ? expr->GetType() : + expr->GetLValueType(); ctx->SetDebugPos(pos); - llvm::Value *ptr = ctx->GetElementPtrInst(basePtr, 0, elementNumber); - ptr = lAddVaryingOffsetsIfNeeded(ctx, ptr, GetType(), getElementType()); + llvm::Value *ptr = ctx->AddElementOffset(basePtr, elementNumber, + exprLValueType); + + ptr = lAddVaryingOffsetsIfNeeded(ctx, ptr, GetLValueType()); return ptr; } +const Type * +MemberExpr::GetLValueType() const { + if (expr == NULL) + return NULL; + + const Type *exprLValueType = dereferenceExpr ? expr->GetType() : + expr->GetLValueType(); + if (exprLValueType == NULL) + return NULL; + + return exprLValueType->IsUniformType() ? + PointerType::GetUniform(getElementType()) : + PointerType::GetVarying(getElementType()); +} + + Expr * MemberExpr::TypeCheck() { if (expr) @@ -3311,9 +3663,11 @@ MemberExpr::Optimize() { int MemberExpr::EstimateCost() const { - // FIXME: return gather cost when we can tell a gather is going to be - // needed - return COST_SIMPLE_ARITH_LOGIC_OP; + const Type *lvalueType = GetLValueType(); + if (lvalueType != NULL && lvalueType->IsVaryingType()) + return COST_GATHER + COST_SIMPLE_ARITH_LOGIC_OP; + else + return COST_SIMPLE_ARITH_LOGIC_OP; } @@ -3718,10 +4072,6 @@ lConvertElement(From from, To *to) { /** When converting from bool types to numeric types, make sure the result is one or zero. - FIXME: this is a different rule than we use elsewhere, where we sign extend - the bool. 
We should fix the other case to just zero extend and then - patch up places in the stdlib that depend on sign extension to call a - routine to make that happen. */ template static inline void lConvertElement(bool from, To *to) { @@ -4689,12 +5039,14 @@ lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal, // If we also want to go from uniform to varying, replicate out the // value across the vector elements.. if (toType->IsVaryingType() && fromType->IsUniformType()) - return ctx->SmearScalar(cast); + return ctx->SmearUniform(cast); else return cast; } +// FIXME: fold this into the FunctionEmitContext::SmearUniform() method? + /** Converts the given value of the given type to be the varying equivalent, returning the resulting value. */ @@ -4705,14 +5057,15 @@ lUniformValueToVarying(FunctionEmitContext *ctx, llvm::Value *value, if (type->IsVaryingType()) return value; - LLVM_TYPE_CONST llvm::Type *llvmType = type->GetAsVaryingType()->LLVMType(g->ctx); - llvm::Value *retValue = llvm::UndefValue::get(llvmType); - // for structs/arrays/vectors, just recursively make their elements // varying (if needed) and populate the return value. const CollectionType *collectionType = dynamic_cast(type); if (collectionType != NULL) { + LLVM_TYPE_CONST llvm::Type *llvmType = + type->GetAsVaryingType()->LLVMType(g->ctx); + llvm::Value *retValue = llvm::UndefValue::get(llvmType); + for (int i = 0; i < collectionType->GetElementCount(); ++i) { llvm::Value *v = ctx->ExtractInst(value, i, "get_element"); v = lUniformValueToVarying(ctx, v, collectionType->GetElementType(i)); @@ -4724,11 +5077,10 @@ lUniformValueToVarying(FunctionEmitContext *ctx, llvm::Value *value, // Otherwise we must have a uniform AtomicType, so smear its value // across the vector lanes. assert(dynamic_cast(type) != NULL); - return ctx->SmearScalar(value); + return ctx->SmearUniform(value); } - llvm::Value * TypeCastExpr::GetValue(FunctionEmitContext *ctx) const { if (!expr) @@ -4743,28 +5095,103 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) const { const PointerType *fromPointerType = dynamic_cast(fromType); const PointerType *toPointerType = dynamic_cast(toType); - if (fromPointerType != NULL && toPointerType != NULL) { - llvm::Value *value = expr->GetValue(ctx); - if (value == NULL) - return NULL; + const ArrayType *toArrayType = dynamic_cast(toType); + const ArrayType *fromArrayType = dynamic_cast(fromType); + if (fromPointerType != NULL) { + if (toArrayType != NULL) { + return expr->GetValue(ctx); + } + else if (toPointerType != NULL) { + llvm::Value *value = expr->GetValue(ctx); + if (value == NULL) + return NULL; - // bitcast from NULL to actual pointer type... 
- value = ctx->BitCastInst(value, toType->GetAsUniformType()->LLVMType(g->ctx)); + if (fromType->IsUniformType() && toType->IsUniformType()) + // bitcast to the actual pointer type + return ctx->BitCastInst(value, toType->LLVMType(g->ctx)); + else if (fromType->IsVaryingType() && toType->IsVaryingType()) { + // both are vectors of ints already, nothing to do at the IR + // level + return value; + } + else { + assert(fromType->IsUniformType() && toType->IsVaryingType()); + value = ctx->PtrToIntInst(value); + return ctx->SmearUniform(value); + } + } + else { + // convert pointer to bool + assert(dynamic_cast(toType) && + toType->IsBoolType()); + LLVM_TYPE_CONST llvm::Type *lfu = + fromType->GetAsUniformType()->LLVMType(g->ctx); + LLVM_TYPE_CONST llvm::PointerType *llvmFromUnifType = + llvm::dyn_cast(lfu); - if (fromType->IsUniformType() && toType->IsVaryingType()) - return ctx->SmearScalar(value); - else - return value; + llvm::Value *nullPtrValue = llvm::ConstantPointerNull::get(llvmFromUnifType); + if (fromType->IsVaryingType()) + nullPtrValue = ctx->SmearUniform(nullPtrValue); + + llvm::Value *exprVal = expr->GetValue(ctx); + llvm::Value *cmp = ctx->CmpInst(llvm::Instruction::ICmp, + llvm::CmpInst::ICMP_NE, + exprVal, nullPtrValue, "ptr_ne_NULL"); + + if (toType->IsVaryingType()) { + if (fromType->IsUniformType()) + cmp = ctx->SmearUniform(cmp); + cmp = ctx->I1VecToBoolVec(cmp); + } + + return cmp; + } } - if (Type::Equal(toType->GetAsConstType(), fromType->GetAsConstType())) + if (Type::EqualIgnoringConst(toType, fromType)) // There's nothing to do, just return the value. (LLVM's type // system doesn't worry about constiness.) return expr->GetValue(ctx); + if (fromArrayType != NULL && toPointerType != NULL) { + // implicit array to pointer to first element + Expr *zero = new ConstExpr(AtomicType::UniformInt32, 0, pos); + Expr *index = new IndexExpr(expr, zero, pos); + Expr *addr = new AddressOfExpr(index, pos); + addr = addr->TypeCheck(); + assert(addr != NULL); + addr = addr->Optimize(); + assert(addr != NULL); + if (Type::EqualIgnoringConst(addr->GetType(), toPointerType) == false) { + assert(Type::EqualIgnoringConst(addr->GetType()->GetAsVaryingType(), + toPointerType) == true); + addr = new TypeCastExpr(toPointerType, addr, false, pos); + addr = addr->TypeCheck(); + assert(addr != NULL); + addr = addr->Optimize(); + assert(addr != NULL); + } + assert(Type::EqualIgnoringConst(addr->GetType(), toPointerType)); + return addr->GetValue(ctx); + } + // This also should be caught during typechecking assert(!(toType->IsUniformType() && fromType->IsVaryingType())); + if (toArrayType != NULL && fromArrayType != NULL) { + // cast array pointer from [n x foo] to [0 x foo] if needed to be able + // to pass to a function that takes an unsized array as a parameter + if (toArrayType->GetElementCount() != 0 && + (toArrayType->GetElementCount() != fromArrayType->GetElementCount())) + Warning(pos, "Type-converting array of length %d to length %d", + fromArrayType->GetElementCount(), toArrayType->GetElementCount()); + assert(Type::EqualIgnoringConst(toArrayType->GetBaseType(), + fromArrayType->GetBaseType())); + llvm::Value *v = expr->GetValue(ctx); + LLVM_TYPE_CONST llvm::Type *ptype = toType->LLVMType(g->ctx); + return ctx->BitCastInst(v, ptype); //, "array_cast_0size"); + } + const ReferenceType *toReference = dynamic_cast(toType); const ReferenceType *fromReference = dynamic_cast(fromType); if (toReference && fromReference) { @@ -4780,8 +5207,8 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) 
const { (toArray->GetElementCount() != fromArray->GetElementCount())) Warning(pos, "Type-converting array of length %d to length %d", fromArray->GetElementCount(), toArray->GetElementCount()); - assert(Type::Equal(toArray->GetBaseType()->GetAsConstType(), - fromArray->GetBaseType()->GetAsConstType())); + assert(Type::EqualIgnoringConst(toArray->GetBaseType(), + fromArray->GetBaseType())); llvm::Value *v = expr->GetValue(ctx); LLVM_TYPE_CONST llvm::Type *ptype = toType->LLVMType(g->ctx); return ctx->BitCastInst(v, ptype); //, "array_cast_0size"); @@ -4850,32 +5277,6 @@ TypeCastExpr::GetValue(FunctionEmitContext *ctx) const { toType = toEnum->IsUniformType() ? AtomicType::UniformUInt32 : AtomicType::VaryingUInt32; - if (fromPointerType != NULL) { - // convert pointer to bool - assert(dynamic_cast(toType) && - toType->IsBoolType()); - LLVM_TYPE_CONST llvm::Type *lfu = - fromType->GetAsUniformType()->LLVMType(g->ctx); - LLVM_TYPE_CONST llvm::PointerType *llvmFromUnifType = - llvm::dyn_cast(lfu); - - llvm::Value *nullPtrValue = llvm::ConstantPointerNull::get(llvmFromUnifType); - if (fromType->IsVaryingType()) - nullPtrValue = ctx->SmearScalar(nullPtrValue); - - llvm::Value *cmp = ctx->CmpInst(llvm::Instruction::ICmp, - llvm::CmpInst::ICMP_NE, - exprVal, nullPtrValue, "ptr_ne_NULL"); - - if (toType->IsVaryingType()) { - if (fromType->IsUniformType()) - cmp = ctx->SmearScalar(cmp); - cmp = ctx->I1VecToBoolVec(cmp); - } - - return cmp; - } - const AtomicType *fromAtomic = dynamic_cast(fromType); // at this point, coming from an atomic type is all that's left... assert(fromAtomic != NULL); @@ -4908,6 +5309,17 @@ TypeCastExpr::GetType() const { } +static const Type * +lDeconstifyType(const Type *t) { + const PointerType *pt = dynamic_cast(t); + if (pt != NULL) + return new PointerType(lDeconstifyType(pt->GetBaseType()), + pt->IsUniformType(), false); + else + return t->GetAsNonConstType(); +} + + Expr * TypeCastExpr::TypeCheck() { if (expr != NULL) @@ -4926,64 +5338,30 @@ TypeCastExpr::TypeCheck() { return tce->TypeCheck(); } - const char *toTypeString = toType->GetString().c_str(); - const char *fromTypeString = fromType->GetString().c_str(); + fromType = lDeconstifyType(fromType); + toType = lDeconstifyType(toType); - // It's an error to attempt to convert from varying to uniform - if (toType->IsUniformType() && !fromType->IsUniformType()) { - Error(pos, "Can't assign 'varying' value to 'uniform' type \"%s\".", - toTypeString); + // First some special cases that we allow only with an explicit type cast + const PointerType *ptFrom = dynamic_cast(fromType); + const PointerType *ptTo = dynamic_cast(toType); + if (ptFrom != NULL && ptTo != NULL) + // allow explicit typecasts between any two different pointer types + return this; + + const AtomicType *fromAtomic = dynamic_cast(fromType); + const AtomicType *toAtomic = dynamic_cast(toType); + const EnumType *fromEnum = dynamic_cast(fromType); + const EnumType *toEnum = dynamic_cast(toType); + if ((fromAtomic || fromEnum) && (toAtomic || toEnum)) + // Allow explicit casts between all of these + return this; + + // And otherwise see if it's one of the conversions allowed to happen + // implicitly. 
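// (Hedged sketch of the intent: a cast like "(uniform int32 *)floatPtr" is accepted by the pointer special case above only because it is written explicitly, and "(uniform int32)enumValue" by the atomic/enum case; an illegal conversion such as varying-to-uniform still fails below, with CanConvertTypes() now issuing the diagnostic itself since it is given an error message and source position.)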
+ if (CanConvertTypes(fromType, toType, "type cast expression", pos) == false) return NULL; - } - // And any kind of void type in a type cast doesn't make sense - if (toType == AtomicType::Void || fromType == AtomicType::Void) { - Error(pos, "Void type illegal in type cast from type \"%s\" to " - "type \"%s\".", fromTypeString, toTypeString); - return NULL; - } - - // FIXME: do we need to worry more about references here? - - if (dynamic_cast(fromType) != NULL) { - // Starting from a vector type; the result type must be a vector - // type as well - if (dynamic_cast(toType) == NULL) { - Error(pos, "Can't convert vector type \"%s\" to non-vector type \"%s\".", - fromTypeString, toTypeString); - return NULL; - } - - // And the two vectors must have the same number of elements - if (dynamic_cast(toType)->GetElementCount() != - dynamic_cast(fromType)->GetElementCount()) { - Error(pos, "Can't convert vector type \"%s\" to differently-sized " - "vector type \"%s\".", fromTypeString, toTypeString); - return NULL; - } - - // And we're ok; since vectors can only hold AtomicTypes, we know - // that type converting the elements will work. - return this; - } - else if (dynamic_cast(fromType) != NULL) { - FATAL("Shouldn't ever get here"); - return this; - } - else { - assert(dynamic_cast(fromType) != NULL || - dynamic_cast(fromType) != NULL); - // If we're going from an atomic or enum type, the only possible - // result is another atomic or enum type - if (dynamic_cast(toType) == NULL && - dynamic_cast(toType) == NULL) { - Error(pos, "Can't convert from type \"%s\" to \"%s\".", - fromTypeString, toTypeString); - return NULL; - } - - return this; - } + return this; } @@ -5113,13 +5491,13 @@ TypeCastExpr::GetConstant(const Type *constType) const { if (ec == NULL) return NULL; + ec = llvm::ConstantExpr::getPtrToInt(ec, LLVMTypes::PointerIntType); + + assert(type->IsVaryingType()); std::vector smear; for (int i = 0; i < g->target.vectorWidth; ++i) smear.push_back(ec); - LLVM_TYPE_CONST llvm::ArrayType *llvmVaryingType = - llvm::dyn_cast(type->LLVMType(g->ctx)); - assert(llvmVaryingType != NULL); - return llvm::ConstantArray::get(llvmVaryingType, smear); + return llvm::ConstantVector::get(smear); } @@ -5154,7 +5532,20 @@ ReferenceExpr::GetType() const { if (!type) return NULL; - return new ReferenceType(type, false); + return new ReferenceType(type); +} + + +const Type * +ReferenceExpr::GetLValueType() const { + if (!expr) + return NULL; + + const Type *type = expr->GetType(); + if (!type) + return NULL; + + return PointerType::GetUniform(type); } @@ -5213,12 +5604,16 @@ DereferenceExpr::GetValue(FunctionEmitContext *ctx) const { llvm::Value *ptr = expr->GetValue(ctx); if (ptr == NULL) return NULL; - const Type *type = GetType(); + const Type *type = expr->GetType(); if (type == NULL) return NULL; + Symbol *baseSym = expr->GetBaseSymbol(); + assert(baseSym != NULL); + llvm::Value *mask = lMaskForSymbol(baseSym, ctx); + ctx->SetDebugPos(pos); - return ctx->LoadInst(ptr, NULL, type, "reference_load"); + return ctx->LoadInst(ptr, mask, type, "deref_load"); } @@ -5230,6 +5625,14 @@ DereferenceExpr::GetLValue(FunctionEmitContext *ctx) const { } +const Type * +DereferenceExpr::GetLValueType() const { + if (expr == NULL) + return NULL; + return expr->GetType(); +} + + Symbol * DereferenceExpr::GetBaseSymbol() const { return expr ? expr->GetBaseSymbol() : NULL; @@ -5238,8 +5641,20 @@ DereferenceExpr::GetBaseSymbol() const { const Type * DereferenceExpr::GetType() const { - return (expr && expr->GetType()) ? 
expr->GetType()->GetReferenceTarget() : - NULL; + if (expr == NULL) + return NULL; + const Type *exprType = expr->GetType(); + if (exprType == NULL) + return NULL; + if (dynamic_cast(exprType) != NULL) + return exprType->GetReferenceTarget(); + else { + assert(dynamic_cast(exprType) != NULL); + if (exprType->IsUniformType()) + return exprType->GetBaseType(); + else + return exprType->GetBaseType()->GetAsVaryingType(); + } } @@ -5265,7 +5680,17 @@ DereferenceExpr::Optimize() { int DereferenceExpr::EstimateCost() const { - return COST_DEREF; + if (expr == NULL) + return 0; + + const Type *exprType = expr->GetType(); + if (dynamic_cast(exprType) && + exprType->IsVaryingType()) + // Be pessimistic; some of these will later be optimized into + // vector loads/stores.. + return COST_GATHER + COST_DEREF; + else + return COST_DEREF; } @@ -5281,6 +5706,150 @@ DereferenceExpr::Print() const { } +/////////////////////////////////////////////////////////////////////////// +// AddressOfExpr + +AddressOfExpr::AddressOfExpr(Expr *e, SourcePos p) + : Expr(p), expr(e) { +} + + +llvm::Value * +AddressOfExpr::GetValue(FunctionEmitContext *ctx) const { + ctx->SetDebugPos(pos); + if (expr == NULL) + return NULL; + + const Type *exprType = expr->GetType(); + if (dynamic_cast(exprType) != NULL) + return expr->GetValue(ctx); + else + return expr->GetLValue(ctx); +} + + +const Type * +AddressOfExpr::GetType() const { + if (expr == NULL) + return NULL; + + const Type *exprType = expr->GetType(); + if (dynamic_cast(exprType) != NULL) + return PointerType::GetUniform(exprType->GetReferenceTarget()); + else + return expr->GetLValueType(); +} + + +Symbol * +AddressOfExpr::GetBaseSymbol() const { + return expr ? expr->GetBaseSymbol() : NULL; +} + + +void +AddressOfExpr::Print() const { + printf("&("); + if (expr) + expr->Print(); + else + printf("NULL expr"); + printf(")"); + pos.Print(); +} + + +Expr * +AddressOfExpr::TypeCheck() { + if (expr != NULL) + expr = expr->TypeCheck(); + return this; +} + + +Expr * +AddressOfExpr::Optimize() { + if (expr != NULL) + expr = expr->Optimize(); + return this; +} + + +int +AddressOfExpr::EstimateCost() const { + return 0; +} + + +/////////////////////////////////////////////////////////////////////////// +// SizeOfExpr + +SizeOfExpr::SizeOfExpr(Expr *e, SourcePos p) + : Expr(p), expr(e), type(NULL) { +} + + +SizeOfExpr::SizeOfExpr(const Type *t, SourcePos p) + : Expr(p), expr(NULL), type(t) { +} + + +llvm::Value * +SizeOfExpr::GetValue(FunctionEmitContext *ctx) const { + ctx->SetDebugPos(pos); + const Type *t = expr ? expr->GetType() : type; + if (t == NULL) + return NULL; + + LLVM_TYPE_CONST llvm::Type *llvmType = t->LLVMType(g->ctx); + if (llvmType == NULL) + return NULL; + + return g->target.SizeOf(llvmType); +} + + +const Type * +SizeOfExpr::GetType() const { + return (g->target.is32Bit || g->opt.force32BitAddressing) ? + AtomicType::UniformUInt32 : AtomicType::UniformUInt64; +} + + +void +SizeOfExpr::Print() const { + printf("Sizeof ("); + if (expr != NULL) + expr->Print(); + const Type *t = expr ? 
expr->GetType() : type; + if (t != NULL) + printf(" [type %s]", t->GetString().c_str()); + printf(")"); + pos.Print(); +} + + +Expr * +SizeOfExpr::TypeCheck() { + if (expr != NULL) + expr = expr->TypeCheck(); + return this; +} + + +Expr * +SizeOfExpr::Optimize() { + if (expr != NULL) + expr = expr->Optimize(); + return this; +} + + +int +SizeOfExpr::EstimateCost() const { + return 0; +} + /////////////////////////////////////////////////////////////////////////// // SymbolExpr @@ -5296,7 +5865,7 @@ SymbolExpr::GetValue(FunctionEmitContext *ctx) const { if (!symbol || !symbol->storagePtr) return NULL; ctx->SetDebugPos(pos); - return ctx->LoadInst(symbol->storagePtr, NULL, NULL, symbol->name.c_str()); + return ctx->LoadInst(symbol->storagePtr, symbol->name.c_str()); } @@ -5309,6 +5878,15 @@ SymbolExpr::GetLValue(FunctionEmitContext *ctx) const { } +const Type * +SymbolExpr::GetLValueType() const { + if (symbol == NULL) + return NULL; + + return PointerType::GetUniform(symbol->type); +} + + Symbol * SymbolExpr::GetBaseSymbol() const { return symbol; @@ -5432,8 +6010,7 @@ FunctionSymbolExpr::GetConstant(const Type *type) const { assert(type->IsUniformType()); assert(GetType()->IsUniformType()); - if (Type::Equal(type->GetAsConstType(), - GetType()->GetAsConstType()) == false) + if (Type::EqualIgnoringConst(type, GetType()) == false) return NULL; return matchingFunc ? matchingFunc->function : NULL; @@ -5448,38 +6025,32 @@ lGetFunctionDeclaration(const std::string &name, const FunctionType *type) { ret += name; ret += "("; - const std::vector &argTypes = type->GetArgumentTypes(); - const std::vector &argDefaults = type->GetArgumentDefaults(); + for (int i = 0; i < type->GetNumParameters(); ++i) { + const Type *paramType = type->GetParameterType(i); + ConstExpr *paramDefault = type->GetParameterDefault(i); - for (unsigned int i = 0; i < argTypes.size(); ++i) { - // If the parameter is a reference to an array, just print its type - // as the array type, since we always pass arrays by reference. - if (dynamic_cast(argTypes[i]) && - dynamic_cast(argTypes[i]->GetReferenceTarget())) - ret += argTypes[i]->GetReferenceTarget()->GetString(); - else - ret += argTypes[i]->GetString(); + ret += paramType->GetString(); ret += " "; - ret += type->GetArgumentName(i); + ret += type->GetParameterName(i); // Print the default value if present - if (argDefaults[i] != NULL) { + if (paramDefault != NULL) { char buf[32]; - if (argTypes[i]->IsFloatType()) { + if (paramType->IsFloatType()) { double val; - int count = argDefaults[i]->AsDouble(&val); + int count = paramDefault->AsDouble(&val); assert(count == 1); sprintf(buf, " = %g", val); } - else if (argTypes[i]->IsBoolType()) { + else if (paramType->IsBoolType()) { bool val; - int count = argDefaults[i]->AsBool(&val); + int count = paramDefault->AsBool(&val); assert(count == 1); sprintf(buf, " = %s", val ? 
"true" : "false"); } - else if (argTypes[i]->IsUnsignedType()) { + else if (paramType->IsUnsignedType()) { uint64_t val; - int count = argDefaults[i]->AsUInt64(&val); + int count = paramDefault->AsUInt64(&val); assert(count == 1); #ifdef ISPC_IS_LINUX sprintf(buf, " = %lu", val); @@ -5489,7 +6060,7 @@ lGetFunctionDeclaration(const std::string &name, const FunctionType *type) { } else { int64_t val; - int count = argDefaults[i]->AsInt64(&val); + int count = paramDefault->AsInt64(&val); assert(count == 1); #ifdef ISPC_IS_LINUX sprintf(buf, " = %ld", val); @@ -5499,7 +6070,7 @@ lGetFunctionDeclaration(const std::string &name, const FunctionType *type) { } ret += buf; } - if (i != argTypes.size() - 1) + if (i != type->GetNumParameters() - 1) ret += ", "; } ret += ")"; @@ -5564,7 +6135,7 @@ lExactMatch(const Type *callType, const Type *funcArgType) { callType = callType->GetAsNonConstType(); if (dynamic_cast(funcArgType) != NULL && dynamic_cast(callType) == NULL) - callType = new ReferenceType(callType, funcArgType->IsConstType()); + callType = new ReferenceType(callType); return Type::Equal(callType, funcArgType) ? 0 : -1; } @@ -5737,43 +6308,43 @@ FunctionSymbolExpr::tryResolve(int (*matchFunc)(const Type *, const Type *), const FunctionType *ft = dynamic_cast(candidateFunction->type); assert(ft != NULL); - const std::vector &funcArgTypes = ft->GetArgumentTypes(); - const std::vector &argumentDefaults = ft->GetArgumentDefaults(); // There's no way to match if the caller is passing more arguments // than this function instance takes. - if (callTypes.size() > funcArgTypes.size()) + if ((int)callTypes.size() > ft->GetNumParameters()) continue; - unsigned int i; + int i; // Note that we're looping over the caller arguments, not the // function arguments; it may be ok to have more arguments to the // function than are passed, if the function has default argument // values. This case is handled below. int cost = 0; - for (i = 0; i < callTypes.size(); ++i) { + for (i = 0; i < (int)callTypes.size(); ++i) { // This may happen if there's an error earlier in compilation. // It's kind of a silly to redundantly discover this for each // potential match versus detecting this earlier in the // matching process and just giving up. - if (callTypes[i] == NULL || funcArgTypes[i] == NULL || + const Type *paramType = ft->GetParameterType(i); + if (callTypes[i] == NULL || paramType == NULL || dynamic_cast(callTypes[i]) != NULL) return false; - int argCost = matchFunc(callTypes[i], funcArgTypes[i]); + int argCost = matchFunc(callTypes[i], paramType); if (argCost == -1) // If the predicate function returns -1, we have failed no // matter what else happens, so we stop trying break; cost += argCost; } - if (i == callTypes.size()) { + if (i == (int)callTypes.size()) { // All of the arguments matched! - if (i == funcArgTypes.size()) + if (i == ft->GetNumParameters()) // And we have exactly as many arguments as the function // wants, so we're done. matches.push_back(std::make_pair(cost, candidateFunction)); - else if (i < funcArgTypes.size() && argumentDefaults[i] != NULL) + else if (i < ft->GetNumParameters() && + ft->GetParameterDefault(i) != NULL) // Otherwise we can still make it if there are default // arguments for the rest of the arguments! Because in // Module::AddFunction() we have verified that once the diff --git a/expr.h b/expr.h index 17ec622b..a6720c03 100644 --- a/expr.h +++ b/expr.h @@ -65,6 +65,10 @@ public: /** Returns the Type of the expression. 
*/ virtual const Type *GetType() const = 0; + /** Returns the type of the value returned by GetLValue(); this + should be a pointer type of some sort (uniform or varying). */ + virtual const Type *GetLValueType() const; + /** For expressions that have values based on a symbol (e.g. regular symbol references, array indexing, etc.), this returns a pointer to that symbol. */ @@ -266,11 +270,12 @@ public: */ class IndexExpr : public Expr { public: - IndexExpr(Expr *arrayOrVector, Expr *index, SourcePos p); + IndexExpr(Expr *baseExpr, Expr *index, SourcePos p); llvm::Value *GetValue(FunctionEmitContext *ctx) const; llvm::Value *GetLValue(FunctionEmitContext *ctx) const; const Type *GetType() const; + const Type *GetLValueType() const; Symbol *GetBaseSymbol() const; void Print() const; @@ -278,7 +283,7 @@ public: Expr *TypeCheck(); int EstimateCost() const; - Expr *arrayOrVector, *index; + Expr *baseExpr, *index; }; @@ -288,15 +293,13 @@ public: */ class MemberExpr : public Expr { public: - static MemberExpr* create(Expr *expr, const char *identifier, - SourcePos pos, SourcePos identifierPos); - - MemberExpr(Expr *expr, const char *identifier, SourcePos pos, - SourcePos identifierPos); - + static MemberExpr *create(Expr *expr, const char *identifier, + SourcePos pos, SourcePos identifierPos, + bool derefLValue); llvm::Value *GetValue(FunctionEmitContext *ctx) const; llvm::Value *GetLValue(FunctionEmitContext *ctx) const; const Type *GetType() const; + const Type *GetLValueType() const; Symbol *GetBaseSymbol() const; void Print() const; Expr *Optimize(); @@ -310,6 +313,15 @@ public: Expr *expr; std::string identifier; const SourcePos identifierPos; + +protected: + MemberExpr(Expr *expr, const char *identifier, SourcePos pos, + SourcePos identifierPos, bool derefLValue); + + /** Indicates whether the expression should be dereferenced before the + member is found. (i.e. this is true if the MemberExpr was a '->' + operator, and is false if it was a '.' operator.) */ + bool dereferenceExpr; }; @@ -506,6 +518,7 @@ public: llvm::Value *GetValue(FunctionEmitContext *ctx) const; const Type *GetType() const; + const Type *GetLValueType() const; Symbol *GetBaseSymbol() const; void Print() const; Expr *TypeCheck(); @@ -525,6 +538,7 @@ public: llvm::Value *GetValue(FunctionEmitContext *ctx) const; llvm::Value *GetLValue(FunctionEmitContext *ctx) const; const Type *GetType() const; + const Type *GetLValueType() const; Symbol *GetBaseSymbol() const; void Print() const; Expr *TypeCheck(); @@ -535,6 +549,44 @@ public: }; +/** Expression that represents taking the address of an expression. */ +class AddressOfExpr : public Expr { +public: + AddressOfExpr(Expr *e, SourcePos p); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + Symbol *GetBaseSymbol() const; + void Print() const; + Expr *TypeCheck(); + Expr *Optimize(); + int EstimateCost() const; + + Expr *expr; +}; + + +/** Expression that returns the size of the given expression or type in + bytes. */ +class SizeOfExpr : public Expr { +public: + SizeOfExpr(Expr *e, SourcePos p); + SizeOfExpr(const Type *t, SourcePos p); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + void Print() const; + Expr *TypeCheck(); + Expr *Optimize(); + int EstimateCost() const; + + /* One of expr or type should be non-NULL (but not both of them). The + SizeOfExpr returns the size of whichever one of them isn't NULL.
@@ -543,6 +595,7 @@ public:
     llvm::Value *GetValue(FunctionEmitContext *ctx) const;
     llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
     const Type *GetType() const;
+    const Type *GetLValueType() const;
     Symbol *GetBaseSymbol() const;
     Expr *TypeCheck();
     Expr *Optimize();
@@ -623,9 +676,13 @@ public:

 /** This function indicates whether it's legal to convert from fromType to
-    toType.
+    toType.  If the optional errorMsgBase and source position parameters
+    are provided, then an error message is issued if the type conversion
+    isn't possible.
  */
-bool CanConvertTypes(const Type *fromType, const Type *toType);
+bool CanConvertTypes(const Type *fromType, const Type *toType,
+                     const char *errorMsgBase = NULL,
+                     SourcePos pos = SourcePos());

 /** This function attempts to convert the given expression to the given
     type, returning a pointer to a new expression that is the result.  If
diff --git a/func.cpp b/func.cpp
index 5be26871..7c6895a2 100644
--- a/func.cpp
+++ b/func.cpp
@@ -74,10 +74,32 @@ Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
     maskSymbol = m->symbolTable->LookupVariable("__mask");
     assert(maskSymbol != NULL);

-    if (code) {
+    if (code != NULL) {
+        if (g->debugPrint) {
+            fprintf(stderr, "Creating function \"%s\".  Initial code:\n",
+                    sym->name.c_str());
+            code->Print(0);
+            fprintf(stderr, "---------------------\n");
+        }
+
         code = code->TypeCheck();
-        if (code)
+
+        if (code != NULL && g->debugPrint) {
+            fprintf(stderr, "After typechecking function \"%s\":\n",
+                    sym->name.c_str());
+            code->Print(0);
+            fprintf(stderr, "---------------------\n");
+        }
+
+        if (code != NULL) {
             code = code->Optimize();
+            if (g->debugPrint) {
+                fprintf(stderr, "After optimizing function \"%s\":\n",
+                        sym->name.c_str());
+                code->Print(0);
+                fprintf(stderr, "---------------------\n");
+            }
+        }
     }

     if (g->debugPrint) {
@@ -149,11 +171,11 @@ lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol *> &args,
     sym->storagePtr = ctx->AllocaInst(argType, sym->name.c_str());

     // get a pointer to the value in the struct
-    llvm::Value *ptr = ctx->GetElementPtrInst(structArgPtr, 0, i, sym->name.c_str());
+    llvm::Value *ptr = ctx->AddElementOffset(structArgPtr, i, NULL, sym->name.c_str());

     // and copy the value from the struct and into the local alloca'ed
     // memory
-    llvm::Value *ptrval = ctx->LoadInst(ptr, NULL, NULL, sym->name.c_str());
+    llvm::Value *ptrval = ctx->LoadInst(ptr, sym->name.c_str());
     ctx->StoreInst(ptrval, sym->storagePtr);
     ctx->EmitFunctionParameterDebugInfo(sym);
 }
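The GetElementPtrInst() → AddElementOffset() change above is the heart of the new addressing scheme: instead of a {0, i} GEP, the element's byte offset is added to the base pointer as explicit integer math, so every address computation funnels through code that can honor force32BitAddressing. Roughly, in terms of the LLVM C++ API of this era (a hypothetical helper, not the actual FunctionEmitContext method):

    #include "llvm/Support/IRBuilder.h"    // LLVM 2.9-era header path
    #include "llvm/Target/TargetData.h"

    // Hypothetical helper: compute a pointer to struct element 'element'
    // by byte-offset arithmetic rather than a multi-index GEP.
    static llvm::Value *
    lAddElementOffset(llvm::IRBuilder<> &builder, const llvm::TargetData &td,
                      llvm::Value *basePtr, llvm::StructType *structType,
                      int element) {
        uint64_t offset =
            td.getStructLayout(structType)->getElementOffset(element);
        // Do the math on a pointer-sized integer...
        llvm::Value *addr =
            builder.CreatePtrToInt(basePtr, builder.getInt64Ty());
        addr = builder.CreateAdd(addr, builder.getInt64(offset));
        // ...then convert back to the original pointer type.
        return builder.CreateIntToPtr(addr, basePtr->getType());
    }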
@@ -200,9 +222,9 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
         // Copy in the mask as well.
         int nArgs = (int)args.size();
         // The mask is the last parameter in the argument structure
-        llvm::Value *ptr = ctx->GetElementPtrInst(structParamPtr, 0, nArgs,
+        llvm::Value *ptr = ctx->AddElementOffset(structParamPtr, nArgs, NULL,
                                                   "task_struct_mask");
-        llvm::Value *ptrval = ctx->LoadInst(ptr, NULL, NULL, "mask");
+        llvm::Value *ptrval = ctx->LoadInst(ptr, "mask");
         ctx->SetFunctionMask(ptrval);

         // Copy threadIndex and threadCount into stack-allocated storage so
@@ -236,7 +258,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
     }

     // If the number of actual function arguments is equal to the
-    // number of declared arguments in decl->functionArgs, then we
+    // number of declared arguments in decl->functionParams, then we
     // don't have a mask parameter, so set it to be all on.  This
     // happens for example with 'export'ed functions that the app
     // calls.
@@ -338,11 +360,8 @@ Function::GenerateIR() {
     if (m->errorCount == 0) {
         if (llvm::verifyFunction(*function,
                                  llvm::ReturnStatusAction) == true) {
-            if (g->debugPrint) {
-                llvm::PassManager ppm;
-                ppm.add(llvm::createPrintModulePass(&llvm::outs()));
-                ppm.run(*m->module);
-            }
+            if (g->debugPrint)
+                function->dump();
             FATAL("Function verification failed");
         }
@@ -376,11 +395,8 @@ Function::GenerateIR() {
             sym->exportedFunction = appFunction;
             if (llvm::verifyFunction(*appFunction,
                                      llvm::ReturnStatusAction) == true) {
-                if (g->debugPrint) {
-                    llvm::PassManager ppm;
-                    ppm.add(llvm::createPrintModulePass(&llvm::outs()));
-                    ppm.run(*m->module);
-                }
+                if (g->debugPrint)
+                    appFunction->dump();
                 FATAL("Function verification failed");
             }
         }
diff --git a/ispc.cpp b/ispc.cpp
index 65dc0b67..188b753e 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -171,7 +171,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
     if (!error) {
         llvm::TargetMachine *targetMachine = t->GetTargetMachine();
         const llvm::TargetData *targetData = targetMachine->getTargetData();
-        t->is32bit = (targetData->getPointerSize() == 4);
+        t->is32Bit = (targetData->getPointerSize() == 4);
     }

     return !error;
@@ -284,8 +284,11 @@ llvm::Value *
 Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type) {
     const llvm::TargetData *td = GetTargetMachine()->getTargetData();
     assert(td != NULL);
-    return is32bit ? LLVMInt32(td->getTypeSizeInBits(type) / 8) :
-        LLVMInt64(td->getTypeSizeInBits(type) / 8);
+    uint64_t byteSize = td->getTypeSizeInBits(type) / 8;
+    if (is32Bit || g->opt.force32BitAddressing)
+        return LLVMInt32(byteSize);
+    else
+        return LLVMInt64(byteSize);
 }


@@ -298,7 +301,12 @@ Target::StructOffset(LLVM_TYPE_CONST llvm::Type *type, int element) {
     assert(structType != NULL);
     const llvm::StructLayout *sl = td->getStructLayout(structType);
     assert(sl != NULL);
-    return LLVMInt32(sl->getElementOffset(element));
+
+    uint64_t offset = sl->getElementOffset(element);
+    if (is32Bit || g->opt.force32BitAddressing)
+        return LLVMInt32(offset);
+    else
+        return LLVMInt64(offset);
 }


@@ -309,6 +317,7 @@ Opt::Opt() {
     level = 1;
     fastMath = false;
     fastMaskedVload = false;
+    force32BitAddressing = false;
     unrollLoops = true;
     disableAsserts = false;
     disableHandlePseudoMemoryOps = false;
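Target::SizeOf() and Target::StructOffset() now share a single sizing rule, which can be stated once as a hypothetical helper (using the document's g->target / g->opt globals; the function name is illustrative, not part of the patch):

    // Sketch of the shared rule: compile-time sizes and offsets are
    // emitted as i32 constants when the target is 32-bit or when
    // force32BitAddressing is enabled, and as i64 constants otherwise.
    static llvm::Constant *
    lSizeOrOffsetConstant(uint64_t value) {
        if (g->target.is32Bit || g->opt.force32BitAddressing)
            return LLVMInt32((int32_t)value);
        else
            return LLVMInt64((int64_t)value);
    }

This is also exactly where the documented >2GB limitation comes from: once offsets are truncated to i32, sizes and offsets beyond 2^31 bytes can no longer be represented.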
 */
     bool unrollLoops;

+    /** Indicates if addressing math will be done with 32-bit math, even
+        on 64-bit systems.  (This is generally noticeably more efficient,
+        though at the cost of not being able to address over 2GB of
+        memory.) */
+    bool force32BitAddressing;
+
     /** Indicates whether assert() statements should be ignored (for
         performance in the generated code). */
     bool disableAsserts;
diff --git a/lex.ll b/lex.ll
index b31315c4..c8ff9b1d 100644
--- a/lex.ll
+++ b/lex.ll
@@ -112,9 +112,12 @@ int64 { return TOKEN_INT64; }
 launch { return TOKEN_LAUNCH; }
 NULL { return TOKEN_NULL; }
 print { return TOKEN_PRINT; }
-reference { return TOKEN_REFERENCE; }
+reference { Error(*yylloc, "\"reference\" qualifier is no longer supported; "
+                 "please use C++-style '&' syntax for references "
+                 "instead."); }
 return { return TOKEN_RETURN; }
 soa { return TOKEN_SOA; }
+sizeof { return TOKEN_SIZEOF; }
 static { return TOKEN_STATIC; }
 struct { return TOKEN_STRUCT; }
 switch { return TOKEN_SWITCH; }
@@ -223,6 +226,7 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
 "&=" { return TOKEN_AND_ASSIGN; }
 "^=" { return TOKEN_XOR_ASSIGN; }
 "|=" { return TOKEN_OR_ASSIGN; }
+"->" { return TOKEN_PTR_OP; }
 ";" { return ';'; }
 ("{"|"<%") { return '{'; }
 ("}"|"%>") { return '}'; }
@@ -266,8 +270,6 @@
 %%

-/*sizeof { return TOKEN_SIZEOF; }*/
-/*"->" { return TOKEN_PTR_OP; }*/
 /*short { return TOKEN_SHORT; }*/
 /*long { return TOKEN_LONG; }*/
 /*signed { return TOKEN_SIGNED; }*/
diff --git a/llvmutil.cpp b/llvmutil.cpp
index f31738cc..34e830d5 100644
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -40,6 +40,7 @@
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::VoidType = NULL;
 LLVM_TYPE_CONST llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
+LLVM_TYPE_CONST llvm::Type *LLVMTypes::PointerIntType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::BoolType = NULL;

 LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8Type = NULL;
@@ -74,7 +75,7 @@
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64VectorPointerType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoubleVectorPointerType = NULL;

-LLVM_TYPE_CONST llvm::ArrayType *LLVMTypes::VoidPointerVectorType = NULL;
+LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::VoidPointerVectorType = NULL;

 llvm::Constant *LLVMTrue = NULL;
 llvm::Constant *LLVMFalse = NULL;
@@ -86,6 +87,8 @@ void
 InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
     LLVMTypes::VoidType = llvm::Type::getVoidTy(*ctx);
     LLVMTypes::VoidPointerType = llvm::PointerType::get(llvm::Type::getInt8Ty(*ctx), 0);
+    LLVMTypes::PointerIntType = target.is32Bit ? llvm::Type::getInt32Ty(*ctx) :
+        llvm::Type::getInt64Ty(*ctx);

     LLVMTypes::BoolType = llvm::Type::getInt1Ty(*ctx);
     LLVMTypes::Int8Type = llvm::Type::getInt8Ty(*ctx);
@@ -130,8 +133,8 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
     LLVMTypes::FloatVectorPointerType = llvm::PointerType::get(LLVMTypes::FloatVectorType, 0);
     LLVMTypes::DoubleVectorPointerType = llvm::PointerType::get(LLVMTypes::DoubleVectorType, 0);

-    LLVMTypes::VoidPointerVectorType =
-        llvm::ArrayType::get(LLVMTypes::VoidPointerType, target.vectorWidth);
+    LLVMTypes::VoidPointerVectorType = g->target.is32Bit ?
+        LLVMTypes::Int32VectorType :
+        LLVMTypes::Int64VectorType;

     LLVMTrue = llvm::ConstantInt::getTrue(*ctx);
     LLVMFalse = llvm::ConstantInt::getFalse(*ctx);
@@ -451,11 +454,3 @@ LLVMBoolVector(const bool *bvec) {
     }
     return llvm::ConstantVector::get(vals);
 }
-
-
-LLVM_TYPE_CONST llvm::ArrayType *
-LLVMPointerVectorType(LLVM_TYPE_CONST llvm::Type *t) {
-    // NOTE: ArrayType, not VectorType
-    return llvm::ArrayType::get(llvm::PointerType::get(t, 0),
-                                g->target.vectorWidth);
-}
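With LLVMPointerVectorType() gone, the working assumption behind the new VoidPointerVectorType is that a varying pointer is stored as a vector of pointer-sized integers (LLVM of this vintage has no vectors of pointers), converted with ptrtoint/inttoptr at each point of use. A sketch of extracting one lane back into a usable scalar pointer (a hypothetical helper, not part of this patch):

    // Hypothetical helper: pull lane 'lane' out of a VoidPointerVectorType
    // value (a vector of pointer-sized integers) and turn it back into a
    // pointer of the given type.
    static llvm::Value *
    lExtractLanePointer(llvm::IRBuilder<> &builder, llvm::Value *ptrVec,
                        unsigned lane, LLVM_TYPE_CONST llvm::Type *ptrType) {
        llvm::Value *laneVal =
            builder.CreateExtractElement(ptrVec, builder.getInt32(lane));
        return builder.CreateIntToPtr(laneVal, ptrType);
    }

Representing the lanes as integers also lets the same force32BitAddressing policy apply to pointer arithmetic on varying pointers, since it is ordinary vector integer math.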
diff --git a/llvmutil.h b/llvmutil.h
index 68397b70..0322b49e 100644
--- a/llvmutil.h
+++ b/llvmutil.h
@@ -52,6 +52,7 @@ struct LLVMTypes {
     static LLVM_TYPE_CONST llvm::Type *VoidType;
     static LLVM_TYPE_CONST llvm::PointerType *VoidPointerType;
+    static LLVM_TYPE_CONST llvm::Type *PointerIntType;
     static LLVM_TYPE_CONST llvm::Type *BoolType;

     static LLVM_TYPE_CONST llvm::Type *Int8Type;
@@ -86,7 +87,7 @@ struct LLVMTypes {
     static LLVM_TYPE_CONST llvm::Type *FloatVectorPointerType;
     static LLVM_TYPE_CONST llvm::Type *DoubleVectorPointerType;

-    static LLVM_TYPE_CONST llvm::ArrayType *VoidPointerVectorType;
+    static LLVM_TYPE_CONST llvm::VectorType *VoidPointerVectorType;
 };

 /** These variables hold the corresponding LLVM constant values as a
@@ -204,10 +205,4 @@
 extern llvm::Constant *LLVMMaskAllOn;
 /** LLVM constant value representing an 'all off' SIMD lane mask */
 extern llvm::Constant *LLVMMaskAllOff;

-/** Given an LLVM type, returns the corresponding type for a vector of
-    pointers to that type.  (In practice, an array of pointers, since LLVM
-    prohibits vectors of pointers.)
- */
-extern LLVM_TYPE_CONST llvm::ArrayType *LLVMPointerVectorType(LLVM_TYPE_CONST llvm::Type *t);
-
 #endif // ISPC_LLVMUTIL_H
diff --git a/main.cpp b/main.cpp
index 11d54213..40ee8550 100644
--- a/main.cpp
+++ b/main.cpp
@@ -83,6 +83,7 @@ static void usage(int ret) {
     printf("    [-o <name>/--outfile=<name>]\tOutput filename (may be \"-\" for standard output)\n");
     printf("    [-O0/-O1]\t\t\t\tSet optimization level (-O1 is default)\n");
     printf("    [--opt=