Add support for pointers to the language.

Pointers can be either uniform or varying, and behave correspondingly. For example, "uniform float * varying" is a varying pointer to uniform float data in memory, and "float * uniform" is a uniform pointer to varying data in memory. Like other types, pointers are varying by default. Pointer-based expressions, & and *, sizeof, ->, pointer arithmetic, and the array/pointer duality all behave as in C. Array arguments to functions are converted to pointers, also like C. There is a built-in NULL for a null pointer value; conversion from compile-time constant 0 values to NULL still needs to be implemented. Other changes: - Syntax for references has been updated to be C++ style; a useful warning is now issued if the "reference" keyword is used. - It is now illegal to pass a varying lvalue as a reference parameter to a function; references are essentially uniform pointers. This case had previously been handled via special case call by value return code. That path has been removed, now that varying pointers are available to handle this use case (and much more). - Some stdlib routines have been updated to take pointers as arguments where appropriate (e.g. prefetch and the atomics). A number of others still need attention. - All of the examples have been updated - Many new tests TODO: documentation
2011-11-21 09:16:29 -08:00
parent 15a7d353ab
commit 975db80ef6
191 changed files with 4746 additions and 3225 deletions
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -114,61 +114,39 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
    // pointers to uniform
    else if (t == LLVMTypes::Int8PointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt8 :
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt8 :
-                                                 AtomicType::UniformInt8, false);
+                                       AtomicType::UniformInt8);
    else if (t == LLVMTypes::Int16PointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt16 :
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt16 :
-                                                 AtomicType::UniformInt16, false);
+                                       AtomicType::UniformInt16);
    else if (t == LLVMTypes::Int32PointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt32 :
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt32 :
-                                                 AtomicType::UniformInt32, false);
+                                       AtomicType::UniformInt32);
    else if (t == LLVMTypes::Int64PointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::UniformUInt64 :
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::UniformUInt64 :
-                                                 AtomicType::UniformInt64, false);
+                                       AtomicType::UniformInt64);
    else if (t == LLVMTypes::FloatPointerType)
-        return new ReferenceType(AtomicType::UniformFloat, false);
+        return PointerType::GetUniform(AtomicType::UniformFloat);
    else if (t == LLVMTypes::DoublePointerType)
-        return new ReferenceType(AtomicType::UniformDouble, false);
+        return PointerType::GetUniform(AtomicType::UniformDouble);
    // pointers to varying
    else if (t == LLVMTypes::Int8VectorPointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt8 :
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt8 :
-                                                 AtomicType::VaryingInt8, false);
+                                       AtomicType::VaryingInt8);
    else if (t == LLVMTypes::Int16VectorPointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt16 :
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt16 :
-                                                 AtomicType::VaryingInt16, false);
+                                       AtomicType::VaryingInt16);
    else if (t == LLVMTypes::Int32VectorPointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt32 :
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt32 :
-                                                 AtomicType::VaryingInt32, false);
+                                       AtomicType::VaryingInt32);
    else if (t == LLVMTypes::Int64VectorPointerType)
-        return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt64 :
+        return PointerType::GetUniform(intAsUnsigned ? AtomicType::VaryingUInt64 :
-                                                 AtomicType::VaryingInt64, false);
+                                       AtomicType::VaryingInt64);
    else if (t == LLVMTypes::FloatVectorPointerType)
-        return new ReferenceType(AtomicType::VaryingFloat, false);
+        return PointerType::GetUniform(AtomicType::VaryingFloat);
    else if (t == LLVMTypes::DoubleVectorPointerType)
-        return new ReferenceType(AtomicType::VaryingDouble, false);
+        return PointerType::GetUniform(AtomicType::VaryingDouble);
    // arrays
    else if (llvm::isa<const llvm::PointerType>(t)) {
        const llvm::PointerType *pt = llvm::dyn_cast<const llvm::PointerType>(t);
        // Is it a pointer to an unsized array of objects?  If so, then
        // create the equivalent ispc type.  Note that it has to be a
        // reference to an array, since ispc passes arrays to functions by
        // reference.
        const llvm::ArrayType *at = 
            llvm::dyn_cast<const llvm::ArrayType>(pt->getElementType());
        if (at != NULL) {
            const Type *eltType = lLLVMTypeToISPCType(at->getElementType(),
                                                      intAsUnsigned);
            if (eltType == NULL)
                return NULL;
            // FIXME: this needs to be fixed when arrays can have 
            // over 4G elements...
            return new ReferenceType(new ArrayType(eltType, (int)at->getNumElements()),
                                     false);
        }
    }
    return NULL;
 }
@@ -184,6 +162,9 @@ lCreateSymbol(const std::string &name, const Type *returnType,
    FunctionType *funcType = new FunctionType(returnType, argTypes, noPos);
    Debug(noPos, "Created builtin symbol \"%s\" [%s]\n", name.c_str(),
          funcType->GetString().c_str());
    Symbol *sym = new Symbol(name, noPos, funcType);
    sym->function = func;
    symbolTable->AddFunction(sym);
@@ -244,7 +225,7 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
        // Iterate over the arguments and try to find their equivalent ispc
        // types.  Track if any of the arguments has an integer type.
-        bool anyIntArgs = false, anyReferenceArgs = false;
+        bool anyIntArgs = false;
        std::vector<const Type *> argTypes;
        for (unsigned int j = 0; j < ftype->getNumParams(); ++j) {
            const llvm::Type *llvmArgType = ftype->getParamType(j);
@@ -256,7 +237,6 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
            }
            anyIntArgs |= 
                (Type::Equal(type, lLLVMTypeToISPCType(llvmArgType, !intAsUnsigned)) == false);
            anyReferenceArgs |= (dynamic_cast<const ReferenceType *>(type) != NULL);
            argTypes.push_back(type);
        }
@@ -264,19 +244,6 @@ lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) {
        // so that we get symbols for things with no integer types!
        if (i == 0 || anyIntArgs == true)
            lCreateSymbol(name, returnType, argTypes, ftype, func, symbolTable);
        // If there are any reference types, also make a variant of the
        // symbol that has them as const references.  This obviously
        // doesn't make sense for many builtins, but we'll give the stdlib
        // the option to call one if it needs one.
        if (anyReferenceArgs == true) {
            for (unsigned int j = 0; j < argTypes.size(); ++j) {
                if (dynamic_cast<const ReferenceType *>(argTypes[j]) != NULL)
                    argTypes[j] = argTypes[j]->GetAsConstType();
                lCreateSymbol(name + "_refsconst", returnType, argTypes, 
                              ftype, func, symbolTable);
            }
        }
    }
    return true;
@@ -476,62 +443,10 @@ lSetInternalFunctions(llvm::Module *module) {
        "__packed_store_active",
        "__popcnt_int32",
        "__popcnt_int64",
-        "__prefetch_read_1_uniform_bool",
+        "__prefetch_read_uniform_1",
-        "__prefetch_read_1_uniform_double",
+        "__prefetch_read_uniform_2",
-        "__prefetch_read_1_uniform_float",
+        "__prefetch_read_uniform_3",
-        "__prefetch_read_1_uniform_int16",
+        "__prefetch_read_uniform_nt",
        "__prefetch_read_1_uniform_int32",
        "__prefetch_read_1_uniform_int64",
        "__prefetch_read_1_uniform_int8",
        "__prefetch_read_1_varying_bool",
        "__prefetch_read_1_varying_double",
        "__prefetch_read_1_varying_float",
        "__prefetch_read_1_varying_int16",
        "__prefetch_read_1_varying_int32",
        "__prefetch_read_1_varying_int64",
        "__prefetch_read_1_varying_int8",
        "__prefetch_read_2_uniform_bool",
        "__prefetch_read_2_uniform_double",
        "__prefetch_read_2_uniform_float",
        "__prefetch_read_2_uniform_int16",
        "__prefetch_read_2_uniform_int32",
        "__prefetch_read_2_uniform_int64",
        "__prefetch_read_2_uniform_int8",
        "__prefetch_read_2_varying_bool",
        "__prefetch_read_2_varying_double",
        "__prefetch_read_2_varying_float",
        "__prefetch_read_2_varying_int16",
        "__prefetch_read_2_varying_int32",
        "__prefetch_read_2_varying_int64",
        "__prefetch_read_2_varying_int8",
        "__prefetch_read_3_uniform_bool",
        "__prefetch_read_3_uniform_double",
        "__prefetch_read_3_uniform_float",
        "__prefetch_read_3_uniform_int16",
        "__prefetch_read_3_uniform_int32",
        "__prefetch_read_3_uniform_int64",
        "__prefetch_read_3_uniform_int8",
        "__prefetch_read_3_varying_bool",
        "__prefetch_read_3_varying_double",
        "__prefetch_read_3_varying_float",
        "__prefetch_read_3_varying_int16",
        "__prefetch_read_3_varying_int32",
        "__prefetch_read_3_varying_int64",
        "__prefetch_read_3_varying_int8",
        "__prefetch_read_nt_uniform_bool",
        "__prefetch_read_nt_uniform_double",
        "__prefetch_read_nt_uniform_float",
        "__prefetch_read_nt_uniform_int16",
        "__prefetch_read_nt_uniform_int32",
        "__prefetch_read_nt_uniform_int64",
        "__prefetch_read_nt_uniform_int8",
        "__prefetch_read_nt_varying_bool",
        "__prefetch_read_nt_varying_double",
        "__prefetch_read_nt_varying_float",
        "__prefetch_read_nt_varying_int16",
        "__prefetch_read_nt_varying_int32",
        "__prefetch_read_nt_varying_int64",
        "__prefetch_read_nt_varying_int8",
        "__rcp_uniform_float",
        "__rcp_varying_float",
        "__reduce_add_double",
@@ -747,7 +662,7 @@ void
 DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module,
             bool includeStdlibISPC) {
    // Add the definitions from the compiled builtins-c.c file
-    if (g->target.is32bit) {
+    if (g->target.is32Bit) {
        extern unsigned char builtins_bitcode_c_32[];
        extern int builtins_bitcode_c_32_length;
        AddBitcodeToModule(builtins_bitcode_c_32, builtins_bitcode_c_32_length, 
--- a/builtins.m4
+++ b/builtins.m4
@@ -822,40 +822,6 @@ define $2 @__atomic_compare_exchange_uniform_$3_global($2* %ptr, $2 %cmp,
 }
 ')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; prefetch definitions
 ; prefetch has a new parameter in LLVM3.0, to distinguish between instruction
 ; and data caches--the declaration is now:
 ; declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality,
 ;                             i32 %cachetype)  (cachetype 1 == data cache)
 ; however, the version below seems to still work...
 declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality)
 define(`prefetch_read', `
 define void @__prefetch_read_1_$1($2 *) alwaysinline {
  %ptr8 = bitcast $2 * %0 to i8 *
  call void @llvm.prefetch(i8 * %ptr8, i32 0, i32 3)
  ret void
 }
 define void @__prefetch_read_2_$1($2 *) alwaysinline {
  %ptr8 = bitcast $2 * %0 to i8 *
  call void @llvm.prefetch(i8 * %ptr8, i32 0, i32 2)
  ret void
 }
 define void @__prefetch_read_3_$1($2 *) alwaysinline {
  %ptr8 = bitcast $2 * %0 to i8 *
  call void @llvm.prefetch(i8 * %ptr8, i32 0, i32 1)
  ret void
 }
 define void @__prefetch_read_nt_$1($2 *) alwaysinline {
  %ptr8 = bitcast $2 * %0 to i8 *
  call void @llvm.prefetch(i8 * %ptr8, i32 0, i32 0)
  ret void
 }
 ')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 define(`stdlib_core', `
@@ -916,15 +882,25 @@ declare void @__pseudo_masked_store_64(<$1 x i64> * nocapture, <$1 x i64>, <$1 x
 ; converts them to native gather functions or converts them to vector
 ; loads, if equivalent.
-declare <$1 x i8>  @__pseudo_gather_8([$1 x i8 *], <$1 x i32>) nounwind readonly
+declare <$1 x i8>  @__pseudo_gather32_8(<$1 x i32>, <$1 x i32>) nounwind readonly
-declare <$1 x i16> @__pseudo_gather_16([$1 x i8 *], <$1 x i32>) nounwind readonly
+declare <$1 x i16> @__pseudo_gather32_16(<$1 x i32>, <$1 x i32>) nounwind readonly
-declare <$1 x i32> @__pseudo_gather_32([$1 x i8 *], <$1 x i32>) nounwind readonly
+declare <$1 x i32> @__pseudo_gather32_32(<$1 x i32>, <$1 x i32>) nounwind readonly
-declare <$1 x i64> @__pseudo_gather_64([$1 x i8 *], <$1 x i32>) nounwind readonly
+declare <$1 x i64> @__pseudo_gather32_64(<$1 x i32>, <$1 x i32>) nounwind readonly
-declare <$1 x i8>  @__pseudo_gather_base_offsets_8(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
+declare <$1 x i8>  @__pseudo_gather64_8(<$1 x i64>, <$1 x i32>) nounwind readonly
-declare <$1 x i16> @__pseudo_gather_base_offsets_16(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
+declare <$1 x i16> @__pseudo_gather64_16(<$1 x i64>, <$1 x i32>) nounwind readonly
-declare <$1 x i32> @__pseudo_gather_base_offsets_32(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
+declare <$1 x i32> @__pseudo_gather64_32(<$1 x i64>, <$1 x i32>) nounwind readonly
-declare <$1 x i64> @__pseudo_gather_base_offsets_64(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
+declare <$1 x i64> @__pseudo_gather64_64(<$1 x i64>, <$1 x i32>) nounwind readonly
 declare <$1 x i8>  @__pseudo_gather_base_offsets32_8(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
 declare <$1 x i16> @__pseudo_gather_base_offsets32_16(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
 declare <$1 x i32> @__pseudo_gather_base_offsets32_32(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
 declare <$1 x i64> @__pseudo_gather_base_offsets32_64(i8 *, <$1 x i32>, <$1 x i32>) nounwind readonly
 declare <$1 x i8>  @__pseudo_gather_base_offsets64_8(i8 *, <$1 x i64>, <$1 x i32>) nounwind readonly
 declare <$1 x i16> @__pseudo_gather_base_offsets64_16(i8 *, <$1 x i64>, <$1 x i32>) nounwind readonly
 declare <$1 x i32> @__pseudo_gather_base_offsets64_32(i8 *, <$1 x i64>, <$1 x i32>) nounwind readonly
 declare <$1 x i64> @__pseudo_gather_base_offsets64_64(i8 *, <$1 x i64>, <$1 x i32>) nounwind readonly
 ; Similarly to the pseudo-gathers defined above, we also declare undefined
 ; pseudo-scatter instructions with signatures:
@@ -949,18 +925,32 @@ declare <$1 x i64> @__pseudo_gather_base_offsets_64(i8 *, <$1 x i32>, <$1 x i32>
 ; And the GSImprovementsPass in turn converts these to actual native
 ; scatters or masked stores.  
-declare void @__pseudo_scatter_8([$1 x i8 *], <$1 x i8>, <$1 x i32>) nounwind
+declare void @__pseudo_scatter32_8(<$1 x i32>, <$1 x i8>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_16([$1 x i8 *], <$1 x i16>, <$1 x i32>) nounwind
+declare void @__pseudo_scatter32_16(<$1 x i32>, <$1 x i16>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_32([$1 x i8 *], <$1 x i32>, <$1 x i32>) nounwind
+declare void @__pseudo_scatter32_32(<$1 x i32>, <$1 x i32>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_64([$1 x i8 *], <$1 x i64>, <$1 x i32>) nounwind
+declare void @__pseudo_scatter32_64(<$1 x i32>, <$1 x i64>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_base_offsets_8(i8 * nocapture, <$1 x i32>,
+declare void @__pseudo_scatter64_8(<$1 x i64>, <$1 x i8>, <$1 x i32>) nounwind
 declare void @__pseudo_scatter64_16(<$1 x i64>, <$1 x i16>, <$1 x i32>) nounwind
 declare void @__pseudo_scatter64_32(<$1 x i64>, <$1 x i32>, <$1 x i32>) nounwind
 declare void @__pseudo_scatter64_64(<$1 x i64>, <$1 x i64>, <$1 x i32>) nounwind
 declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, <$1 x i32>,
                                                <$1 x i8>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_base_offsets_16(i8 * nocapture, <$1 x i32>,
+declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, <$1 x i32>,
                                                 <$1 x i16>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_base_offsets_32(i8 * nocapture, <$1 x i32>,
+declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, <$1 x i32>,
                                                 <$1 x i32>, <$1 x i32>) nounwind
-declare void @__pseudo_scatter_base_offsets_64(i8 * nocapture, <$1 x i32>,
+declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, <$1 x i32>,
                                                 <$1 x i64>, <$1 x i32>) nounwind
 declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, <$1 x i64>,
                                                <$1 x i8>, <$1 x i32>) nounwind
 declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, <$1 x i64>,
                                                 <$1 x i16>, <$1 x i32>) nounwind
 declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, <$1 x i64>,
                                                 <$1 x i32>, <$1 x i32>) nounwind
 declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, <$1 x i64>,
                                                 <$1 x i64>, <$1 x i32>) nounwind
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -1634,11 +1624,10 @@ define void
 ;; versions to be called from stdlib
 define void
-@__aos_to_soa4_float([0 x float] * noalias %base, i32 %offset,
+@__aos_to_soa4_float(float * noalias %pf, i32 %offset,
        <$1 x float> * noalias %out0, <$1 x float> * noalias %out1,
        <$1 x float> * noalias %out2, <$1 x float> * noalias %out3)
        nounwind alwaysinline { 
  %pf = bitcast [0 x float] * %base to float *
  %p = getelementptr float * %pf, i32 %offset
  %p0 = bitcast float * %p to <$1 x float> *
  %v0 = load <$1 x float> * %p0, align 4
@@ -1656,16 +1645,16 @@ define void
 define void
-@__aos_to_soa4_int32([0 x i32] * noalias %base, i32 %offset,
+@__aos_to_soa4_int32(i32 * noalias %base, i32 %offset,
        <$1 x i32> * noalias %out0, <$1 x i32> * noalias %out1,
        <$1 x i32> * noalias %out2, <$1 x i32> * noalias %out3)
        nounwind alwaysinline { 
-  %fbase = bitcast [0 x i32] * %base to [0 x float] *
+  %fbase = bitcast i32 * %base to float *
  %fout0 = bitcast <$1 x i32> * %out0 to <$1 x float> *
  %fout1 = bitcast <$1 x i32> * %out1 to <$1 x float> *
  %fout2 = bitcast <$1 x i32> * %out2 to <$1 x float> *
  %fout3 = bitcast <$1 x i32> * %out3 to <$1 x float> *
-  call void @__aos_to_soa4_float([0 x float] * %fbase, i32 %offset,
+  call void @__aos_to_soa4_float(float * %fbase, i32 %offset,
      <$1 x float> * %fout0, <$1 x float> * %fout1, <$1 x float> * %fout2, 
      <$1 x float> * %fout3)
  ret void
@@ -1674,9 +1663,8 @@ define void
 define void
@__soa_to_aos4_float(<$1 x float> %v0, <$1 x float> %v1, <$1 x float> %v2,
-             <$1 x float> %v3, [0 x float] * noalias %base,
+             <$1 x float> %v3, float * noalias %pf,
             i32 %offset) nounwind alwaysinline { 
  %pf = bitcast [0 x float] * %base to float *
  %p = getelementptr float * %pf, i32 %offset
  %out0 = bitcast float * %p to <$1 x float> *
  %out1 = getelementptr <$1 x float> * %out0, i32 1
@@ -1691,25 +1679,24 @@ define void
 define void
@__soa_to_aos4_int32(<$1 x i32> %v0, <$1 x i32> %v1, <$1 x i32> %v2,
-             <$1 x i32> %v3, [0 x i32] * noalias %base,
+             <$1 x i32> %v3, i32 * noalias %base,
             i32 %offset) nounwind alwaysinline { 
  %fv0 = bitcast <$1 x i32> %v0 to <$1 x float>
  %fv1 = bitcast <$1 x i32> %v1 to <$1 x float>
  %fv2 = bitcast <$1 x i32> %v2 to <$1 x float>
  %fv3 = bitcast <$1 x i32> %v3 to <$1 x float>
-  %fbase = bitcast [0 x i32] * %base to [0 x float] *
+  %fbase = bitcast i32 * %base to float *
  call void @__soa_to_aos4_float(<$1 x float> %fv0, <$1 x float> %fv1, 
-      <$1 x float> %fv2, <$1 x float> %fv3, [0 x float] * %fbase,
+      <$1 x float> %fv2, <$1 x float> %fv3, float * %fbase,
      i32 %offset)
  ret void
 }
 define void
-@__aos_to_soa3_float([0 x float] * noalias %base, i32 %offset,
+@__aos_to_soa3_float(float * noalias %pf, i32 %offset,
        <$1 x float> * %out0, <$1 x float> * %out1,
        <$1 x float> * %out2) nounwind alwaysinline { 
  %pf = bitcast [0 x float] * %base to float *
  %p = getelementptr float * %pf, i32 %offset
  %p0 = bitcast float * %p to <$1 x float> *
  %v0 = load <$1 x float> * %p0, align 4
@@ -1725,14 +1712,14 @@ define void
 define void
-@__aos_to_soa3_int32([0 x i32] * noalias %base, i32 %offset,
+@__aos_to_soa3_int32(i32 * noalias %base, i32 %offset,
        <$1 x i32> * noalias %out0, <$1 x i32> * noalias %out1,
        <$1 x i32> * noalias %out2) nounwind alwaysinline { 
-  %fbase = bitcast [0 x i32] * %base to [0 x float] *
+  %fbase = bitcast i32 * %base to float *
  %fout0 = bitcast <$1 x i32> * %out0 to <$1 x float> *
  %fout1 = bitcast <$1 x i32> * %out1 to <$1 x float> *
  %fout2 = bitcast <$1 x i32> * %out2 to <$1 x float> *
-  call void @__aos_to_soa3_float([0 x float] * %fbase, i32 %offset,
+  call void @__aos_to_soa3_float(float * %fbase, i32 %offset,
      <$1 x float> * %fout0, <$1 x float> * %fout1, <$1 x float> * %fout2)
  ret void
 }
@@ -1740,8 +1727,7 @@ define void
 define void
@__soa_to_aos3_float(<$1 x float> %v0, <$1 x float> %v1, <$1 x float> %v2,
-             [0 x float] * noalias %base, i32 %offset) nounwind alwaysinline { 
+                     float * noalias %pf, i32 %offset) nounwind alwaysinline { 
  %pf = bitcast [0 x float] * %base to float *
  %p = getelementptr float * %pf, i32 %offset
  %out0 = bitcast float * %p to <$1 x float> *
  %out1 = getelementptr <$1 x float> * %out0, i32 1
@@ -1755,13 +1741,13 @@ define void
 define void
@__soa_to_aos3_int32(<$1 x i32> %v0, <$1 x i32> %v1, <$1 x i32> %v2,
-             [0 x i32] * noalias %base, i32 %offset) nounwind alwaysinline { 
+                     i32 * noalias %base, i32 %offset) nounwind alwaysinline { 
  %fv0 = bitcast <$1 x i32> %v0 to <$1 x float>
  %fv1 = bitcast <$1 x i32> %v1 to <$1 x float>
  %fv2 = bitcast <$1 x i32> %v2 to <$1 x float>
-  %fbase = bitcast [0 x i32] * %base to [0 x float] *
+  %fbase = bitcast i32 * %base to float *
  call void @__soa_to_aos3_float(<$1 x float> %fv0, <$1 x float> %fv1, 
-      <$1 x float> %fv2, [0 x float] * %fbase, i32 %offset)
+      <$1 x float> %fv2, float * %fbase, i32 %offset)
  ret void
 }
@@ -1769,21 +1755,34 @@ define void
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; prefetching
-prefetch_read(uniform_bool, i1)
+; prefetch has a new parameter in LLVM3.0, to distinguish between instruction
-prefetch_read(uniform_int8, i8)
+; and data caches--the declaration is now:
-prefetch_read(uniform_int16, i16)
+; declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality,
-prefetch_read(uniform_int32, i32)
+;                             i32 %cachetype)  (cachetype 1 == data cache)
-prefetch_read(uniform_int64, i64)
+; however, the version below seems to still work...
-prefetch_read(uniform_float, float)
+
-prefetch_read(uniform_double, double)
+declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality)
 define void @__prefetch_read_uniform_1(i8 *) alwaysinline {
  call void @llvm.prefetch(i8 * %0, i32 0, i32 3)
  ret void
 }
 define void @__prefetch_read_uniform_2(i8 *) alwaysinline {
  call void @llvm.prefetch(i8 * %0, i32 0, i32 2)
  ret void
 }
 define void @__prefetch_read_uniform_3(i8 *) alwaysinline {
  call void @llvm.prefetch(i8 * %0, i32 0, i32 1)
  ret void
 }
 define void @__prefetch_read_uniform_nt(i8 *) alwaysinline {
  call void @llvm.prefetch(i8 * %0, i32 0, i32 0)
  ret void
 }
 prefetch_read(varying_bool, <$1 x i32>)
 prefetch_read(varying_int8, <$1 x i8>)
 prefetch_read(varying_int16, <$1 x i16>)
 prefetch_read(varying_int32, <$1 x i32>)
 prefetch_read(varying_int64, <$1 x i64>)
 prefetch_read(varying_float, <$1 x float>)
 prefetch_read(varying_double, <$1 x double>)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; assert
@@ -2354,11 +2353,10 @@ define void @__masked_store_blend_16(<16 x i16>* nocapture, <16 x i16>,
 define(`packed_load_and_store', `
-define i32 @__packed_load_active([0 x i32] *, i32 %start_offset, <$1 x i32> * %val_ptr,
+define i32 @__packed_load_active(i32 * %baseptr, i32 %start_offset, <$1 x i32> * %val_ptr,
                                 <$1 x i32> %full_mask) nounwind alwaysinline {
 entry:
  %mask = call i32 @__movmsk(<$1 x i32> %full_mask)
  %baseptr = bitcast [0 x i32] * %0 to i32 *
  %startptr = getelementptr i32 * %baseptr, i32 %start_offset
  %mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask)
  br i1 %mask_known, label %known_mask, label %unknown_mask
@@ -2410,11 +2408,10 @@ done:
  ret i32 %nextoffset
 }
-define i32 @__packed_store_active([0 x i32] *, i32 %start_offset, <$1 x i32> %vals,
+define i32 @__packed_store_active(i32 * %baseptr, i32 %start_offset, <$1 x i32> %vals,
                                  <$1 x i32> %full_mask) nounwind alwaysinline {
 entry:
  %mask = call i32 @__movmsk(<$1 x i32> %full_mask)
  %baseptr = bitcast [0 x i32] * %0 to i32 *
  %startptr = getelementptr i32 * %baseptr, i32 %start_offset
  %mask_known = call i1 @__is_compile_time_constant_mask(<$1 x i32> %full_mask)
  br i1 %mask_known, label %known_mask, label %unknown_mask
@@ -2686,7 +2683,7 @@ pl_done:
 define(`gen_gather', `
 ;; Define the utility function to do the gather operation for a single element
 ;; of the type
-define <$1 x $2> @__gather_elt_$2(i8 * %ptr, <$1 x i32> %offsets, <$1 x $2> %ret,
+define <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %offsets, <$1 x $2> %ret,
                                    i32 %lane) nounwind readonly alwaysinline {
  ; compute address for this one from the base
  %offset32 = extractelement <$1 x i32> %offsets, i32 %lane
@@ -2699,8 +2696,21 @@ define <$1 x $2> @__gather_elt_$2(i8 * %ptr, <$1 x i32> %offsets, <$1 x $2> %ret
  ret <$1 x $2> %updatedret
 }
 define <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %offsets, <$1 x $2> %ret,
                                    i32 %lane) nounwind readonly alwaysinline {
  ; compute address for this one from the base
  %offset32 = extractelement <$1 x i64> %offsets, i32 %lane
  %ptroffset = getelementptr i8 * %ptr, i64 %offset32
  %ptrcast = bitcast i8 * %ptroffset to $2 *
-define <$1 x $2> @__gather_base_offsets_$2(i8 * %ptr, <$1 x i32> %offsets,
+  ; load value and insert into returned value
  %val = load $2 *%ptrcast
  %updatedret = insertelement <$1 x $2> %ret, $2 %val, i32 %lane
  ret <$1 x $2> %updatedret
 }
 define <$1 x $2> @__gather_base_offsets32_$2(i8 * %ptr, <$1 x i32> %offsets,
                                             <$1 x i32> %vecmask) nounwind readonly alwaysinline {
  ; We can be clever and avoid the per-lane stuff for gathers if we are willing
  ; to require that the 0th element of the array being gathered from is always
@@ -2713,14 +2723,68 @@ define <$1 x $2> @__gather_base_offsets_$2(i8 * %ptr, <$1 x i32> %offsets,
                                     <$1 x i32> %vecmask)
  %newOffsets = load <$1 x i32> * %offsetsPtr
-  %ret0 = call <$1 x $2> @__gather_elt_$2(i8 * %ptr, <$1 x i32> %newOffsets,
+  %ret0 = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, <$1 x i32> %newOffsets,
                                            <$1 x $2> undef, i32 0)
  forloop(lane, 1, eval($1-1), 
-          `patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt_$2(i8 * %ptr, 
+          `patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt32_$2(i8 * %ptr, 
                                <$1 x i32> %newOffsets, <$1 x $2> %retPREV, i32 LANE)
                    ', `LANE', lane), `PREV', eval(lane-1))')
  ret <$1 x $2> %ret`'eval($1-1)
 }
 define <$1 x $2> @__gather_base_offsets64_$2(i8 * %ptr, <$1 x i64> %offsets,
                                             <$1 x i32> %vecmask) nounwind readonly alwaysinline {
  ; We can be clever and avoid the per-lane stuff for gathers if we are willing
  ; to require that the 0th element of the array being gathered from is always
  ; legal to read from (and we do indeed require that, given the benefits!) 
  ;
  ; Set the offset to zero for lanes that are off
  %offsetsPtr = alloca <$1 x i64>
  store <$1 x i64> zeroinitializer, <$1 x i64> * %offsetsPtr
  call void @__masked_store_blend_64(<$1 x i64> * %offsetsPtr, <$1 x i64> %offsets, 
                                     <$1 x i32> %vecmask)
  %newOffsets = load <$1 x i64> * %offsetsPtr
  %ret0 = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr, <$1 x i64> %newOffsets,
                                            <$1 x $2> undef, i32 0)
  forloop(lane, 1, eval($1-1), 
          `patsubst(patsubst(`%retLANE = call <$1 x $2> @__gather_elt64_$2(i8 * %ptr, 
                                <$1 x i64> %newOffsets, <$1 x $2> %retPREV, i32 LANE)
                    ', `LANE', lane), `PREV', eval(lane-1))')
  ret <$1 x $2> %ret`'eval($1-1)
 }
 ; fully general 32-bit gather, takes array of pointers encoded as vector of i32s
 define <$1 x $2> @__gather32_$2(<$1 x i32> %ptrs, 
                                <$1 x i32> %vecmask) nounwind readonly alwaysinline {
  %ret_ptr = alloca <$1 x $2>
  per_lane($1, <$1 x i32> %vecmask, `
  %iptr_ID = extractelement <$1 x i32> %ptrs, i32 LANE
  %ptr_ID = inttoptr i32 %iptr_ID to $2 *
  %val_ID = load $2 * %ptr_ID
  %store_ptr_ID = getelementptr <$1 x $2> * %ret_ptr, i32 0, i32 LANE
  store $2 %val_ID, $2 * %store_ptr_ID
 ')
  %ret = load <$1 x $2> * %ret_ptr
  ret <$1 x $2> %ret
 }
 ; fully general 64-bit gather, takes array of pointers encoded as vector of i64s
 define <$1 x $2> @__gather64_$2(<$1 x i64> %ptrs, 
                                <$1 x i32> %vecmask) nounwind readonly alwaysinline {
  %ret_ptr = alloca <$1 x $2>
  per_lane($1, <$1 x i32> %vecmask, `
  %iptr_ID = extractelement <$1 x i64> %ptrs, i32 LANE
  %ptr_ID = inttoptr i64 %iptr_ID to $2 *
  %val_ID = load $2 * %ptr_ID
  %store_ptr_ID = getelementptr <$1 x $2> * %ret_ptr, i32 0, i32 LANE
  store $2 %val_ID, $2 * %store_ptr_ID
 ')
  %ret = load <$1 x $2> * %ret_ptr
  ret <$1 x $2> %ret
 }
 '
 )
@@ -2735,7 +2799,7 @@ define <$1 x $2> @__gather_base_offsets_$2(i8 * %ptr, <$1 x i32> %offsets,
 define(`gen_scatter', `
 ;; Define the function that describes the work to do to scatter a single
 ;; value
-define void @__scatter_elt_$2(i64 %ptr64, <$1 x i32> %offsets, <$1 x $2> %values,
+define void @__scatter_elt32_$2(i64 %ptr64, <$1 x i32> %offsets, <$1 x $2> %values,
                                i32 %lane) nounwind alwaysinline {
  %offset32 = extractelement <$1 x i32> %offsets, i32 %lane
  %offset64 = zext i32 %offset32 to i64
@@ -2746,13 +2810,57 @@ define void @__scatter_elt_$2(i64 %ptr64, <$1 x i32> %offsets, <$1 x $2> %values
  ret void
 }
-define void @__scatter_base_offsets_$2(i8* %base, <$1 x i32> %offsets, <$1 x $2> %values,
+define void @__scatter_elt64_$2(i64 %ptr64, <$1 x i64> %offsets, <$1 x $2> %values,
                                i32 %lane) nounwind alwaysinline {
  %offset64 = extractelement <$1 x i64> %offsets, i32 %lane
  %ptrdelta = add i64 %ptr64, %offset64
  %ptr = inttoptr i64 %ptrdelta to $2 *
  %storeval = extractelement <$1 x $2> %values, i32 %lane
  store $2 %storeval, $2 * %ptr
  ret void
 }
 define void @__scatter_base_offsets32_$2(i8* %base, <$1 x i32> %offsets, <$1 x $2> %values,
                                         <$1 x i32> %mask) nounwind alwaysinline {
  ;; And use the `per_lane' macro to do all of the per-lane work for scatter...
  %ptr64 = ptrtoint i8 * %base to i64
  per_lane($1, <$1 x i32> %mask, `
-      call void @__scatter_elt_$2(i64 %ptr64, <$1 x i32> %offsets, <$1 x $2> %values, i32 LANE)')
+      call void @__scatter_elt32_$2(i64 %ptr64, <$1 x i32> %offsets, <$1 x $2> %values, i32 LANE)')
  ret void
 }
 ; Scatter through a single base pointer plus a vector of 64-bit offsets.
 ; The base is converted to an i64 once; each lane handled by the `per_lane'
 ; macro then calls the 64-bit scatter element helper with that lane number,
 ; passing along the integer base, the offsets and the values to store.
 ; NOTE(review): %mask is only forwarded to `per_lane'; presumably it limits
 ; the calls to the active lanes -- confirm against the macro definition.
 define void @__scatter_base_offsets64_$2(i8* %base, <$1 x i64> %offsets, <$1 x $2> %values,
                                         <$1 x i32> %mask) nounwind alwaysinline {
  ;; And use the `per_lane' macro to do all of the per-lane work for scatter...
  ; integer form of the base address, computed once outside the per-lane code
  %ptr64 = ptrtoint i8 * %base to i64
  per_lane($1, <$1 x i32> %mask, `
      call void @__scatter_elt64_$2(i64 %ptr64, <$1 x i64> %offsets, <$1 x $2> %values, i32 LANE)')
  ret void
 }
 ; fully general 32-bit scatter, takes array of pointers encoded as vector of i32s
 ;
 ; For every lane the `per_lane' macro processes, the i32 address taken from
 ; %ptrs is converted to a pointer to the element type and the same lane of
 ; %values is stored through it.  Nothing is returned.
 ; NOTE(review): %mask is only forwarded to `per_lane'; presumably it limits
 ; the stores to the active lanes -- confirm against the macro definition.
 define void @__scatter32_$2(<$1 x i32> %ptrs, <$1 x $2> %values,
                            <$1 x i32> %mask) nounwind alwaysinline {
  per_lane($1, <$1 x i32> %mask, `
  %iptr_ID = extractelement <$1 x i32> %ptrs, i32 LANE
  %ptr_ID = inttoptr i32 %iptr_ID to $2 *
  %val_ID = extractelement <$1 x $2> %values, i32 LANE
  store $2 %val_ID, $2 * %ptr_ID
 ')
  ret void
 }
 ; fully general 64-bit scatter, takes array of pointers encoded as vector of i64s
 ;
 ; For every lane the `per_lane' macro processes, the i64 address taken from
 ; %ptrs is converted to a pointer to the element type and the same lane of
 ; %values is stored through it.  Nothing is returned.
 ; NOTE(review): %mask is only forwarded to `per_lane'; presumably it limits
 ; the stores to the active lanes -- confirm against the macro definition.
 define void @__scatter64_$2(<$1 x i64> %ptrs, <$1 x $2> %values,
                            <$1 x i32> %mask) nounwind alwaysinline {
  per_lane($1, <$1 x i32> %mask, `
  %iptr_ID = extractelement <$1 x i64> %ptrs, i32 LANE
  %ptr_ID = inttoptr i64 %iptr_ID to $2 *
  %val_ID = extractelement <$1 x $2> %values, i32 LANE
  store $2 %val_ID, $2 * %ptr_ID
 ')
  ret void
 }
 '
 )
--- a/ctx.cpp
+++ b/ctx.cpp
--- a/ctx.h
+++ b/ctx.h
@@ -311,20 +311,13 @@ public:
    /** Given a scalar value, return a vector of the same type (or an
        array, for pointer types). */
-    llvm::Value *SmearScalar(llvm::Value *value, const char *name = NULL);
+    llvm::Value *SmearUniform(llvm::Value *value, const char *name = NULL);
    llvm::Value *BitCastInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
                             const char *name = NULL);
-    llvm::Value *PtrToIntInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
+    llvm::Value *PtrToIntInst(llvm::Value *value, const char *name = NULL);
                              const char *name = NULL);
    llvm::Value *IntToPtrInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
                              const char *name = NULL);
    /** Given a value of some array type, return the corresponding value of
        vector type. */
    llvm::Value *ArrayToVectorInst(llvm::Value *value);
    /** Given a value of some vector type, return the corresponding value of
        array type. */
    llvm::Value *VectorToArrayInst(llvm::Value *value);
    llvm::Instruction *TruncInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type,
                                 const char *name = NULL);
@@ -337,26 +330,37 @@ public:
    llvm::Instruction *ZExtInst(llvm::Value *value, LLVM_TYPE_CONST llvm::Type *type, 
                                const char *name = NULL);
-    /** This GEP method is a generalization of the standard one in LLVM; it
+    /** These GEP methods are generalizations of the standard ones in LLVM;
-        supports both uniform and varying basePtr values (an array of
+        they support both uniform and varying basePtr values as well as
-        pointers) as well as uniform and varying index values (arrays of
+        uniform and varying index values (arrays of indices).  Varying base
-        indices). */
+        pointers are expected to come in as vectors of i32/i64 (depending
        on the target), since LLVM doesn't currently support vectors of
        pointers.  The underlying type of the base pointer must be provided
        via the ptrType parameter */
    llvm::Value *GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index,
                                   const Type *ptrType, const char *name = NULL);
    llvm::Value *GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0,
-                                   llvm::Value *index1, const char *name = NULL);
+                                   llvm::Value *index1, const Type *ptrType,
    /** This is a convenience method to generate a GEP instruction with
        indices with values with known constant values as the ispc program
        is being compiled. */
    llvm::Value *GetElementPtrInst(llvm::Value *basePtr, int v0, int v1,
                                   const char *name = NULL);
    /** This method returns a new pointer that represents offsetting the
        given base pointer to point at the given element number of the
        structure type that the base pointer points to.  (The provided
        pointer must be a pointer to a structure type.)  The ptrType gives
        the type of the pointer, though it may be NULL if the base pointer
        is uniform. */
    llvm::Value *AddElementOffset(llvm::Value *basePtr, int elementNum,
                                  const Type *ptrType, const char *name = NULL);
    /** Load from the memory location(s) given by lvalue, using the given
        mask.  The lvalue may be varying, in which case this corresponds to
        a gather from the multiple memory locations given by the array of
        pointer values given by the lvalue.  If the lvalue is not varying,
        then both the mask pointer and the type pointer may be NULL. */
-    llvm::Value *LoadInst(llvm::Value *lvalue, llvm::Value *mask,
+    llvm::Value *LoadInst(llvm::Value *ptr, llvm::Value *mask,
-                          const Type *type, const char *name = NULL);
+                          const Type *ptrType, const char *name = NULL);
    llvm::Value *LoadInst(llvm::Value *ptr, const char *name = NULL);
    /** Emits an alloca instruction to allocate stack storage for the given
        type.  If a non-zero alignment is specified, the object is also
@@ -370,16 +374,14 @@ public:
    /** Standard store instruction; for this variant, the lvalue must be a
        single pointer, not a varying lvalue. */
-    void StoreInst(llvm::Value *rvalue, llvm::Value *lvalue, 
+    void StoreInst(llvm::Value *value, llvm::Value *ptr);
                   const char *name = NULL);
    /** In this variant of StoreInst(), the lvalue may be varying.  If so,
        this corresponds to a scatter.  Whether the lvalue is uniform or
        varying, the given storeMask is used to mask the stores so that
        they only execute for the active program instances. */
-    void StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
+    void StoreInst(llvm::Value *value, llvm::Value *ptr,
-                   llvm::Value *storeMask, const Type *rvalueType,
+                   llvm::Value *storeMask, const Type *ptrType);
                   const char *name = NULL);
    void BranchInst(llvm::BasicBlock *block);
    void BranchInst(llvm::BasicBlock *trueBlock, llvm::BasicBlock *falseBlock,
@@ -401,20 +403,22 @@ public:
    llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0,
                                  llvm::Value *val1, const char *name = NULL);
-    /** Emits IR to do a function call with the given arguments.  The
+    /** Emits IR to do a function call with the given arguments.  If the
-        function return type must be provided in returnType. */
+        function type is a varying function pointer type, its full type
-    llvm::Value *CallInst(llvm::Value *func, const Type *returnType,
+        must be provided in funcType.  funcType can be NULL if func is a
        uniform function pointer. */
    llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
                          const std::vector<llvm::Value *> &args,
                          const char *name = NULL);
    /** This is a convenience method that issues a call instruction to a
        function that takes just a single argument. */
-    llvm::Value *CallInst(llvm::Value *func, const Type *returnType,
+    llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
                          llvm::Value *arg, const char *name = NULL);
    /** This is a convenience method that issues a call instruction to a
        function that takes two arguments. */
-    llvm::Value *CallInst(llvm::Value *func, const Type *returnType,
+    llvm::Value *CallInst(llvm::Value *func, const FunctionType *funcType,
                          llvm::Value *arg0, llvm::Value *arg1,
                          const char *name = NULL);
@@ -530,15 +534,18 @@ private:
    void jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target);
    llvm::Value *emitGatherCallback(llvm::Value *lvalue, llvm::Value *retPtr);
    llvm::Value *applyVaryingGEP(llvm::Value *basePtr, llvm::Value *index, 
                                 const Type *ptrType);
    void restoreMaskGivenReturns(llvm::Value *oldMask);
-    void scatter(llvm::Value *rvalue, llvm::Value *lvalue, 
+    void scatter(llvm::Value *value, llvm::Value *ptr, const Type *ptrType, 
-                 llvm::Value *maskPtr, const Type *rvalueType);
+                 llvm::Value *mask);
-    llvm::Value *gather(llvm::Value *lvalue, llvm::Value *mask,
+    void maskedStore(llvm::Value *value, llvm::Value *ptr, const Type *ptrType,
-                        const Type *type, const char *name);
+                     llvm::Value *mask);
-    void maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,
+    llvm::Value *gather(llvm::Value *ptr, const Type *ptrType, llvm::Value *mask,
-                     const Type *rvalueType, llvm::Value *maskPtr);
+                        const char *name);
-    llvm::Value *addVaryingOffsetsIfNeeded(llvm::Value *value, const Type *type);
+    llvm::Value *addVaryingOffsetsIfNeeded(llvm::Value *ptr, const Type *ptrType);
 };
 #endif // ISPC_CTX_H
--- a/decl.cpp
+++ b/decl.cpp
@@ -46,12 +46,14 @@
 #include <stdio.h>
 #include <llvm/Module.h>
 /** Given a Type and a set of type qualifiers, apply the type qualifiers to
    the type, returning the type that is the result. 
 */
 static const Type *
 lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
    if (type == NULL)
        return NULL;
    // Account for 'unsigned' and 'const' qualifiers in the type
    if ((typeQualifiers & TYPEQUAL_UNSIGNED) != 0) {
        const Type *unsignedType = type->GetAsUnsignedType();
        if (unsignedType != NULL)
@@ -60,11 +62,10 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
            Error(pos, "\"unsigned\" qualifier is illegal with \"%s\" type.",
              type->GetString().c_str());
    }
    if ((typeQualifiers & TYPEQUAL_CONST) != 0)
        type = type->GetAsConstType();
    // if uniform/varying is specified explicitly, then go with that
    if (dynamic_cast<const FunctionType *>(type) == NULL) {
    if ((typeQualifiers & TYPEQUAL_UNIFORM) != 0)
        type = type->GetAsUniformType();
    else if ((typeQualifiers & TYPEQUAL_VARYING) != 0)
@@ -77,7 +78,6 @@ lApplyTypeQualifiers(int typeQualifiers, const Type *type, SourcePos pos) {
        else
            type = type->GetAsVaryingType();
    }
    }
    return type;
 }
@@ -127,7 +127,6 @@ DeclSpecs::Print() const {
    if (typeQualifiers & TYPEQUAL_UNIFORM)   printf("uniform ");
    if (typeQualifiers & TYPEQUAL_VARYING)   printf("varying ");
    if (typeQualifiers & TYPEQUAL_TASK)      printf("task ");
    if (typeQualifiers & TYPEQUAL_REFERENCE) printf("reference ");
    if (typeQualifiers & TYPEQUAL_UNSIGNED)  printf("unsigned ");
    printf("%s", baseType->GetString().c_str());
@@ -161,8 +160,10 @@ Declarator::InitFromDeclSpecs(DeclSpecs *ds) {
 Symbol *
-Declarator::GetSymbol() {
+Declarator::GetSymbol() const {
-    Declarator *d = this;
+    // The symbol lives at the last child in the chain, so walk down there
    // and return the one there.
    const Declarator *d = this;
    while (d->child != NULL)
        d = d->child;
    return d->sym;
@@ -171,7 +172,12 @@ Declarator::GetSymbol() {
 void
 Declarator::Print() const {
    Symbol *sym = GetSymbol();
    if (sym != NULL)
        printf("%s", sym->name.c_str());
    else
        printf("(null symbol)");
    if (initExpr != NULL) {
        printf(" = (");
        initExpr->Print();
@@ -181,28 +187,39 @@ Declarator::Print() const {
 }
-void
+Symbol *
-Declarator::GetFunctionInfo(DeclSpecs *ds, Symbol **funSym, 
+Declarator::GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *funArgs) {
                            std::vector<Symbol *> *funArgs) {
    // Get the symbol for the function from the symbol table.  (It should
    // already have been added to the symbol table by AddGlobal() by the
    // time we get here.)
    const FunctionType *type = 
        dynamic_cast<const FunctionType *>(GetType(ds));
    if (type == NULL)
-        return;
+        return NULL;
    Symbol *declSym = GetSymbol();
    assert(declSym != NULL);
    *funSym = m->symbolTable->LookupFunction(declSym->name.c_str(), type);
    if (*funSym != NULL)
        // May be NULL due to error earlier in compilation
        (*funSym)->pos = pos;
-    for (unsigned int i = 0; i < functionArgs.size(); ++i) {
+    // Get the symbol for the function from the symbol table.  (It should
-        Declaration *pdecl = functionArgs[i];
+    // already have been added to the symbol table by AddGlobal() by the
    // time we get here.)
    Symbol *funSym = m->symbolTable->LookupFunction(declSym->name.c_str(), type);
    if (funSym != NULL)
        // May be NULL due to error earlier in compilation
        funSym->pos = pos;
    // Walk down to the declarator for the function.  (We have to get past
    // the stuff that specifies the function's return type before we get to
    // the function's declarator.)
    Declarator *d = this;
    while (d != NULL && d->kind != DK_FUNCTION)
        d = d->child;
    assert(d != NULL);
    for (unsigned int i = 0; i < d->functionParams.size(); ++i) {
        Declaration *pdecl = d->functionParams[i];
        assert(pdecl->declarators.size() == 1);
        funArgs->push_back(pdecl->declarators[0]->GetSymbol());
    }
    return funSym;
 }
@@ -211,7 +228,6 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
    bool hasUniformQual = ((typeQualifiers & TYPEQUAL_UNIFORM) != 0);
    bool hasVaryingQual = ((typeQualifiers & TYPEQUAL_VARYING) != 0);
    bool isTask =         ((typeQualifiers & TYPEQUAL_TASK) != 0);
    bool isReference =    ((typeQualifiers & TYPEQUAL_REFERENCE) != 0);
    bool isConst =        ((typeQualifiers & TYPEQUAL_CONST) != 0);
    if (hasUniformQual && hasVaryingQual) {
@@ -224,13 +240,36 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
    const Type *type = base;
    switch (kind) {
    case DK_BASE:
        // All of the type qualifiers should be in the DeclSpecs for the
        // base declarator
        assert(typeQualifiers == 0);
        assert(child == NULL);
        return type;
    case DK_POINTER:
        type = new PointerType(type, hasUniformQual, isConst);
-        if (child)
+        if (child != NULL)
            return child->GetType(type, ds);
        else
            return type;
        break;
    case DK_REFERENCE:
        if (hasUniformQual)
            Error(pos, "\"uniform\" qualifier is illegal to apply to references.");
        if (hasVaryingQual)
            Error(pos, "\"varying\" qualifier is illegal to apply to references.");
        if (isConst)
            Error(pos, "\"const\" qualifier is to illegal apply to references.");
        // The parser should disallow this already, but double check.
        if (dynamic_cast<const ReferenceType *>(type) != NULL) {
            Error(pos, "References to references are illegal.");
            return NULL;
        }
        type = new ReferenceType(type);
        if (child != NULL)
            return child->GetType(type, ds);
        else
            return type;
@@ -250,10 +289,12 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
        std::vector<ConstExpr *> argDefaults;
        std::vector<SourcePos> argPos;
-        // Loop over the function arguments and get names and types for
+        // Loop over the function arguments and store the names, types,
-        // each one in the args and argNames arrays
+        // default values (if any), and source file positions of each one in
-        for (unsigned int i = 0; i < functionArgs.size(); ++i) {
+        // the corresponding vector.
-            Declaration *d = functionArgs[i];
+        for (unsigned int i = 0; i < functionParams.size(); ++i) {
            Declaration *d = functionParams[i];
            char buf[32];
            Symbol *sym;
            if (d->declarators.size() == 0) {
@@ -266,6 +307,8 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
            else {
                sym = d->declarators[0]->GetSymbol();
                if (sym == NULL) {
                    // Handle more complex anonymous declarations like
                    // float (float **).
                    sprintf(buf, "__anon_parameter_%d", i);
                    sym = new Symbol(buf, pos);
                    sym->type = d->declarators[0]->GetType(d->declSpecs);
@@ -274,9 +317,15 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
            const ArrayType *at = dynamic_cast<const ArrayType *>(sym->type);
            if (at != NULL) {
-                // Arrays are passed by reference, so convert array
+                // As in C, arrays are passed to functions as pointers to
-                // parameters to be references here.
+                // their element type.  We'll just immediately make this
-                sym->type = new ReferenceType(sym->type, sym->type->IsConstType());
+                // change now.  (One shortcoming of losing the fact that
                // it was originally an array is that any warnings or
                // errors later issued that print the function type will
                // report this differently than it was originally declared
                // in the function, but it's not clear that this is a
                // significant problem.)
                sym->type = PointerType::GetUniform(at->GetElementType());
                // Make sure there are no unsized arrays (other than the
                // first dimension) in function parameter lists.
@@ -296,6 +345,8 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
            ConstExpr *init = NULL;
            if (d->declarators.size()) {
                // Try to find an initializer expression; if there is one,
                // it lives down at the base declarator.
                Declarator *decl = d->declarators[0];
                while (decl->child != NULL) {
                    assert(decl->initExpr == NULL);
@@ -314,11 +365,6 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
            argDefaults.push_back(init);
        }
        if (isReference) {
            Error(pos, "Function return types can't be reference types.");
            return NULL;
        }
        const Type *returnType = type;
        if (returnType == NULL) {
            Error(pos, "No return type provided in function declaration.");
@@ -328,6 +374,23 @@ Declarator::GetType(const Type *base, DeclSpecs *ds) const {
        bool isExported = ds && (ds->storageClass == SC_EXPORT);
        bool isExternC =  ds && (ds->storageClass == SC_EXTERN_C);
        bool isTask =     ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
        if (isExported && isTask) {
            Error(pos, "Function can't have both \"task\" and \"export\" "
                  "qualifiers");
            return NULL;
        }
        if (isExternC && isTask) {
            Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" "
                  "qualifiers");
            return NULL;
        }
        if (isExternC && isExported) {
            Error(pos, "Function can't have both \"extern \"C\"\" and \"export\" "
                  "qualifiers");
            return NULL;
        }
        Type *functionType = 
            new FunctionType(returnType, args, pos, argNames, argDefaults,
                             argPos, isTask, isExported, isExternC);
@@ -367,12 +430,6 @@ const Type *
 Declarator::GetType(DeclSpecs *ds) const {
    const Type *baseType = ds->GetBaseType(pos);
    const Type *type = GetType(baseType, ds);
    if ((ds->typeQualifiers & TYPEQUAL_REFERENCE) != 0) {
        bool hasConstQual = ((ds->typeQualifiers & TYPEQUAL_CONST) != 0);
        type = new ReferenceType(type, hasConstQual);
    }
    return type;
 }
@@ -392,7 +449,7 @@ Declaration::Declaration(DeclSpecs *ds, std::vector<Declarator *> *dlist) {
 Declaration::Declaration(DeclSpecs *ds, Declarator *d) {
    declSpecs = ds;
-    if (d) {
+    if (d != NULL) {
        d->InitFromDeclSpecs(ds);
        declarators.push_back(d);
    }
@@ -409,6 +466,8 @@ Declaration::GetVariableDeclarations() const {
            continue;
        Declarator *decl = declarators[i];
        if (decl == NULL || decl->kind == DK_FUNCTION) 
            // Ignore earlier errors or external function declarations
            // inside other functions.
            continue;
        Symbol *sym = decl->GetSymbol();
@@ -452,14 +511,18 @@ GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
            Declarator *d = (*sd[i]->declarators)[j];
            d->InitFromDeclSpecs(&ds);
            // if it's an unsized array, make it a reference to an unsized
            // array, so the caller can pass a pointer...
            Symbol *sym = d->GetSymbol();
            const ArrayType *at = dynamic_cast<const ArrayType *>(sym->type);
            if (at && at->GetElementCount() == 0)
                sym->type = new ReferenceType(sym->type, type->IsConstType());
            const ArrayType *arrayType = 
                dynamic_cast<const ArrayType *>(sym->type);
            if (arrayType != NULL && arrayType->GetElementCount() == 0) {
                Error(d->pos, "Unsized arrays aren't allowed in struct "
                      "definitions.");
                elementTypes->push_back(NULL);
            }
            else
                elementTypes->push_back(sym->type);
            elementNames->push_back(sym->name);
            elementPositions->push_back(sym->pos);
        }
--- a/decl.h
+++ b/decl.h
@@ -79,9 +79,8 @@ enum StorageClass {
 #define TYPEQUAL_UNIFORM    (1<<1)
 #define TYPEQUAL_VARYING    (1<<2)
 #define TYPEQUAL_TASK       (1<<3)
-#define TYPEQUAL_REFERENCE  (1<<4)
+#define TYPEQUAL_UNSIGNED   (1<<4)
-#define TYPEQUAL_UNSIGNED   (1<<5)
+#define TYPEQUAL_INLINE     (1<<5)
 #define TYPEQUAL_INLINE     (1<<6)
 /** @brief Representation of the declaration specifiers in a declaration.
@@ -100,7 +99,7 @@ public:
    int typeQualifiers;
    /** The basic type provided in the declaration; this should be an
-        AtomicType, a StructType, or a VectorType; other types (like
+        AtomicType, EnumType, StructType, or VectorType; other types (like
        ArrayTypes) will end up being created if a particular declaration
        has an array size, etc.
    */
@@ -123,6 +122,7 @@ public:
 enum DeclaratorKind {
    DK_BASE,
    DK_POINTER,
    DK_REFERENCE,
    DK_ARRAY,
    DK_FUNCTION
 };
@@ -142,33 +142,51 @@ public:
    void InitFromDeclSpecs(DeclSpecs *ds);
    /** Get the actual type of the combination of Declarator and the given
-        DeclSpecs */
+        DeclSpecs.  If an explicit base type is provided, the declarator is
        applied to that type; otherwise the base type from the DeclSpecs is
        used. */
    const Type *GetType(DeclSpecs *ds) const;
    const Type *GetType(const Type *base, DeclSpecs *ds) const;
-    void GetFunctionInfo(DeclSpecs *ds, Symbol **sym, 
+    /** Returns the symbol corresponding to the function declared by this
-                         std::vector<Symbol *> *args);
+        declarator and symbols for its arguments in *args. */
    Symbol *GetFunctionInfo(DeclSpecs *ds, std::vector<Symbol *> *args);
-    Symbol *GetSymbol();
+    /** Returns the symbol associated with the declarator. */
    Symbol *GetSymbol() const;
    void Print() const;
    /** Position of the declarator in the source program. */
    const SourcePos pos;
    /** The kind of this declarator; complex declarations are assembled as
        a hierarchy of Declarators.  (For example, a pointer to an int
        would have a root declarator with kind DK_POINTER and with the
        Declarator::child member pointing to a DK_BASE declarator for the
        int). */
    const DeclaratorKind kind;
    /** Child pointer if needed; this can only be non-NULL if the
        declarator's kind isn't DK_BASE. */
    Declarator *child;
    /** Type qualifiers provided with the declarator. */
    int typeQualifiers;
    /** For array declarators, this gives the declared size of the array.
        Unsized arrays have arraySize == 0. */ 
    int arraySize;
    /** Symbol associated with the declarator. */
    Symbol *sym;
    /** Initialization expression for the variable.  May be NULL. */
    Expr *initExpr;
-    std::vector<Declaration *> functionArgs;
+    /** For function declarations, this holds the Declaration *s for the
        function's parameters. */
    std::vector<Declaration *> functionParams;
 };
@@ -182,6 +200,11 @@ public:
    void Print() const;
    /** This method walks through all of the Declarators in a declaration
        and returns a fully-initialized Symbol and (possibly) an
        initialization expression for each one.  (This allows the rest of
        the system to not have to worry about the mess of the general
        Declarator representation.) */
    std::vector<VariableDeclaration> GetVariableDeclarations() const;
    DeclSpecs *declSpecs;
--- a/examples/aobench/ao.ispc
+++ b/examples/aobench/ao.ispc
@@ -75,7 +75,7 @@ static inline vec vcross(vec v0, vec v1) {
    return ret;
 }
-static inline void vnormalize(reference vec v) {
+static inline void vnormalize(vec &v) {
    float len2 = dot(v, v);
    float invlen = rsqrt(len2);
    v *= invlen;
@@ -83,8 +83,7 @@ static inline void vnormalize(reference vec v) {
 static inline void
-ray_plane_intersect(reference Isect isect, reference Ray ray, 
+ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
                    reference Plane plane) {
    float d = -dot(plane.p, plane.n);
    float v = dot(ray.dir, plane.n);
@@ -104,8 +103,7 @@ ray_plane_intersect(reference Isect isect, reference Ray ray,
 static inline void
-ray_sphere_intersect(reference Isect isect, reference Ray ray, 
+ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
                     reference Sphere sphere) {
    vec rs = ray.org - sphere.center;
    float B = dot(rs, ray.dir);
@@ -127,7 +125,7 @@ ray_sphere_intersect(reference Isect isect, reference Ray ray,
 static inline void
-orthoBasis(reference vec basis[3], vec n) {
+orthoBasis(vec basis[3], vec n) {
    basis[2] = n;
    basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
@@ -150,8 +148,8 @@ orthoBasis(reference vec basis[3], vec n) {
 static inline float
-ambient_occlusion(reference Isect isect, reference Plane plane, 
+ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3], 
-                  reference Sphere spheres[3], reference RNGState rngstate) {
+                  RNGState &rngstate) {
    float eps = 0.0001f;
    vec p, n;
    vec basis[3];
@@ -168,8 +166,8 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
            Ray ray;
            Isect occIsect;
-            float theta = sqrt(frandom(rngstate));
+            float theta = sqrt(frandom(&rngstate));
-            float phi   = 2.0f * M_PI * frandom(rngstate);
+            float phi   = 2.0f * M_PI * frandom(&rngstate);
            float x = cos(phi) * theta;
            float y = sin(phi) * theta;
            float z = sqrt(1.0 - theta * theta);
@@ -205,7 +203,7 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
 */
 static void ao_scanlines(uniform int y0, uniform int y1, uniform int w, 
                         uniform int h,  uniform int nsubsamples, 
-                         reference uniform float image[]) {
+                         uniform float image[]) {
    static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
    static Sphere spheres[3] = {
        { { -2.0f, 0.0f, -3.5f }, 0.5f },
@@ -213,7 +211,7 @@ static void ao_scanlines(uniform int y0, uniform int y1, uniform int w,
        { { 1.0f, 0.0f, -2.2f }, 0.5f } };
    RNGState rngstate;
-    seed_rng(rngstate, y0);
+    seed_rng(&rngstate, y0);
    // Compute the mapping between the 'programCount'-wide program
    // instances running in parallel and samples in the image.  
--- a/examples/aobench_instrumented/ao.ispc
+++ b/examples/aobench_instrumented/ao.ispc
@@ -75,7 +75,7 @@ static inline vec vcross(vec v0, vec v1) {
    return ret;
 }
-static inline void vnormalize(reference vec v) {
+static inline void vnormalize(vec &v) {
    float len2 = dot(v, v);
    float invlen = rsqrt(len2);
    v *= invlen;
@@ -83,8 +83,7 @@ static inline void vnormalize(reference vec v) {
 static inline void
-ray_plane_intersect(reference Isect isect, reference Ray ray, 
+ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
                    reference Plane plane) {
    float d = -dot(plane.p, plane.n);
    float v = dot(ray.dir, plane.n);
@@ -104,8 +103,7 @@ ray_plane_intersect(reference Isect isect, reference Ray ray,
 static inline void
-ray_sphere_intersect(reference Isect isect, reference Ray ray, 
+ray_sphere_intersect(Isect &isect, Ray &ray, Sphere &sphere) {
                     reference Sphere sphere) {
    vec rs = ray.org - sphere.center;
    float B = dot(rs, ray.dir);
@@ -127,7 +125,7 @@ ray_sphere_intersect(reference Isect isect, reference Ray ray,
 static inline void
-orthoBasis(reference vec basis[3], vec n) {
+orthoBasis(vec basis[3], vec n) {
    basis[2] = n;
    basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0;
@@ -150,8 +148,8 @@ orthoBasis(reference vec basis[3], vec n) {
 static inline float
-ambient_occlusion(reference Isect isect, reference Plane plane, 
+ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3], 
-                  reference Sphere spheres[3], reference RNGState rngstate) {
+                  RNGState &rngstate) {
    float eps = 0.0001f;
    vec p, n;
    vec basis[3];
@@ -168,8 +166,8 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
            Ray ray;
            Isect occIsect;
-            float theta = sqrt(frandom(rngstate));
+            float theta = sqrt(frandom(&rngstate));
-            float phi   = 2.0f * M_PI * frandom(rngstate);
+            float phi   = 2.0f * M_PI * frandom(&rngstate);
            float x = cos(phi) * theta;
            float y = sin(phi) * theta;
            float z = sqrt(1.0 - theta * theta);
@@ -203,8 +201,9 @@ ambient_occlusion(reference Isect isect, reference Plane plane,
 /* Compute the image for the scanlines from [y0,y1), for an overall image
   of width w and height h.
 */
-void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h, 
+static void ao_scanlines(uniform int y0, uniform int y1, uniform int w, 
-                  uniform int nsubsamples, reference uniform float image[]) {
+                         uniform int h,  uniform int nsubsamples, 
                         uniform float image[]) {
    static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } };
    static Sphere spheres[3] = {
        { { -2.0f, 0.0f, -3.5f }, 0.5f },
@@ -212,7 +211,7 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
        { { 1.0f, 0.0f, -2.2f }, 0.5f } };
    RNGState rngstate;
-    seed_rng(rngstate, y0);
+    seed_rng(&rngstate, y0);
    // Compute the mapping between the 'programCount'-wide program
    // instances running in parallel and samples in the image.  
@@ -231,6 +230,9 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
    // direction we do per iteration and ny the number in y.
    uniform int nx = 1, ny = 1;
    // FIXME: We actually need ny to be 1 regardless of the decomposition,
    // since the task decomposition is one scanline high.
    if (programCount == 8) {
        // Do two pixels at once in the x direction
        nx = 2;
@@ -239,19 +241,21 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
            ++du;
    }
    else if (programCount == 16) {
-        // Two at once in both x and y
+        nx = 4;
-        nx = ny = 2;
+        ny = 1;
-        if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12)
+        if (programIndex >= 4 && programIndex < 8)
            ++du;
-        if (programIndex >= 8)  
+        if (programIndex >= 8 && programIndex < 12)
-            ++dv;
+            du += 2;
        if (programIndex >= 12)
            du += 3;
    }
    // Now loop over all of the pixels, stepping in x and y as calculated
    // above.  (Assumes that ny divides y and nx divides x...)
    for (uniform int y = y0; y < y1; y += ny) {
        for (uniform int x = 0; x < w; x += nx)  {
-            // Figur out x,y pixel in NDC
+            // Figure out x,y pixel in NDC
            float px =  (x + du - (w / 2.0f)) / (w / 2.0f);
            float py = -(y + dv - (h / 2.0f)) / (h / 2.0f);
            float ret = 0.f;
@@ -293,7 +297,7 @@ void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h,
            // offset to the first pixel in the image
            uniform int offset = 3 * (y * w + x);
-            for (uniform int p = 0; p < programCount; p += 4, ++offset) {
+            for (uniform int p = 0; p < programCount; p += 4, offset += 3) {
                // Get the four sample values for this pixel
                uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] +
                    retArray[p+3];
@@ -315,3 +319,15 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples,
                    uniform float image[]) {
    ao_scanlines(0, h, w, h, nsubsamples, image);
 }
 /* Task entry point for the ambient-occlusion renderer.  Each task
    forwards to ao_scanlines() with the scanline range
    [taskIndex, taskIndex+1), i.e. one image row per task; the
    remaining arguments (image size, subsample count, output buffer)
    are passed through unchanged. */
 static void task ao_task(uniform int width, uniform int height, 
                         uniform int nsubsamples, uniform float image[]) {
    ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image);
 }
 /* Exported task-parallel entry point: launches h instances of ao_task,
    passing the image dimensions, subsample count and output buffer to
    each.  NOTE(review): relies on ao_task handling exactly one scanline
    per task so that h tasks cover the whole image -- confirm if the
    task decomposition changes. */
 export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, 
                           uniform float image[]) {
    launch[h] < ao_task(w, h, nsubsamples, image) >;
 }
--- a/examples/deferred/kernels.ispc
+++ b/examples/deferred/kernels.ispc
@@ -35,22 +35,22 @@
 struct InputDataArrays
 {
-    uniform float zBuffer[];
+    uniform float * uniform zBuffer;
-    uniform unsigned int16 normalEncoded_x[]; // half float
+    uniform unsigned int16 * uniform normalEncoded_x; // half float
-    uniform unsigned int16 normalEncoded_y[]; // half float
+    uniform unsigned int16 * uniform normalEncoded_y; // half float
-    uniform unsigned int16 specularAmount[]; // half float
+    uniform unsigned int16 * uniform specularAmount; // half float
-    uniform unsigned int16 specularPower[]; // half float
+    uniform unsigned int16 * uniform specularPower; // half float
-    uniform unsigned int8 albedo_x[]; // unorm8
+    uniform unsigned int8 * uniform albedo_x; // unorm8
-    uniform unsigned int8 albedo_y[]; // unorm8
+    uniform unsigned int8 * uniform albedo_y; // unorm8
-    uniform unsigned int8 albedo_z[]; // unorm8
+    uniform unsigned int8 * uniform albedo_z; // unorm8
-    uniform float lightPositionView_x[];
+    uniform float * uniform lightPositionView_x;
-    uniform float lightPositionView_y[];
+    uniform float * uniform lightPositionView_y;
-    uniform float lightPositionView_z[];
+    uniform float * uniform lightPositionView_z;
-    uniform float lightAttenuationBegin[];
+    uniform float * uniform lightAttenuationBegin;
-    uniform float lightColor_x[];
+    uniform float * uniform lightColor_x;
-    uniform float lightColor_y[];
+    uniform float * uniform lightColor_y;
-    uniform float lightColor_z[];
+    uniform float * uniform lightColor_z;
-    uniform float lightAttenuationEnd[];
+    uniform float * uniform lightAttenuationEnd;
 };
 struct InputHeader
@@ -77,8 +77,7 @@ dot3(float x, float y, float z, float a, float b, float c) {
 static inline void
-normalize3(float x, float y, float z, reference float ox, 
+normalize3(float x, float y, float z, float &ox, float &oy, float &oz) {
           reference float oy, reference float oz) {
    float n = rsqrt(x*x + y*y + z*z);
    ox = x * n;
    oy = y * n;
@@ -110,8 +109,8 @@ ComputeZBounds(
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Output
-    reference uniform float minZ,
+    uniform float &minZ,
-    reference uniform float maxZ
+    uniform float &maxZ
    )
 {
    // Find Z bounds
@@ -156,7 +155,7 @@ IntersectLightsWithTileMinMax(
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Output
-    reference uniform int32 tileLightIndices[]
+    uniform int32 tileLightIndices[]
    )
 {
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
@@ -268,7 +267,7 @@ IntersectLightsWithTile(
    uniform float light_positionView_z_array[],
    uniform float light_attenuationEnd_array[],
    // Output
-    reference uniform int32 tileLightIndices[]
+    uniform int32 tileLightIndices[]
    )
 {
    uniform float minZ, maxZ;
@@ -293,19 +292,19 @@ ShadeTile(
    uniform int32 tileStartX, uniform int32 tileEndX,
    uniform int32 tileStartY, uniform int32 tileEndY,
    uniform int32 gBufferWidth, uniform int32 gBufferHeight,
-    reference uniform InputDataArrays inputData,
+    uniform InputDataArrays &inputData,
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    uniform float cameraProj_33, uniform float cameraProj_43,
    // Light list
-    reference uniform int32 tileLightIndices[],
+    uniform int32 tileLightIndices[],
    uniform int32 tileNumLights,
    // UI
    uniform bool visualizeLightCount,
    // Output
-    reference uniform unsigned int8 framebuffer_r[],
+    uniform unsigned int8 framebuffer_r[],
-    reference uniform unsigned int8 framebuffer_g[],
+    uniform unsigned int8 framebuffer_g[],
-    reference uniform unsigned int8 framebuffer_b[]
+    uniform unsigned int8 framebuffer_b[]
    )
 {
    if (tileNumLights == 0 || visualizeLightCount) {
@@ -478,13 +477,13 @@ ShadeTile(
 task void
 RenderTile(uniform int num_groups_x, uniform int num_groups_y,
-           reference uniform InputHeader inputHeader,
+           uniform InputHeader &inputHeader,
-           reference uniform InputDataArrays inputData,
+           uniform InputDataArrays &inputData,
           uniform int visualizeLightCount,
           // Output
-           reference uniform unsigned int8 framebuffer_r[],
+           uniform unsigned int8 framebuffer_r[],
-           reference uniform unsigned int8 framebuffer_g[],
+           uniform unsigned int8 framebuffer_g[],
-           reference uniform unsigned int8 framebuffer_b[]) {
+           uniform unsigned int8 framebuffer_b[]) {
    uniform int32 group_y = taskIndex / num_groups_x;
    uniform int32 group_x = taskIndex % num_groups_x;
    uniform int32 tile_start_x = group_x * MIN_TILE_WIDTH;
@@ -526,13 +525,13 @@ RenderTile(uniform int num_groups_x, uniform int num_groups_y,
 export void
-RenderStatic(reference uniform InputHeader inputHeader,
+RenderStatic(uniform InputHeader &inputHeader,
-             reference uniform InputDataArrays inputData,
+             uniform InputDataArrays &inputData,
             uniform int visualizeLightCount,
             // Output
-             reference uniform unsigned int8 framebuffer_r[],
+             uniform unsigned int8 framebuffer_r[],
-             reference uniform unsigned int8 framebuffer_g[],
+             uniform unsigned int8 framebuffer_g[],
-             reference uniform unsigned int8 framebuffer_b[]) {
+             uniform unsigned int8 framebuffer_b[]) {
    uniform int num_groups_x = (inputHeader.framebufferWidth + 
                                MIN_TILE_WIDTH - 1) / MIN_TILE_WIDTH;
    uniform int num_groups_y = (inputHeader.framebufferHeight + 
@@ -564,8 +563,8 @@ ComputeZBoundsRow(
    uniform float cameraProj_33, uniform float cameraProj_43,
    uniform float cameraNear, uniform float cameraFar,
    // Output
-    reference uniform float minZArray[],
+    uniform float minZArray[],
-    reference uniform float maxZArray[]
+    uniform float maxZArray[]
    )
 {
    for (uniform int32 tileX = 0; tileX < numTilesX; ++tileX) {
@@ -596,7 +595,7 @@ SplitTileMinMax(
    // Camera data
    uniform float cameraProj_11, uniform float cameraProj_22,
    // Light Data
-    reference uniform int32 lightIndices[],
+    uniform int32 lightIndices[],
    uniform int32 numLights,
    uniform float light_positionView_x_array[],
    uniform float light_positionView_y_array[],
@@ -605,9 +604,9 @@ SplitTileMinMax(
    // Outputs
    // TODO: ISPC doesn't currently like multidimensional arrays so we'll do the
    // indexing math ourselves
-    reference uniform int32 subtileIndices[],
+    uniform int32 subtileIndices[],
    uniform int32 subtileIndicesPitch,
-    reference uniform int32 subtileNumLights[]
+    uniform int32 subtileNumLights[]
    )
 {
    uniform float gBufferScale_x = 0.5f * (float)gBufferWidth;
--- a/examples/mandelbrot/mandelbrot.ispc
+++ b/examples/mandelbrot/mandelbrot.ispc
@@ -51,7 +51,7 @@ export void mandelbrot_ispc(uniform float x0, uniform float y0,
                            uniform float x1, uniform float y1,
                            uniform int width, uniform int height, 
                            uniform int maxIterations,
-                            reference uniform int output[])
+                            uniform int output[])
 {
    float dx = (x1 - x0) / width;
    float dy = (y1 - y0) / height;
--- a/examples/mandelbrot_tasks/mandelbrot.ispc
+++ b/examples/mandelbrot_tasks/mandelbrot.ispc
@@ -57,7 +57,7 @@ mandelbrot_scanlines(uniform int ybase, uniform int span,
                     uniform float x0, uniform float dx, 
                     uniform float y0, uniform float dy,
                     uniform int width, uniform int maxIterations,
-                     reference uniform int output[]) {
+                     uniform int output[]) {
    uniform int ystart = ybase + taskIndex * span;
    uniform int yend = ystart + span;
@@ -77,7 +77,7 @@ task void
 mandelbrot_chunk(uniform float x0, uniform float dx,
                 uniform float y0, uniform float dy,
                 uniform int width, uniform int height,
-                 uniform int maxIterations, reference uniform int output[]) {
+                 uniform int maxIterations, uniform int output[]) {
    uniform int ystart = taskIndex * (height/taskCount);
    uniform int yend = (taskIndex+1) * (height/taskCount);
    uniform int span = 1;
@@ -91,7 +91,7 @@ export void
 mandelbrot_ispc(uniform float x0, uniform float y0, 
                uniform float x1, uniform float y1,
                uniform int width, uniform int height, 
-                uniform int maxIterations, reference uniform int output[]) {
+                uniform int maxIterations, uniform int output[]) {
    uniform float dx = (x1 - x0) / width;
    uniform float dy = (y1 - y0) / height;
--- a/examples/rt/rt.ispc
+++ b/examples/rt/rt.ispc
@@ -73,7 +73,7 @@ static inline float Dot(const float3 a, const float3 b) {
 static void generateRay(uniform const float raster2camera[4][4], 
                        uniform const float camera2world[4][4],
-                        float x, float y, reference Ray ray) {
+                        float x, float y, Ray &ray) {
    ray.mint = 0.f;
    ray.maxt = 1e30f;
@@ -105,7 +105,7 @@ static void generateRay(uniform const float raster2camera[4][4],
 static inline bool BBoxIntersect(const uniform float bounds[2][3], 
-                                 const reference Ray ray) {
+                                 const Ray &ray) {
    uniform float3 bounds0 = { bounds[0][0], bounds[0][1], bounds[0][2] };
    uniform float3 bounds1 = { bounds[1][0], bounds[1][1], bounds[1][2] };
    float t0 = ray.mint, t1 = ray.maxt;
@@ -143,7 +143,7 @@ static inline bool BBoxIntersect(const uniform float bounds[2][3],
-static inline bool TriIntersect(const reference Triangle tri, reference Ray ray) {
+static inline bool TriIntersect(const Triangle &tri, Ray &ray) {
    uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
    uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
    uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
@@ -184,7 +184,7 @@ static inline bool TriIntersect(const reference Triangle tri, reference Ray ray)
 bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[], 
-                  reference Ray r) {
+                  Ray &r) {
    Ray ray = r;
    bool hit = false;
    // Follow ray through BVH nodes to find primitive intersections
--- a/examples/volume_rendering/Makefile
+++ b/examples/volume_rendering/Makefile
@@ -8,7 +8,7 @@ TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o)))
 CXX=g++
 CXXFLAGS=-Iobjs/ -O3 -Wall -m64
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse2,sse4-x2 --arch=x86-64
+ISPCFLAGS=-O2 --target=sse2,sse4-x2 --arch=x86-64 --opt=32-bit-addressing
 OBJS=objs/volume.o objs/volume_serial.o $(TASK_OBJ) objs/volume_ispc.o \
 	objs/volume_ispc_sse2.o objs/volume_ispc_sse4.o
--- a/examples/volume_rendering/volume.ispc
+++ b/examples/volume_rendering/volume.ispc
@@ -41,7 +41,7 @@ struct Ray {
 static void
 generateRay(const uniform float raster2camera[4][4], 
            const uniform float camera2world[4][4],
-            float x, float y, reference Ray ray) {
+            float x, float y, Ray &ray) {
    // transform raster coordinate (x, y, 0) to camera space
    float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3];
    float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3];
@@ -70,7 +70,7 @@ Inside(float3 p, float3 pMin, float3 pMax) {
 static bool
-IntersectP(Ray ray, float3 pMin, float3 pMax, reference float hit0, reference float hit1) {
+IntersectP(Ray ray, float3 pMin, float3 pMax, float &hit0, float &hit1) {
    float t0 = -1e30, t1 = 1e30;
    float3 tNear = (pMin - ray.origin) / ray.dir;
@@ -141,7 +141,7 @@ static inline float3 Offset(float3 p, float3 pMin, float3 pMax) {
 static inline float Density(float3 Pobj, float3 pMin, float3 pMax, 
                            uniform float density[], uniform int nVoxels[3],
-                            reference uniform bool checkForSameVoxel) {
+                            uniform bool &checkForSameVoxel) {
    if (!Inside(Pobj, pMin, pMax)) 
        return 0;
    // Compute voxel coordinates and offsets for _Pobj_
@@ -155,8 +155,8 @@ static inline float Density(float3 Pobj, float3 pMin, float3 pMax,
    // Trilinearly interpolate density values to compute local density
    float d00, d10, d01, d11;
    uniform int uvx, uvy, uvz;
-    if (checkForSameVoxel && reduce_equal(vx, uvx) && reduce_equal(vy, uvy) &&
+    if (checkForSameVoxel && reduce_equal(vx, &uvx) && reduce_equal(vy, &uvy) &&
-        reduce_equal(vz, uvz)) {
+        reduce_equal(vz, &uvz)) {
        // If all of the program instances are inside the same voxel, then
        // we'll call the 'uniform' variant of the voxel density lookup
        // function, thus doing a single load for each value rather than a
--- a/examples/volume_rendering/volume.vcxproj
+++ b/examples/volume_rendering/volume.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -158,13 +158,13 @@
      <FileType>Document</FileType>
      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2 --opt=32-bit-addressing
 </Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
      <Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2
 </Command>
-      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2 --opt=32-bit-addressing
 </Command>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_ispc.h</Outputs>
--- a/expr.cpp
+++ b/expr.cpp
--- a/expr.h
+++ b/expr.h
@@ -65,6 +65,10 @@ public:
    /** Returns the Type of the expression. */
    virtual const Type *GetType() const = 0;
    /** Returns the type of the value returned by GetLValue(); this
        should be a pointer type of some sort (uniform or varying). */
    virtual const Type *GetLValueType() const;
    /** For expressions that have values based on a symbol (e.g. regular
        symbol references, array indexing, etc.), this returns a pointer to
        that symbol. */
@@ -266,11 +270,12 @@ public:
 */
 class IndexExpr : public Expr {
 public:
-    IndexExpr(Expr *arrayOrVector, Expr *index, SourcePos p);
+    IndexExpr(Expr *baseExpr, Expr *index, SourcePos p);
    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
    llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
    const Type *GetType() const;
    const Type *GetLValueType() const;
    Symbol *GetBaseSymbol() const;
    void Print() const;
@@ -278,7 +283,7 @@ public:
    Expr *TypeCheck();
    int EstimateCost() const;
-    Expr *arrayOrVector, *index;
+    Expr *baseExpr, *index;
 };
@@ -288,15 +293,13 @@ public:
 */
 class MemberExpr : public Expr {
 public:
-    static MemberExpr* create(Expr *expr, const char *identifier,
+    static MemberExpr *create(Expr *expr, const char *identifier,
-                              SourcePos pos, SourcePos identifierPos);
+                              SourcePos pos, SourcePos identifierPos,
-
+                              bool derefLvalue);
    MemberExpr(Expr *expr, const char *identifier, SourcePos pos, 
               SourcePos identifierPos);
    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
    llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
    const Type *GetType() const;
    const Type *GetLValueType() const;
    Symbol *GetBaseSymbol() const;
    void Print() const;
    Expr *Optimize();
@@ -310,6 +313,15 @@ public:
    Expr *expr;
    std::string identifier;
    const SourcePos identifierPos;
 protected:
    MemberExpr(Expr *expr, const char *identifier, SourcePos pos, 
               SourcePos identifierPos, bool derefLValue);
    /** Indicates whether the expression should be dereferenced before the
        member is found.  (i.e. this is true if the MemberExpr was a '->'
        operator, and is false if it was a '.' operator.) */
    bool dereferenceExpr;
 };
@@ -506,6 +518,7 @@ public:
    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
    const Type *GetType() const;
    const Type *GetLValueType() const;
    Symbol *GetBaseSymbol() const;
    void Print() const;
    Expr *TypeCheck();
@@ -525,6 +538,7 @@ public:
    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
    llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
    const Type *GetType() const;
    const Type *GetLValueType() const;
    Symbol *GetBaseSymbol() const;
    void Print() const;
    Expr *TypeCheck();
@@ -535,6 +549,44 @@ public:
 };
 /** Expression that represents taking the address of an expression. */
 class AddressOfExpr : public Expr {
 public:
    /** NOTE(review): presumably e is the operand whose address is taken
        and p its source position -- confirm against expr.cpp. */
    AddressOfExpr(Expr *e, SourcePos p);
    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
    const Type *GetType() const;
    Symbol *GetBaseSymbol() const;
    void Print() const;
    Expr *TypeCheck();
    Expr *Optimize();
    int EstimateCost() const;
    /** The expression whose address this node represents. */
    Expr *expr;
 };
 /** Expression that returns the size of the given expression or type in
    bytes. */
 class SizeOfExpr : public Expr {
 public:
    /** Size-of-expression form.  NOTE(review): presumably stores e in
        expr and leaves type NULL -- confirm against expr.cpp. */
    SizeOfExpr(Expr *e, SourcePos p);
    /** Size-of-type form.  NOTE(review): presumably stores t in type
        and leaves expr NULL -- confirm against expr.cpp. */
    SizeOfExpr(const Type *t, SourcePos p);
    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
    const Type *GetType() const;
    void Print() const;
    Expr *TypeCheck();
    Expr *Optimize();
    int EstimateCost() const;
    /* One of expr or type should be non-NULL (but not both of them).  The
       SizeOfExpr returns the size of whichever one of them isn't NULL. */
    Expr *expr;
    const Type *type;
 };
 /** @brief Expression representing a symbol reference in the program */
 class SymbolExpr : public Expr {
 public:
@@ -543,6 +595,7 @@ public:
    llvm::Value *GetValue(FunctionEmitContext *ctx) const;
    llvm::Value *GetLValue(FunctionEmitContext *ctx) const;
    const Type *GetType() const;
    const Type *GetLValueType() const;
    Symbol *GetBaseSymbol() const;
    Expr *TypeCheck();
    Expr *Optimize();
@@ -623,9 +676,13 @@ public:
 /** This function indicates whether it's legal to convert from fromType to
-    toType.
+    toType.  If the optional errorMsgBase and source position parameters
    are provided, then an error message is issued if the type conversion
    isn't possible.
 */
-bool CanConvertTypes(const Type *fromType, const Type *toType);
+bool CanConvertTypes(const Type *fromType, const Type *toType,
                     const char *errorMsgBase = NULL,
                     SourcePos pos = SourcePos());
 /** This function attempts to convert the given expression to the given
    type, returning a pointer to a new expression that is the result.  If
--- a/func.cpp
+++ b/func.cpp
@@ -74,10 +74,32 @@ Function::Function(Symbol *s, const std::vector<Symbol *> &a, Stmt *c) {
    maskSymbol = m->symbolTable->LookupVariable("__mask");
    assert(maskSymbol != NULL);
-    if (code) {
+    if (code != NULL) {
        if (g->debugPrint) {
            fprintf(stderr, "Creating function \"%s\".  Initial code:\n", 
                    sym->name.c_str());
            code->Print(0);
            fprintf(stderr, "---------------------\n");
        }
        code = code->TypeCheck();
-        if (code)
+
        if (code != NULL && g->debugPrint) {
            fprintf(stderr, "After typechecking function \"%s\":\n", 
                    sym->name.c_str());
            code->Print(0);
            fprintf(stderr, "---------------------\n");
        }
        if (code != NULL) {
            code = code->Optimize();
            if (g->debugPrint) {
                fprintf(stderr, "After optimizing function \"%s\":\n", 
                        sym->name.c_str());
                code->Print(0);
                fprintf(stderr, "---------------------\n");
            }
        }
    }
    if (g->debugPrint) {
@@ -149,11 +171,11 @@ lCopyInTaskParameter(int i, llvm::Value *structArgPtr, const std::vector<Symbol
    sym->storagePtr = ctx->AllocaInst(argType, sym->name.c_str());
    // get a pointer to the value in the struct
-    llvm::Value *ptr = ctx->GetElementPtrInst(structArgPtr, 0, i, sym->name.c_str());
+    llvm::Value *ptr = ctx->AddElementOffset(structArgPtr, i, NULL, sym->name.c_str());
    // and copy the value from the struct and into the local alloca'ed
    // memory
-    llvm::Value *ptrval = ctx->LoadInst(ptr, NULL, NULL, sym->name.c_str());
+    llvm::Value *ptrval = ctx->LoadInst(ptr, sym->name.c_str());
    ctx->StoreInst(ptrval, sym->storagePtr);
    ctx->EmitFunctionParameterDebugInfo(sym);
 }
@@ -200,9 +222,9 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
        // Copy in the mask as well.
        int nArgs = (int)args.size();
        // The mask is the last parameter in the argument structure
-        llvm::Value *ptr = ctx->GetElementPtrInst(structParamPtr, 0, nArgs,
+        llvm::Value *ptr = ctx->AddElementOffset(structParamPtr, nArgs, NULL,
                                                  "task_struct_mask");
-        llvm::Value *ptrval = ctx->LoadInst(ptr, NULL, NULL, "mask");
+        llvm::Value *ptrval = ctx->LoadInst(ptr, "mask");
        ctx->SetFunctionMask(ptrval);
        // Copy threadIndex and threadCount into stack-allocated storage so
@@ -236,7 +258,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
        }
        // If the number of actual function arguments is equal to the
-        // number of declared arguments in decl->functionArgs, then we
+        // number of declared arguments in decl->functionParams, then we
        // don't have a mask parameter, so set it to be all on.  This
        // happens for example with 'export'ed functions that the app
        // calls.
@@ -338,11 +360,8 @@ Function::GenerateIR() {
    if (m->errorCount == 0) {
        if (llvm::verifyFunction(*function, llvm::ReturnStatusAction) == true) {
-            if (g->debugPrint) {
+            if (g->debugPrint)
-                llvm::PassManager ppm;
+                function->dump();
                ppm.add(llvm::createPrintModulePass(&llvm::outs()));
                ppm.run(*m->module);
            }
            FATAL("Function verificication failed");
        }
@@ -376,11 +395,8 @@ Function::GenerateIR() {
                        sym->exportedFunction = appFunction;
                        if (llvm::verifyFunction(*appFunction, 
                                                 llvm::ReturnStatusAction) == true) {
-                            if (g->debugPrint) {
+                            if (g->debugPrint)
-                                llvm::PassManager ppm;
+                                appFunction->dump();
                                ppm.add(llvm::createPrintModulePass(&llvm::outs()));
                                ppm.run(*m->module);
                            }
                            FATAL("Function verificication failed");
                        }
                    }
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -171,7 +171,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
    if (!error) {
        llvm::TargetMachine *targetMachine = t->GetTargetMachine();
        const llvm::TargetData *targetData = targetMachine->getTargetData();
-        t->is32bit = (targetData->getPointerSize() == 4);
+        t->is32Bit = (targetData->getPointerSize() == 4);
    }
    return !error;
@@ -284,8 +284,11 @@ llvm::Value *
 Target::SizeOf(LLVM_TYPE_CONST llvm::Type *type) {
    const llvm::TargetData *td = GetTargetMachine()->getTargetData();
    assert(td != NULL);
-    return is32bit ? LLVMInt32(td->getTypeSizeInBits(type) / 8) :
+    uint64_t byteSize = td->getTypeSizeInBits(type) / 8;
-        LLVMInt64(td->getTypeSizeInBits(type) / 8);
+    if (is32Bit || g->opt.force32BitAddressing)
        return LLVMInt32(byteSize);
    else
        return LLVMInt64(byteSize);
 }
@@ -298,7 +301,12 @@ Target::StructOffset(LLVM_TYPE_CONST llvm::Type *type, int element) {
    assert(structType != NULL);
    const llvm::StructLayout *sl = td->getStructLayout(structType);
    assert(sl != NULL);
-    return LLVMInt32(sl->getElementOffset(element));
+
    uint64_t offset = sl->getElementOffset(element);
    if (is32Bit || g->opt.force32BitAddressing)
        return LLVMInt32(offset);
    else
        return LLVMInt64(offset);
 }
@@ -309,6 +317,7 @@ Opt::Opt() {
    level = 1;
    fastMath = false;
    fastMaskedVload = false;
    force32BitAddressing = false;
    unrollLoops = true;
    disableAsserts = false;
    disableHandlePseudoMemoryOps = false;
--- a/ispc.h
+++ b/ispc.h
@@ -187,7 +187,7 @@ struct Target {
    std::string arch;
    /** Is the target architecture 32 or 64 bit */
-    bool is32bit;
+    bool is32Bit;
    /** Target CPU. (e.g. "corei7", "corei7-avx", ..) */
    std::string cpu;
@@ -237,6 +237,12 @@ struct Opt {
        it will make sense. */
    bool unrollLoops;
    /** Indicates if addressing math will be done with 32-bit math, even on
        64-bit systems.  (This is generally noticeably more efficient,
        though at the cost of addressing >2GB).
     */ 
    bool force32BitAddressing;
    /** Indicates whether assert() statements should be ignored (for
        performance in the generated code). */
    bool disableAsserts;
--- a/lex.ll
+++ b/lex.ll
@@ -112,9 +112,12 @@ int64 { return TOKEN_INT64; }
 launch { return TOKEN_LAUNCH; }
 NULL { return TOKEN_NULL; }
 print { return TOKEN_PRINT; }
-reference { return TOKEN_REFERENCE; }
+reference { Error(*yylloc, "\"reference\" qualifier is no longer supported; "
                           "please use C++-style '&' syntax for references "
                           "instead."); }
 return { return TOKEN_RETURN; }
 soa { return TOKEN_SOA; }
 sizeof { return TOKEN_SIZEOF; }
 static { return TOKEN_STATIC; }
 struct { return TOKEN_STRUCT; }
 switch { return TOKEN_SWITCH; }
@@ -223,6 +226,7 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
 "&=" { return TOKEN_AND_ASSIGN; }
 "^=" { return TOKEN_XOR_ASSIGN; }
 "|=" { return TOKEN_OR_ASSIGN; }
 "->" { return TOKEN_PTR_OP; }
 ";"             { return ';'; }
 ("{"|"<%")      { return '{'; }
 ("}"|"%>")      { return '}'; }
@@ -266,8 +270,6 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
 %%
 /*sizeof { return TOKEN_SIZEOF; }*/
 /*"->" { return TOKEN_PTR_OP; }*/
 /*short { return TOKEN_SHORT; }*/
 /*long { return TOKEN_LONG; }*/
 /*signed { return TOKEN_SIGNED; }*/
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -40,6 +40,7 @@
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::VoidType = NULL;
 LLVM_TYPE_CONST llvm::PointerType *LLVMTypes::VoidPointerType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::PointerIntType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::BoolType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int8Type = NULL;
@@ -74,7 +75,7 @@ LLVM_TYPE_CONST llvm::Type *LLVMTypes::Int64VectorPointerType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
 LLVM_TYPE_CONST llvm::Type *LLVMTypes::DoubleVectorPointerType = NULL;
-LLVM_TYPE_CONST llvm::ArrayType *LLVMTypes::VoidPointerVectorType = NULL;
+LLVM_TYPE_CONST llvm::VectorType *LLVMTypes::VoidPointerVectorType = NULL;
 llvm::Constant *LLVMTrue = NULL;
 llvm::Constant *LLVMFalse = NULL;
@@ -86,6 +87,8 @@ void
 InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
    LLVMTypes::VoidType = llvm::Type::getVoidTy(*ctx);
    LLVMTypes::VoidPointerType = llvm::PointerType::get(llvm::Type::getInt8Ty(*ctx), 0);
    LLVMTypes::PointerIntType = target.is32Bit ? llvm::Type::getInt32Ty(*ctx) :
        llvm::Type::getInt64Ty(*ctx);
    LLVMTypes::BoolType = llvm::Type::getInt1Ty(*ctx);
    LLVMTypes::Int8Type = llvm::Type::getInt8Ty(*ctx);
@@ -130,8 +133,8 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
    LLVMTypes::FloatVectorPointerType = llvm::PointerType::get(LLVMTypes::FloatVectorType, 0);
    LLVMTypes::DoubleVectorPointerType = llvm::PointerType::get(LLVMTypes::DoubleVectorType, 0);
-    LLVMTypes::VoidPointerVectorType = 
+    LLVMTypes::VoidPointerVectorType = g->target.is32Bit ? LLVMTypes::Int32VectorType :
-        llvm::ArrayType::get(LLVMTypes::VoidPointerType, target.vectorWidth);
+        LLVMTypes::Int64VectorType;
    LLVMTrue = llvm::ConstantInt::getTrue(*ctx);
    LLVMFalse = llvm::ConstantInt::getFalse(*ctx);
@@ -451,11 +454,3 @@ LLVMBoolVector(const bool *bvec) {
    }
    return llvm::ConstantVector::get(vals);
 }
 /** Builds the LLVM type used to hold one pointer-to-t per program
     instance: an array with g->target.vectorWidth elements, each of
     which is a pointer to t in address space 0. */
 LLVM_TYPE_CONST llvm::ArrayType *
 LLVMPointerVectorType(LLVM_TYPE_CONST llvm::Type *t) {
    // NOTE: ArrayType, not VectorType
    return llvm::ArrayType::get(llvm::PointerType::get(t, 0), 
                                g->target.vectorWidth);
 }
--- a/llvmutil.h
+++ b/llvmutil.h
@@ -52,6 +52,7 @@
 struct LLVMTypes {
    static LLVM_TYPE_CONST llvm::Type *VoidType;
    static LLVM_TYPE_CONST llvm::PointerType *VoidPointerType;
    static LLVM_TYPE_CONST llvm::Type *PointerIntType;
    static LLVM_TYPE_CONST llvm::Type *BoolType;
    static LLVM_TYPE_CONST llvm::Type *Int8Type;
@@ -86,7 +87,7 @@ struct LLVMTypes {
    static LLVM_TYPE_CONST llvm::Type *FloatVectorPointerType;
    static LLVM_TYPE_CONST llvm::Type *DoubleVectorPointerType;
-    static LLVM_TYPE_CONST llvm::ArrayType *VoidPointerVectorType;
+    static LLVM_TYPE_CONST llvm::VectorType *VoidPointerVectorType;
 };
 /** These variables hold the corresponding LLVM constant values as a
@@ -204,10 +205,4 @@ extern llvm::Constant *LLVMMaskAllOn;
 /** LLVM constant value representing an 'all off' SIMD lane mask */
 extern llvm::Constant *LLVMMaskAllOff;
 /** Given an LLVM type, returns the corresponding type for a vector of
    pointers to that type.  (In practice, an array of pointers, since LLVM
    prohibits vectors of pointers.)
 */
 extern LLVM_TYPE_CONST llvm::ArrayType *LLVMPointerVectorType(LLVM_TYPE_CONST llvm::Type *t);
 #endif // ISPC_LLVMUTIL_H
--- a/main.cpp
+++ b/main.cpp
@@ -83,6 +83,7 @@ static void usage(int ret) {
    printf("    [-o <name>/--outfile=<name>]\tOutput filename (may be \"-\" for standard output)\n");
    printf("    [-O0/-O1]\t\t\t\tSet optimization level (-O1 is default)\n");
    printf("    [--opt=<option>]\t\t\tSet optimization option\n");
    printf("        32-bit-addressing\t\tUse 32-bit math for addressing calculations even on 64-bit targets.\n");
    printf("        disable-assertions\t\tRemove assertion statements from final code.\n");
    printf("        disable-loop-unroll\t\tDisable loop unrolling.\n");
    printf("        fast-masked-vload\t\tFaster masked vector loads on SSE (may go past end of array)\n");
@@ -248,6 +249,8 @@ int main(int Argc, char *Argv[]) {
                g->opt.fastMath = true;
            else if (!strcmp(opt, "fast-masked-vload"))
                g->opt.fastMaskedVload = true;
            else if (!strcmp(opt, "32-bit-addressing"))
                g->opt.force32BitAddressing = true;
            else if (!strcmp(opt, "disable-assertions"))
                g->opt.disableAsserts = true;
            else if (!strcmp(opt, "disable-loop-unroll"))
--- a/module.cpp
+++ b/module.cpp
@@ -250,6 +250,8 @@ Module::AddGlobalVariable(Symbol *sym, Expr *initExpr, bool isConst) {
    }
    LLVM_TYPE_CONST llvm::Type *llvmType = sym->type->LLVMType(g->ctx);
    if (llvmType == NULL)
        return;
    // See if we have an initializer expression for the global.  If so,
    // make sure it's a compile-time constant!
@@ -365,12 +367,12 @@ lCheckForVaryingParameter(const Type *type, const std::string &name,
 */
 static void
 lCheckForStructParameters(const FunctionType *ftype, SourcePos pos) {
-    const std::vector<const Type *> &argTypes = ftype->GetArgumentTypes();
+    for (int i = 0; i < ftype->GetNumParameters(); ++i) {
-    for (unsigned int i = 0; i < argTypes.size(); ++i) {
+        const Type *type = ftype->GetParameterType(i);
        const Type *type = argTypes[i];
        if (dynamic_cast<const StructType *>(type) != NULL) {
-            Error(pos, "Passing structs to/from application functions is currently broken. "
+            Error(pos, "Passing structs to/from application functions is "
-                  "Use a reference or const reference instead for now.");
+                  "currently broken. Use a pointer or const pointer to the "
                  "struct instead for now.");
            return;
        }
    }
@@ -483,27 +485,32 @@ Module::AddFunctionDeclaration(Symbol *funSym, bool isInline) {
    bool seenDefaultArg = false;
    int nArgs = functionType->GetNumParameters();
    for (int i = 0; i < nArgs; ++i) {
-        const Type *argType = (functionType->GetArgumentTypes())[i];
+        const Type *argType = functionType->GetParameterType(i);
-        const std::string &argName = functionType->GetArgumentName(i);
+        const std::string &argName = functionType->GetParameterName(i);
-        ConstExpr *defaultValue = (functionType->GetArgumentDefaults())[i];
+        ConstExpr *defaultValue = functionType->GetParameterDefault(i);
-        const SourcePos &argPos = (functionType->GetArgumentSourcePos())[i];
+        const SourcePos &argPos = functionType->GetParameterSourcePos(i);
        // If the function is exported, make sure that the parameter
        // doesn't have any varying stuff going on in it.
        if (funSym->storageClass == SC_EXPORT)
            lCheckForVaryingParameter(argType, argName, argPos);
-        // ISPC assumes that all memory passed in is aligned to the native
+        // ISPC assumes that no pointers alias.  (It should be possible to
        // width and that no pointers alias.  (It should be possible to
        // specify when this is not the case, but this should be the
-        // default.)  Set parameter attributes accordingly.
+        // default.)  Set parameter attributes accordingly.  (Only for
        // uniform pointers, since varying pointers are int vectors...)
        if (!functionType->isTask && 
-            dynamic_cast<const ReferenceType *>(argType) != NULL) {
+            ((dynamic_cast<const PointerType *>(argType) != NULL &&
              argType->IsUniformType()) ||
             dynamic_cast<const ReferenceType *>(argType) != NULL)) {
            // NOTE: LLVM indexes function parameters starting from 1.
            // This is unintuitive.
            function->setDoesNotAlias(i+1, true);
 #if 0
            int align = 4 * RoundUpPow2(g->target.nativeVectorWidth);
            function->addAttribute(i+1, llvm::Attribute::constructAlignmentFromInt(align));
 #endif
        }
        if (symbolTable->LookupFunction(argName.c_str()) != NULL)
@@ -887,6 +894,9 @@ lGetExportedTypes(const Type *type,
    if (dynamic_cast<const ReferenceType *>(type) != NULL)
        lGetExportedTypes(type->GetReferenceTarget(), exportedStructTypes, 
                          exportedEnumTypes, exportedVectorTypes);
    else if (dynamic_cast<const PointerType *>(type) != NULL)
        lGetExportedTypes(type->GetBaseType(), exportedStructTypes,
                          exportedEnumTypes, exportedVectorTypes);
    else if (arrayType != NULL)
        lGetExportedTypes(arrayType->GetElementType(), exportedStructTypes, 
                          exportedEnumTypes, exportedVectorTypes);
@@ -920,9 +930,8 @@ lGetExportedParamTypes(const std::vector<Symbol *> &funcs,
                          exportedEnumTypes, exportedVectorTypes);
        // And now the parameter types...
-        const std::vector<const Type *> &argTypes = ftype->GetArgumentTypes();
+        for (int j = 0; j < ftype->GetNumParameters(); ++j)
-        for (unsigned int j = 0; j < argTypes.size(); ++j)
+            lGetExportedTypes(ftype->GetParameterType(j), exportedStructTypes,
            lGetExportedTypes(argTypes[j], exportedStructTypes,
                              exportedEnumTypes, exportedVectorTypes);
    }
 }
--- a/opt.cpp
+++ b/opt.cpp
--- a/parse.yy
+++ b/parse.yy
@@ -104,14 +104,14 @@ static const char *lBuiltinTokens[] = {
    "cif", "cwhile", "const", "continue", "creturn", "default", "do", "double", 
    "else", "enum", "export", "extern", "false", "float", "for", "goto", "if",
    "inline", "int", "int8", "int16", "int32", "int64", "launch", "NULL",
-    "print", "reference", "return",
+    "print", "return", "sizeof",
    "static", "struct", "switch", "sync", "task", "true", "typedef", "uniform",
    "unsigned", "varying", "void", "while", NULL 
 };
 static const char *lParamListTokens[] = {
    "bool", "const", "double", "enum", "false", "float", "int",
-    "int8", "int16", "int32", "int64", "reference", "struct", "true",
+    "int8", "int16", "int32", "int64", "struct", "true",
    "uniform", "unsigned", "varying", "void", NULL 
 };
@@ -152,12 +152,13 @@ static const char *lParamListTokens[] = {
 %token TOKEN_AND_OP TOKEN_OR_OP TOKEN_MUL_ASSIGN TOKEN_DIV_ASSIGN TOKEN_MOD_ASSIGN 
 %token TOKEN_ADD_ASSIGN TOKEN_SUB_ASSIGN TOKEN_LEFT_ASSIGN TOKEN_RIGHT_ASSIGN 
 %token TOKEN_AND_ASSIGN TOKEN_OR_ASSIGN TOKEN_XOR_ASSIGN
 %token TOKEN_SIZEOF
 %token TOKEN_EXTERN TOKEN_EXPORT TOKEN_STATIC TOKEN_INLINE TOKEN_TASK 
 %token TOKEN_UNIFORM TOKEN_VARYING TOKEN_TYPEDEF TOKEN_SOA
 %token TOKEN_CHAR TOKEN_INT TOKEN_UNSIGNED TOKEN_FLOAT TOKEN_DOUBLE
 %token TOKEN_INT8 TOKEN_INT16 TOKEN_INT64 TOKEN_CONST TOKEN_VOID TOKEN_BOOL 
-%token TOKEN_ENUM TOKEN_STRUCT TOKEN_TRUE TOKEN_FALSE TOKEN_REFERENCE
+%token TOKEN_ENUM TOKEN_STRUCT TOKEN_TRUE TOKEN_FALSE
 %token TOKEN_CASE TOKEN_DEFAULT TOKEN_IF TOKEN_ELSE TOKEN_SWITCH
 %token TOKEN_WHILE TOKEN_DO TOKEN_LAUNCH
@@ -183,7 +184,8 @@ static const char *lParamListTokens[] = {
 %type <declaration> declaration parameter_declaration
 %type <declarators> init_declarator_list 
 %type <declarationList> parameter_list parameter_type_list
-%type <declarator> declarator pointer init_declarator direct_declarator struct_declarator
+%type <declarator> declarator pointer reference
 %type <declarator> init_declarator direct_declarator struct_declarator
 %type <declarator> abstract_declarator direct_abstract_declarator
 %type <structDeclaratorList> struct_declarator_list
@@ -289,10 +291,9 @@ postfix_expression
      { $$ = new FunctionCallExpr($1, $3, Union(@1,@4)); }
    | launch_expression
    | postfix_expression '.' TOKEN_IDENTIFIER
-      { $$ = MemberExpr::create($1, yytext, Union(@1,@3), @3); }
+      { $$ = MemberExpr::create($1, yytext, Union(@1,@3), @3, false); }
-/*    | postfix_expression TOKEN_PTR_OP TOKEN_IDENTIFIER
+    | postfix_expression TOKEN_PTR_OP TOKEN_IDENTIFIER
-      { UNIMPLEMENTED }
+      { $$ = MemberExpr::create($1, yytext, Union(@1,@3), @3, true); }
 */
    | postfix_expression TOKEN_INC_OP
      { $$ = new UnaryExpr(UnaryExpr::PostInc, $1, Union(@1,@2)); }
    | postfix_expression TOKEN_DEC_OP
@@ -317,6 +318,10 @@ unary_expression
      { $$ = new UnaryExpr(UnaryExpr::PreInc, $2, Union(@1, @2)); }
    | TOKEN_DEC_OP unary_expression   
      { $$ = new UnaryExpr(UnaryExpr::PreDec, $2, Union(@1, @2)); }
    | '&' unary_expression
      { $$ = new AddressOfExpr($2, Union(@1, @2)); }
    | '*' unary_expression
      { $$ = new DereferenceExpr($2, Union(@1, @2)); }
    | '+' cast_expression 
      { $$ = $2; }
    | '-' cast_expression 
@@ -325,6 +330,10 @@ unary_expression
      { $$ = new UnaryExpr(UnaryExpr::BitNot, $2, Union(@1, @2)); }
    | '!' cast_expression 
      { $$ = new UnaryExpr(UnaryExpr::LogicalNot, $2, Union(@1, @2)); }
    | TOKEN_SIZEOF unary_expression
      { $$ = new SizeOfExpr($2, Union(@1, @2)); }
    | TOKEN_SIZEOF '(' type_name ')'
      { $$ = new SizeOfExpr($3, Union(@1, @4)); }
    ;
 cast_expression
@@ -711,8 +720,6 @@ specifier_qualifier_list
                $$ = $2->GetAsUniformType();
            else if ($1 == TYPEQUAL_VARYING)
                $$ = $2->GetAsVaryingType();
            else if ($1 == TYPEQUAL_REFERENCE)
                $$ = new ReferenceType($2, false);
            else if ($1 == TYPEQUAL_CONST)
                $$ = $2->GetAsConstType();
            else if ($1 == TYPEQUAL_UNSIGNED) {
@@ -860,7 +867,6 @@ type_qualifier
    | TOKEN_VARYING    { $$ = TYPEQUAL_VARYING; }
    | TOKEN_TASK       { $$ = TYPEQUAL_TASK; }
    | TOKEN_INLINE     { $$ = TYPEQUAL_INLINE; }
    | TOKEN_REFERENCE  { $$ = TYPEQUAL_REFERENCE; }
    | TOKEN_UNSIGNED   { $$ = TYPEQUAL_UNSIGNED; }
    ;
@@ -884,6 +890,14 @@ declarator
        tail->child = $2;
        $$ = $1;
    }
    | reference direct_declarator
    {
        Declarator *tail = $1;
        while (tail->child != NULL)
           tail = tail->child;
        tail->child = $2;
        $$ = $1;
    }
    | direct_declarator
    ;
@@ -930,7 +944,7 @@ direct_declarator
          if ($1 != NULL) {
              Declarator *d = new Declarator(DK_FUNCTION, Union(@1, @4));
              d->child = $1;
-              d->functionArgs = *$3;
+              if ($3 != NULL) d->functionParams = *$3;
              $$ = d;
          }
          else
@@ -976,6 +990,14 @@ pointer
    ;
 /* A reference declarator: a single '&' token produces a new
    DK_REFERENCE Declarator whose source position is that of the '&'. */
 reference
    : '&' 
    {
        $$ = new Declarator(DK_REFERENCE, @1); 
    }
    ;
 parameter_type_list
    : parameter_list { $$ = $1; }
    ;
@@ -1067,6 +1089,17 @@ abstract_declarator
          d->child = $2;
          $$ = d;
      }
    | reference
      {
          Declarator *d = new Declarator(DK_REFERENCE, @1);
          $$ = d;
      }
    | reference direct_abstract_declarator
      {
          Declarator *d = new Declarator(DK_REFERENCE, Union(@1, @2));
          d->child = $2;
          $$ = d;
      }
    ;
 direct_abstract_declarator
@@ -1113,7 +1146,7 @@ direct_abstract_declarator
    | '(' parameter_type_list ')'
      {
          // Abstract function declarator with no child declarator: just a
          // parenthesized parameter list.
          Declarator *d = new Declarator(DK_FUNCTION, Union(@1, @3));
-          d->functionArgs = *$2;
+          if ($2 != NULL) d->functionParams = *$2;
          // Publish the new declarator as this rule's semantic value, as
          // the other DK_FUNCTION alternatives do; without this assignment
          // the rule's value is not the Declarator built here.
          $$ = d;
      }
    | direct_abstract_declarator '(' ')'
      {
@@ -1125,7 +1158,7 @@ direct_abstract_declarator
      {
          Declarator *d = new Declarator(DK_FUNCTION, Union(@1, @4));
          d->child = $1;
-          d->functionArgs = *$3;
+          if ($3 != NULL) d->functionParams = *$3;
          $$ = d;
      }
    ;
@@ -1370,9 +1403,9 @@ function_definition
    } 
    compound_statement
    {
        Symbol *sym;
        std::vector<Symbol *> args;
-        $2->GetFunctionInfo($1, &sym, &args);
+        Symbol *sym = $2->GetFunctionInfo($1, &args);
        if (sym != NULL)
            m->AddFunctionDefinition(sym, args, $4);
        m->symbolTable->PopScope(); // push in lAddFunctionParams();
    }
@@ -1397,14 +1430,12 @@ lAddDeclaration(DeclSpecs *ds, Declarator *decl) {
    if (ds->storageClass == SC_TYPEDEF)
        m->AddTypeDef(decl->GetSymbol());
-    else if (decl->kind == DK_FUNCTION) {
+    else {
        // function declaration
        const Type *t = decl->GetType(ds);
        if (t == NULL)
            return;
        const FunctionType *ft = dynamic_cast<const FunctionType *>(t);
-        assert(ft != NULL);
+        if (ft != NULL) {
            Symbol *funSym = decl->GetSymbol();
            assert(funSym != NULL);
            funSym->type = ft;
@@ -1416,6 +1447,7 @@ lAddDeclaration(DeclSpecs *ds, Declarator *decl) {
        else
            m->AddGlobalVariable(decl->GetSymbol(), decl->initExpr,
                                 (ds->typeQualifiers & TYPEQUAL_CONST) != 0);
    }
 }
@@ -1426,9 +1458,14 @@ static void
 lAddFunctionParams(Declarator *decl) {
    m->symbolTable->PushScope();
-    // wire up arguments
+    // walk down to the declarator for the function itself 
-    for (unsigned int i = 0; i < decl->functionArgs.size(); ++i) {
+    while (decl->kind != DK_FUNCTION && decl->child != NULL)
-        Declaration *pdecl = decl->functionArgs[i];
+        decl = decl->child;
    assert(decl->kind == DK_FUNCTION);
    // now loop over its parameters and add them to the symbol table
    for (unsigned int i = 0; i < decl->functionParams.size(); ++i) {
        Declaration *pdecl = decl->functionParams[i];
        if (pdecl == NULL)
            continue;
        assert(pdecl->declarators.size() == 1);
--- a/run_tests.py
+++ b/run_tests.py
@@ -40,7 +40,8 @@ parser.add_option('-o', '--no-opt', dest='no_opt', help='Disable optimization',
 # if no specific test files are specified, run all of the tests in tests/
 # and failing_tests/
 if len(args) == 0:
-    files = glob.glob("tests/*ispc") + glob.glob("failing_tests/*ispc")
+    files = glob.glob("tests/*ispc") + glob.glob("failing_tests/*ispc") + \
        glob.glob("tests_errors/*ispc")
 else:
    files = args
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -319,85 +319,89 @@ static inline uniform int lanemask() {
 // AOS/SOA conversion
 static inline void
-aos_to_soa3(uniform float a[], uniform int offset, reference float v0,
+aos_to_soa3(uniform float a[], uniform int offset, float * uniform v0,
-            reference float v1, reference float v2) {
+            float * uniform v1, float * uniform v2) {
-    __aos_to_soa3_float(a, offset, v0, v1, v2);
+    __aos_to_soa3_float(&a[0], offset, v0, v1, v2);
 }
 static inline void
 soa_to_aos3(float v0, float v1, float v2, uniform float a[], 
            uniform int offset) {
-    __soa_to_aos3_float(v0, v1, v2, a, offset);
+    __soa_to_aos3_float(v0, v1, v2, &a[0], offset);
 }
 static inline void
-aos_to_soa4(uniform float a[], uniform int offset, reference float v0,
+aos_to_soa4(uniform float a[], uniform int offset, float * uniform v0,
-            reference float v1, reference float v2, reference float v3) {
+            float * uniform v1, float * uniform v2, float * uniform v3) {
-    __aos_to_soa4_float(a, offset, v0, v1, v2, v3);
+    __aos_to_soa4_float(&a[0], offset, v0, v1, v2, v3);
 }
 static inline void
 soa_to_aos4(float v0, float v1, float v2, float v3, uniform float a[], 
            uniform int offset) {
-    __soa_to_aos4_float(v0, v1, v2, v3, a, offset);
+    __soa_to_aos4_float(v0, v1, v2, v3, &a[0], offset);
 }
 static inline void
-aos_to_soa3(uniform int32 a[], uniform int offset, reference int32 v0,
+aos_to_soa3(uniform int32 a[], uniform int offset, int32 * uniform v0,
-            reference int32 v1, reference int32 v2) {
+            int32 * uniform v1, int32 * uniform v2) {
-    __aos_to_soa3_int32(a, offset, v0, v1, v2);
+    __aos_to_soa3_int32(&a[0], offset, v0, v1, v2);
 }
 static inline void
 soa_to_aos3(int32 v0, int32 v1, int32 v2, uniform int32 a[], 
            uniform int offset) {
-    __soa_to_aos3_int32(v0, v1, v2, a, offset);
+    __soa_to_aos3_int32(v0, v1, v2, &a[0], offset);
 }
 static inline void
-aos_to_soa4(uniform int32 a[], uniform int offset, reference int32 v0,
+aos_to_soa4(uniform int32 a[], uniform int offset, int32 * uniform v0,
-            reference int32 v1, reference int32 v2, reference int32 v3) {
+            int32 * uniform v1, int32 * uniform v2, int32 * uniform v3) {
-    __aos_to_soa4_int32(a, offset, v0, v1, v2, v3);
+    __aos_to_soa4_int32(&a[0], offset, v0, v1, v2, v3);
 }
 static inline void
 soa_to_aos4(int32 v0, int32 v1, int32 v2, int32 v3, uniform int32 a[], 
            uniform int offset) {
-    __soa_to_aos4_int32(v0, v1, v2, v3, a, offset);
+    __soa_to_aos4_int32(v0, v1, v2, v3, &a[0], offset);
 }
 ///////////////////////////////////////////////////////////////////////////
 // Prefetching
-#define PREFETCHES(NAME, TYPE)                                  \
+static inline void prefetch_l1(const void * uniform ptr) {
-static inline void prefetch_l1(const reference TYPE ptr) {      \
+    __prefetch_read_uniform_1((uniform int8 * uniform)ptr);
    __prefetch_read_1_##NAME##_refsconst(ptr);                  \
 }                                                               \
 static inline void prefetch_l2(const reference TYPE ptr) {      \
    __prefetch_read_2_##NAME##_refsconst(ptr);                  \
 }                                                               \
 static inline void prefetch_l3(const reference TYPE ptr) {      \
    __prefetch_read_3_##NAME##_refsconst(ptr);                  \
 }                                                               \
 static inline void prefetch_nt(const reference TYPE ptr) {     \
     __prefetch_read_nt_##NAME##_refsconst(ptr);                \
 }
-PREFETCHES(uniform_int8, uniform int8)
+static inline void prefetch_l2(const void * uniform ptr) {
-PREFETCHES(uniform_int16, uniform int16)
+    __prefetch_read_uniform_2((uniform int8 * uniform)ptr);
-PREFETCHES(uniform_int32, uniform int32)
+}
 PREFETCHES(uniform_int64, uniform int64)
 PREFETCHES(uniform_float, uniform float)
 PREFETCHES(uniform_double, uniform double)
-PREFETCHES(varying_int8, int8)
+static inline void prefetch_l3(const void * uniform ptr) {
-PREFETCHES(varying_int16, int16)
+    __prefetch_read_uniform_3((uniform int8 * uniform)ptr);
-PREFETCHES(varying_int32, int32)
+}
 PREFETCHES(varying_int64, int64)
 PREFETCHES(varying_float, float)
 PREFETCHES(varying_double, double)
-#undef PREFETCHES
+static inline void prefetch_nt(const void * uniform ptr) {
     __prefetch_read_uniform_nt((uniform int8 * uniform)ptr);
 }
 #if 0
 static inline void prefetch_l1(const void * varying ptr) {
    __prefetch_read_varying_1((varying int8 * varying)ptr);
 }
 static inline void prefetch_l2(const void * varying ptr) {
    __prefetch_read_varying_2((varying int8 * varying)ptr);
 }
 static inline void prefetch_l3(const void * varying ptr) {
    __prefetch_read_varying_3((varying int8 * varying)ptr);
 }
 static inline void prefetch_nt(const void * varying ptr) {
     __prefetch_read_varying_nt((varying int8 * varying)ptr);
 }
 #endif
 ///////////////////////////////////////////////////////////////////////////
 // Horizontal ops / reductions
@@ -525,9 +529,9 @@ static inline uniform unsigned int64 reduce_max(unsigned int64 v) {
 #define REDUCE_EQUAL(TYPE, FUNCTYPE, MASKTYPE)                     \
 static inline uniform bool reduce_equal(TYPE v) {                  \
    uniform TYPE unusedValue;                                      \
-    return __reduce_equal_##FUNCTYPE(v, unusedValue, (MASKTYPE)__mask); \
+    return __reduce_equal_##FUNCTYPE(v, &unusedValue, (MASKTYPE)__mask); \
 }                                                                  \
-static inline uniform bool reduce_equal(TYPE v, reference uniform TYPE value) { \
+static inline uniform bool reduce_equal(TYPE v, uniform TYPE * uniform value) { \
    return __reduce_equal_##FUNCTYPE(v, value, (MASKTYPE)__mask);       \
 }
@@ -599,26 +603,26 @@ static unsigned int64 exclusive_scan_or(unsigned int64 v) {
 static inline uniform int 
 packed_load_active(uniform unsigned int a[], uniform int start,
-                   reference unsigned int vals) {
+                   unsigned int * uniform vals) {
-    return __packed_load_active(a, (unsigned int)start, vals,
+    return __packed_load_active(&a[0], (unsigned int)start, vals,
                                (unsigned int32)__mask);
 }
 static inline uniform int
 packed_store_active(uniform unsigned int a[], uniform int start,
                    unsigned int vals) {
-    return __packed_store_active(a, (unsigned int)start, vals,
+    return __packed_store_active(&a[0], (unsigned int)start, vals,
                                 (unsigned int32)__mask);
 }
 static inline uniform int packed_load_active(uniform int a[], uniform int start,
-                                             reference int vals) {
+                                             int * uniform vals) {
-    return __packed_load_active(a, start, vals, (int32)__mask);
+    return __packed_load_active(&a[0], start, vals, (int32)__mask);
 }
 static inline uniform int packed_store_active(uniform int a[], uniform int start,
                                              int vals) {
-    return __packed_store_active(a, start, vals, (int32)__mask);
+    return __packed_store_active(&a[0], start, vals, (int32)__mask);
 }
 ///////////////////////////////////////////////////////////////////////////
@@ -636,35 +640,35 @@ static inline void memory_barrier() {
 }
 #define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE)                        \
-static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \
+static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
    memory_barrier();                                                   \
-    TA ret = __atomic_##OPB##_##TB##_global(ref, value, (MASKTYPE)__mask); \
+    TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
    memory_barrier();                                                   \
    return ret;                                                         \
 }                                                                       \
-static inline uniform TA atomic_##OPA##_global(uniform reference TA ref, \
+static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                               uniform TA value) {      \
    memory_barrier();                                                   \
-    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ref, value, (MASKTYPE)__mask); \
+    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, (MASKTYPE)__mask); \
    memory_barrier();                                                   \
    return ret;                                                         \
 }
 #define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB, MASKTYPE)                \
-static inline TA atomic_##OPA##_global(uniform reference TA ref, TA value) { \
+static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
    uniform TA oneval = reduce_##OPA(value);                            \
    TA ret;                                                             \
    if (lanemask() != 0) {                                              \
        memory_barrier();                                               \
-        ret = __atomic_##OPB##_uniform_##TB##_global(ref, oneval, (MASKTYPE)__mask); \
+        ret = __atomic_##OPB##_uniform_##TB##_global(ptr, oneval, (MASKTYPE)__mask); \
        memory_barrier();                                               \
    }                                                                   \
    return ret;                                                         \
 }                                                                       \
-static inline uniform TA atomic_##OPA##_global(uniform reference TA ref, \
+static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
                                               uniform TA value) {      \
    memory_barrier();                                                   \
-    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ref, value, (MASKTYPE)__mask); \
+    uniform TA ret = __atomic_##OPB##_uniform_##TB##_global(ptr, value, (MASKTYPE)__mask); \
    memory_barrier();                                                   \
    return ret;                                                         \
 }
@@ -717,16 +721,16 @@ DEFINE_ATOMIC_OP(double,double,swap,swap,int32)
 #define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE)                           \
 static inline TA atomic_compare_exchange_global(                           \
-         uniform reference TA ref, TA oldval, TA newval) {                 \
+         uniform TA * uniform ptr, TA oldval, TA newval) {                 \
    memory_barrier();                                                      \
-    TA ret = __atomic_compare_exchange_##TB##_global(ref, oldval, newval, (MASKTYPE)__mask); \
+    TA ret = __atomic_compare_exchange_##TB##_global(ptr, oldval, newval, (MASKTYPE)__mask); \
    memory_barrier();                                                      \
    return ret;                                                            \
 } \
 static inline uniform TA atomic_compare_exchange_global(               \
-         uniform reference TA ref, uniform TA oldval, uniform TA newval) {                 \
+         uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) {                 \
    memory_barrier();                                                   \
-    uniform TA ret = __atomic_compare_exchange_uniform_##TB##_global(ref, oldval, newval, (MASKTYPE)__mask); \
+    uniform TA ret = __atomic_compare_exchange_uniform_##TB##_global(ptr, oldval, newval, (MASKTYPE)__mask); \
    memory_barrier();                                                   \
    return ret;                                                         \
 }
@@ -1162,22 +1166,22 @@ static inline uniform float ldexp(uniform float x, uniform int n) {
    return floatbits(ix);
 }
-static inline float frexp(float x, reference int pw2) {
+static inline float frexp(float x, int * uniform pw2) {
    unsigned int ex = 0x7F800000u;              // exponent mask
    unsigned int ix = intbits(x);
    ex &= ix;
    ix &= ~0x7F800000u;  // clear exponent
-    pw2 = (int)(ex >> 23) - 126; // compute exponent
+    *pw2 = (int)(ex >> 23) - 126; // compute exponent
    ix |= 0x3F000000u;         // insert exponent +1 in x
    return floatbits(ix);
 }
-static inline uniform float frexp(uniform float x, reference uniform int pw2) {
+static inline uniform float frexp(uniform float x, uniform int * uniform pw2) {
    uniform unsigned int ex = 0x7F800000u;              // exponent mask
    uniform unsigned int ix = intbits(x);
    ex &= ix;
    ix &= ~0x7F800000u;  // clear exponent
-    pw2 = (uniform int)(ex >> 23) - 126; // compute exponent
+    *pw2 = (uniform int)(ex >> 23) - 126; // compute exponent
    ix |= 0x3F000000u;         // insert exponent +1 in x
    return floatbits(ix);
 }
@@ -1441,7 +1445,8 @@ static inline uniform float cos(uniform float x_full) {
 }
-static inline void sincos(float x_full, reference float sin_result, reference float cos_result) {
+static inline void sincos(float x_full, float * uniform sin_result, 
                          float * uniform cos_result) {
    if (__math_lib == __math_lib_svml) {
        __svml_sincos(x_full, sin_result, cos_result);
    }
@@ -1451,9 +1456,9 @@ static inline void sincos(float x_full, reference float sin_result, reference fl
            if ((mask & (1 << i)) == 0)
                continue;
            uniform float s, c;
-            __stdlib_sincosf(extract(x_full, i), s, c);
+            __stdlib_sincosf(extract(x_full, i), &s, &c);
-            sin_result = insert(sin_result, i, s);
+            *sin_result = insert(*sin_result, i, s);
-            cos_result = insert(cos_result, i, c);
+            *cos_result = insert(*cos_result, i, c);
        }
    }
    else if (__math_lib == __math_lib_ispc || 
@@ -1503,17 +1508,17 @@ static inline void sincos(float x_full, reference float sin_result, reference fl
        sin_formula *= x;
-        sin_result = sin_usecos ? cos_formula : sin_formula;
+        *sin_result = sin_usecos ? cos_formula : sin_formula;
-        cos_result = cos_usecos ? cos_formula : sin_formula;
+        *cos_result = cos_usecos ? cos_formula : sin_formula;
-        sin_result = sin_flipsign ? -sin_result : sin_result;
+        *sin_result = sin_flipsign ? -*sin_result : *sin_result;
-        cos_result = cos_flipsign ? -cos_result : cos_result;
+        *cos_result = cos_flipsign ? -*cos_result : *cos_result;
    }
 }
-static inline void sincos(uniform float x_full, reference uniform float sin_result,
+static inline void sincos(uniform float x_full, uniform float * uniform sin_result,
-                          reference uniform float cos_result) {
+                          uniform float * uniform cos_result) {
    if (__math_lib == __math_lib_system ||
        __math_lib == __math_lib_svml) {
        __stdlib_sincosf(x_full, sin_result, cos_result);
@@ -1565,11 +1570,11 @@ static inline void sincos(uniform float x_full, reference uniform float sin_resu
        sin_formula *= x;
-        sin_result = sin_usecos ? cos_formula : sin_formula;
+        *sin_result = sin_usecos ? cos_formula : sin_formula;
-        cos_result = cos_usecos ? cos_formula : sin_formula;
+        *cos_result = cos_usecos ? cos_formula : sin_formula;
-        sin_result = sin_flipsign ? -sin_result : sin_result;
+        *sin_result = sin_flipsign ? -*sin_result : *sin_result;
-        cos_result = cos_flipsign ? -cos_result : cos_result;
+        *cos_result = cos_flipsign ? -*cos_result : *cos_result;
    }
 }
@@ -2038,7 +2043,8 @@ static inline uniform float exp(uniform float x_full) {
 // Range reduction for logarithms takes log(x) -> log(2^n * y) -> n
 // * log(2) + log(y) where y is the reduced range (usually in [1/2,
 // 1)).
-static inline void __range_reduce_log(float input, reference float reduced, reference int exponent) {
+static inline void __range_reduce_log(float input, float * uniform reduced, 
                                      int * uniform exponent) {
    int int_version = intbits(input);
    // single precision = SEEE EEEE EMMM MMMM MMMM MMMM MMMM MMMM
    // exponent mask    = 0111 1111 1000 0000 0000 0000 0000 0000
@@ -2057,28 +2063,28 @@ static inline void __range_reduce_log(float input, reference float reduced, refe
    int biased_exponent = int_version >> 23; // This number is [0, 255] but it means [-127, 128]
    int offset_exponent = biased_exponent + 1; // Treat the number as if it were 2^{e+1} * (1.m)/2
-    exponent = offset_exponent - 127; // get the real value
+    *exponent = offset_exponent - 127; // get the real value
    // Blend the offset_exponent with the original input (do this in
    // int for now, until I decide if float can have & and &not)
    int blended = (int_version & nonexponent_mask) | (exponent_neg1);
-    reduced = floatbits(blended);
+    *reduced = floatbits(blended);
 }
-static inline void __range_reduce_log(uniform float input, reference uniform float reduced, 
+static inline void __range_reduce_log(uniform float input, uniform float * uniform reduced, 
-                                      reference uniform int exponent) {
+                                      uniform int * uniform exponent) {
    uniform int int_version = intbits(input);
    static const uniform int nonexponent_mask = 0x807FFFFF;
    static const uniform int exponent_neg1 = (126 << 23);
    uniform int biased_exponent = int_version >> 23;
    uniform int offset_exponent = biased_exponent + 1;
-    exponent = offset_exponent - 127; // get the real value
+    *exponent = offset_exponent - 127; // get the real value
    uniform int blended = (int_version & nonexponent_mask) | (exponent_neg1);
-    reduced = floatbits(blended);
+    *reduced = floatbits(blended);
 }
@@ -2099,7 +2105,7 @@ static inline float log(float x_full) {
    }
    else if (__math_lib == __math_lib_ispc_fast) {
        int e;
-        x_full = frexp(x_full, e);
+        x_full = frexp(x_full, &e);
        int x_smaller_SQRTHF = (0.707106781186547524f > x_full) ? 0xffffffff : 0;
        e += x_smaller_SQRTHF;
@@ -2139,7 +2145,7 @@ static inline float log(float x_full) {
        const float one = 1.0;
        float patched = exceptional ? one : x_full;
-        __range_reduce_log(patched, reduced, exponent);
+        __range_reduce_log(patched, &reduced, &exponent);
        const float ln2 = 0.693147182464599609375;
@@ -2179,7 +2185,7 @@ static inline uniform float log(uniform float x_full) {
    }
    else if (__math_lib == __math_lib_ispc_fast) {
        uniform int e;
-        x_full = frexp(x_full, e);
+        x_full = frexp(x_full, &e);
        uniform int x_smaller_SQRTHF = (0.707106781186547524f > x_full) ? 0xffffffff : 0;
        e += x_smaller_SQRTHF;
@@ -2219,7 +2225,7 @@ static inline uniform float log(uniform float x_full) {
        const uniform float one = 1.0;
        uniform float patched = exceptional ? one : x_full;
-        __range_reduce_log(patched, reduced, exponent);
+        __range_reduce_log(patched, &reduced, &exponent);
        const uniform float ln2 = 0.693147182464599609375;
@@ -2315,22 +2321,22 @@ static inline uniform double ldexp(uniform double x, uniform int n) {
    return doublebits(ix);
 }
-static inline double frexp(double x, reference int pw2) {
+static inline double frexp(double x, int * uniform pw2) {
    unsigned int64 ex = 0x7ff0000000000000;              // exponent mask
    unsigned int64 ix = intbits(x);
    ex &= ix;
    ix &= ~0x7ff0000000000000;  // clear exponent
-    pw2 = (int)(ex >> 52) - 1022; // compute exponent
+    *pw2 = (int)(ex >> 52) - 1022; // compute exponent
    ix |= 0x3fe0000000000000;         // insert exponent +1 in x
    return doublebits(ix);
 }
-static inline uniform double frexp(uniform double x, reference uniform int pw2) {
+static inline uniform double frexp(uniform double x, uniform int * uniform pw2) {
    uniform unsigned int64 ex = 0x7ff0000000000000;              // exponent mask
    uniform unsigned int64 ix = intbits(x);
    ex &= ix;
    ix &= ~0x7ff0000000000000;  // clear exponent
-    pw2 = (int)(ex >> 52) - 1022; // compute exponent
+    *pw2 = (int)(ex >> 52) - 1022; // compute exponent
    ix |= 0x3fe0000000000000;         // insert exponent +1 in x
    return doublebits(ix);
 }
@@ -2381,13 +2387,13 @@ static inline uniform double cos(uniform double x) {
        return __stdlib_cos(x);
 }
-static inline void sincos(double x, reference double sin_result,
+static inline void sincos(double x, double * uniform sin_result,
-                          reference double cos_result) {
+                          double * uniform cos_result) {
    if (__math_lib == __math_lib_ispc_fast) {
        float sr, cr;
-        sincos((float)x, sr, cr);
+        sincos((float)x, &sr, &cr);
-        sin_result = sr;
+        *sin_result = sr;
-        cos_result = cr;
+        *cos_result = cr;
    }
    else {
        uniform int mask = lanemask();
@@ -2395,20 +2401,20 @@ static inline void sincos(double x, reference double sin_result,
            uniform double sr, cr;
            if ((mask & (1 << i)) == 0)
                continue;
-            __stdlib_sincos(extract(x, i), sr, cr);
+            __stdlib_sincos(extract(x, i), &sr, &cr);
-            sin_result = insert(sin_result, i, sr);
+            *sin_result = insert(*sin_result, i, sr);
-            cos_result = insert(cos_result, i, cr);
+            *cos_result = insert(*cos_result, i, cr);
        }
    }
 }
-static inline void sincos(uniform double x, reference uniform double sin_result,
+static inline void sincos(uniform double x, uniform double * uniform sin_result,
-                          reference uniform double cos_result) {
+                          uniform double * uniform cos_result) {
    if (__math_lib == __math_lib_ispc_fast) {
        uniform float sr, cr;
-        sincos((uniform float)x, sr, cr);
+        sincos((uniform float)x, &sr, &cr);
-        sin_result = sr;
+        *sin_result = sr;
-        cos_result = cr;
+        *cos_result = cr;
    }
    else
        __stdlib_sincos(x, sin_result, cos_result);
@@ -2883,63 +2889,64 @@ struct RNGState {
    unsigned int z1, z2, z3, z4;
 };
-static inline unsigned int random(reference RNGState state)
+static inline unsigned int random(RNGState * uniform state)
 {
    unsigned int b;
-    b  = ((state.z1 << 6) ^ state.z1) >> 13;
+    // FIXME: state->z1, etc..
-    state.z1 = ((state.z1 & 4294967294U) << 18) ^ b;
+    b  = (((*state).z1 << 6) ^ (*state).z1) >> 13;
-    b  = ((state.z2 << 2) ^ state.z2) >> 27; 
+    (*state).z1 = (((*state).z1 & 4294967294U) << 18) ^ b;
-    state.z2 = ((state.z2 & 4294967288U) << 2) ^ b;
+    b  = (((*state).z2 << 2) ^ (*state).z2) >> 27; 
-    b  = ((state.z3 << 13) ^ state.z3) >> 21;
+    (*state).z2 = (((*state).z2 & 4294967288U) << 2) ^ b;
-    state.z3 = ((state.z3 & 4294967280U) << 7) ^ b;
+    b  = (((*state).z3 << 13) ^ (*state).z3) >> 21;
-    b  = ((state.z4 << 3) ^ state.z4) >> 12;
+    (*state).z3 = (((*state).z3 & 4294967280U) << 7) ^ b;
-    state.z4 = ((state.z4 & 4294967168U) << 13) ^ b;
+    b  = (((*state).z4 << 3) ^ (*state).z4) >> 12;
-    return (state.z1 ^ state.z2 ^ state.z3 ^ state.z4);
+    (*state).z4 = (((*state).z4 & 4294967168U) << 13) ^ b;
    return ((*state).z1 ^ (*state).z2 ^ (*state).z3 ^ (*state).z4);
 }
-static inline float frandom(reference RNGState state)
+static inline float frandom(RNGState * uniform state)
 {
    unsigned int irand = random(state);
    irand &= (1<<23)-1;
    return floatbits(0x3F800000 | irand)-1.0f;
 }
-static inline uniform unsigned int __seed4(reference RNGState state, 
+static inline uniform unsigned int __seed4(RNGState * uniform state, 
                                           uniform int start,
                                           uniform unsigned int seed) {
    uniform unsigned int c1 = 0xf0f0f0f0;
    uniform unsigned int c2 = 0x0f0f0f0f;
-    state.z1 = insert(state.z1, start + 0, seed);
+    (*state).z1 = insert((*state).z1, start + 0, seed);
-    state.z1 = insert(state.z1, start + 1, seed ^ c1);
+    (*state).z1 = insert((*state).z1, start + 1, seed ^ c1);
-    state.z1 = insert(state.z1, start + 2, (seed << 3) ^ c1);
+    (*state).z1 = insert((*state).z1, start + 2, (seed << 3) ^ c1);
-    state.z1 = insert(state.z1, start + 3, (seed << 2) ^ c2);
+    (*state).z1 = insert((*state).z1, start + 3, (seed << 2) ^ c2);
    seed += 131;
-    state.z2 = insert(state.z2, start + 0, seed);
+    (*state).z2 = insert((*state).z2, start + 0, seed);
-    state.z2 = insert(state.z2, start + 1, seed ^ c1);
+    (*state).z2 = insert((*state).z2, start + 1, seed ^ c1);
-    state.z2 = insert(state.z2, start + 2, (seed << 3) ^ c1);
+    (*state).z2 = insert((*state).z2, start + 2, (seed << 3) ^ c1);
-    state.z2 = insert(state.z2, start + 3, (seed << 2) ^ c2);
+    (*state).z2 = insert((*state).z2, start + 3, (seed << 2) ^ c2);
-    seed ^= extract(state.z2, 2);
+    seed ^= extract((*state).z2, 2);
-    state.z3 = insert(state.z3, start + 0, seed);
+    (*state).z3 = insert((*state).z3, start + 0, seed);
-    state.z3 = insert(state.z3, start + 1, seed ^ c1);
+    (*state).z3 = insert((*state).z3, start + 1, seed ^ c1);
-    state.z3 = insert(state.z3, start + 2, (seed << 3) ^ c1);
+    (*state).z3 = insert((*state).z3, start + 2, (seed << 3) ^ c1);
-    state.z3 = insert(state.z3, start + 3, (seed << 2) ^ c2);
+    (*state).z3 = insert((*state).z3, start + 3, (seed << 2) ^ c2);
    seed <<= 4;
    seed += 3;
-    seed ^= extract(state.z1, 3);
+    seed ^= extract((*state).z1, 3);
-    state.z4 = insert(state.z4, start + 0, seed);
+    (*state).z4 = insert((*state).z4, start + 0, seed);
-    state.z4 = insert(state.z4, start + 1, seed ^ c1);
+    (*state).z4 = insert((*state).z4, start + 1, seed ^ c1);
-    state.z4 = insert(state.z4, start + 2, (seed << 3) ^ c1);
+    (*state).z4 = insert((*state).z4, start + 2, (seed << 3) ^ c1);
-    state.z4 = insert(state.z4, start + 3, (seed << 2) ^ c2);
+    (*state).z4 = insert((*state).z4, start + 3, (seed << 2) ^ c2);
    return seed;
 }
-static inline void seed_rng(reference uniform RNGState state, uniform unsigned int seed) {
+static inline void seed_rng(uniform RNGState * uniform state, uniform unsigned int seed) {
    seed = __seed4(state, 0, seed);
    if (programCount == 8)
        __seed4(state, 4, seed ^ 0xbeeff00d);
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -131,7 +131,11 @@ lPossiblyResolveFunctionOverloads(Expr *expr, const Type *type) {
        // which in turn may represent an overloaded function.  So we need
        // to try to resolve the overload based on the type of the symbol
        // we're initializing here.
-        if (fse->ResolveOverloads(funcType->GetArgumentTypes()) == false)
+        std::vector<const Type *> paramTypes;
        for (int i = 0; i < funcType->GetNumParameters(); ++i)
            paramTypes.push_back(funcType->GetParameterType(i));
        if (fse->ResolveOverloads(paramTypes) == false)
            return false;
    }
    return true;
@@ -151,14 +155,9 @@ lPossiblyResolveFunctionOverloads(Expr *expr, const Type *type) {
 static void
 lInitSymbol(llvm::Value *lvalue, const char *symName, const Type *symType,
            Expr *initExpr, FunctionEmitContext *ctx, SourcePos pos) {
-    if (initExpr == NULL) {
+    if (initExpr == NULL)
-        // Initialize things without initializers to the undefined value.
+        // leave it uninitialized
        // To auto-initialize everything to zero, replace 'UndefValue' with
        // 'NullValue' in the below
        LLVM_TYPE_CONST llvm::Type *ltype = symType->LLVMType(g->ctx);
        ctx->StoreInst(llvm::UndefValue::get(ltype), lvalue);
        return;
    }
    // If the initializer is a straight up expression that isn't an
    // ExprList, then we'll see if we can type convert it to the type of
@@ -239,7 +238,14 @@ lInitSymbol(llvm::Value *lvalue, const char *symName, const Type *symType,
            // Initialize each element with the corresponding value from
            // the ExprList
            for (int i = 0; i < nInits; ++i) {
-                llvm::Value *ep = ctx->GetElementPtrInst(lvalue, 0, i, "element");
+                llvm::Value *ep;
                if (dynamic_cast<const StructType *>(symType) != NULL)
                    ep = ctx->AddElementOffset(lvalue, i, NULL, "element");
                else
                    ep = ctx->GetElementPtrInst(lvalue, LLVMInt32(0), LLVMInt32(i), 
                                                PointerType::GetUniform(collectionType->GetElementType(i)), 
                                                "gep");
                lInitSymbol(ep, symName, collectionType->GetElementType(i), 
                            exprList->exprs[i], ctx, pos);
            }
@@ -359,9 +365,11 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
        else {
            // For non-static variables, allocate storage on the stack
            sym->storagePtr = ctx->AllocaInst(llvmType, sym->name.c_str());
            // Tell the FunctionEmitContext about the variable; must do
            // this before the initializer stuff.
            ctx->EmitVariableDebugInfo(sym);
            // And then get it initialized...
            sym->parentFunction = ctx->GetFunction();
            lInitSymbol(sym->storagePtr, sym->name.c_str(), sym->type, 
@@ -693,16 +701,22 @@ lSafeToRunWithAllLanesOff(Expr *expr) {
        // If we can determine at compile time the size of the array/vector
        // and if the indices are compile-time constants, then we may be
        // able to safely run this under a predicated if statement..
-        if (ie->arrayOrVector == NULL)
+        if (ie->baseExpr == NULL)
            return false;
-        const Type *type = ie->arrayOrVector->GetType();
+        const Type *type = ie->baseExpr->GetType();
        ConstExpr *ce = dynamic_cast<ConstExpr *>(ie->index);
        if (type == NULL || ce == NULL)
            return false;
        if (dynamic_cast<const ReferenceType *>(type) != NULL)
            type = type->GetReferenceTarget();
        const PointerType *pointerType = 
            dynamic_cast<const PointerType *>(type);
        if (pointerType != NULL)
            // pointer[offset] -> can't be sure
            return false;
        const SequentialType *seqType = 
            dynamic_cast<const SequentialType *>(type);
        assert(seqType != NULL);
@@ -740,6 +754,14 @@ lSafeToRunWithAllLanesOff(Expr *expr) {
    if ((dre = dynamic_cast<DereferenceExpr *>(expr)) != NULL)
        return lSafeToRunWithAllLanesOff(dre->expr);
    SizeOfExpr *soe;
    if ((soe = dynamic_cast<SizeOfExpr *>(expr)) != NULL)
        return lSafeToRunWithAllLanesOff(soe->expr);
    AddressOfExpr *aoe;
    if ((aoe = dynamic_cast<AddressOfExpr *>(expr)) != NULL)
        return lSafeToRunWithAllLanesOff(aoe->expr);
    if (dynamic_cast<SymbolExpr *>(expr) != NULL ||
        dynamic_cast<FunctionSymbolExpr *>(expr) != NULL ||
        dynamic_cast<SyncExpr *>(expr) != NULL ||
@@ -1822,7 +1844,7 @@ PrintStmt::EmitCode(FunctionEmitContext *ctx) const {
                if (!ptr)
                    return;
-                llvm::Value *arrayPtr = ctx->GetElementPtrInst(argPtrArray, 0, i);
+                llvm::Value *arrayPtr = ctx->AddElementOffset(argPtrArray, i, NULL);
                ctx->StoreInst(ptr, arrayPtr);
            }
        }
@@ -1830,7 +1852,7 @@ PrintStmt::EmitCode(FunctionEmitContext *ctx) const {
            llvm::Value *ptr = lProcessPrintArg(values, ctx, argTypes);
            if (!ptr)
                return;
-            llvm::Value *arrayPtr = ctx->GetElementPtrInst(argPtrArray, 0, 0);
+            llvm::Value *arrayPtr = ctx->AddElementOffset(argPtrArray, 0, NULL);
            ctx->StoreInst(ptr, arrayPtr);
        }
    }
@@ -1846,7 +1868,7 @@ PrintStmt::EmitCode(FunctionEmitContext *ctx) const {
    args[2] = LLVMInt32(g->target.vectorWidth);
    args[3] = ctx->LaneMask(mask);
    std::vector<llvm::Value *> argVec(&args[0], &args[5]);
-    ctx->CallInst(printFunc, AtomicType::Void, argVec, "");
+    ctx->CallInst(printFunc, NULL, argVec, "");
 }
@@ -1926,7 +1948,7 @@ AssertStmt::EmitCode(FunctionEmitContext *ctx) const {
    args.push_back(ctx->GetStringPtr(errorString));
    args.push_back(expr->GetValue(ctx));
    args.push_back(ctx->GetFullMask());
-    ctx->CallInst(assertFunc, AtomicType::Void, args, "");
+    ctx->CallInst(assertFunc, NULL, args, "");
 #ifndef ISPC_IS_WINDOWS
    free(errorString);
--- a/test_static.cpp
+++ b/test_static.cpp
@@ -101,7 +101,8 @@ int main(int argc, char *argv[]) {
    assert(w <= 16);
    float returned_result[16];
-    memset(returned_result, 0, 16*sizeof(float));
+    for (int i = 0; i < 16; ++i)
        returned_result[i] = -1e20;
    float vfloat[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
    double vdouble[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16};
    int vint[16] = { 2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32 };
--- a/tests/aossoa-1.ispc
+++ b/tests/aossoa-1.ispc
@@ -11,7 +11,7 @@ export void f_v(uniform float RET[]) {
        a[i] = i;
    float x=-1, y=-1, z=-1;
-    aos_to_soa3(a, 0, x, y, z);
+    aos_to_soa3(a, 0, &x, &y, &z);
    int errs = 0;
    if (x != width * programIndex) ++errs;
--- a/tests/aossoa-2.ispc
+++ b/tests/aossoa-2.ispc
@@ -11,7 +11,7 @@ export void f_v(uniform float RET[]) {
        a[i] = i;
    float x=-1, y=-1, z=-1, w=-1;
-    aos_to_soa4(a, 0, x, y, z, w);
+    aos_to_soa4(a, 0, &x, &y, &z, &w);
    int errs = 0;
    if (x != width * programIndex) ++errs;
--- a/tests/aossoa-5.ispc
+++ b/tests/aossoa-5.ispc
@@ -11,7 +11,7 @@ export void f_v(uniform float RET[]) {
        a[i] = i;
    int x=-1, y=-1, z=-1;
-    aos_to_soa3(a, 0, x, y, z);
+    aos_to_soa3(a, 0, &x, &y, &z);
    int errs = 0;
    if (x != width * programIndex) ++errs;
--- a/tests/aossoa-6.ispc
+++ b/tests/aossoa-6.ispc
@@ -11,7 +11,7 @@ export void f_v(uniform float RET[]) {
        a[i] = i;
    int x=-1, y=-1, z=-1, w=-1;
-    aos_to_soa4(a, 0, x, y, z, w);
+    aos_to_soa4(a, 0, &x, &y, &z, &w);
    int errs = 0;
    if (x != width * programIndex) ++errs;
--- a/tests/array-assignment-varying-control.ispc
+++ b/tests/array-assignment-varying-control.ispc
@@ -5,7 +5,7 @@ export uniform int width() { return programCount; }
 struct Foo { float f; };
-void f(reference uniform Foo foo[], float a) {
+void f(uniform Foo foo[], float a) {
    ++foo[a].f;
 }
--- a/tests/atomics-1.ispc
+++ b/tests/atomics-1.ispc
@@ -6,7 +6,7 @@ uniform unsigned int32 s = 0;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
    float delta = 1;
-    float b = atomic_add_global(s, delta);
+    float b = atomic_add_global(&s, delta);
    RET[programIndex] = reduce_add(b);
 }
--- a/tests/atomics-10.ispc
+++ b/tests/atomics-10.ispc
@@ -8,7 +8,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
    float b = 0;
    float delta = 1;
    if (programIndex < 2)
-        b = atomic_add_global(s, delta);
+        b = atomic_add_global(&s, delta);
    RET[programIndex] = s;
 }
--- a/tests/atomics-11.ispc
+++ b/tests/atomics-11.ispc
@@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
    float b = 0;
    if (programIndex & 1)
-        b = atomic_add_global(s, programIndex);
+        b = atomic_add_global(&s, programIndex);
    RET[programIndex] = s;
 }
--- a/tests/atomics-12.ispc
+++ b/tests/atomics-12.ispc
@@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
    float b = 0;
    if (programIndex & 1)
-        b = atomic_or_global(s, (1 << programIndex));
+        b = atomic_or_global(&s, (1 << programIndex));
    RET[programIndex] = s;
 }
--- a/tests/atomics-13.ispc
+++ b/tests/atomics-13.ispc
@@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
    float b = 0;
    if (programIndex & 1)
-        b = atomic_or_global(s, (1 << programIndex));
+        b = atomic_or_global(&s, (1 << programIndex));
    RET[programIndex] = popcnt(reduce_max((int32)b));
 }
--- a/tests/atomics-14.ispc
+++ b/tests/atomics-14.ispc
@@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
    float b = 0;
    if (programIndex & 1)
-        b = atomic_or_global(s, (1 << programIndex));
+        b = atomic_or_global(&s, (1 << programIndex));
    RET[programIndex] = (s>>20);
 }
--- a/tests/atomics-2.ispc
+++ b/tests/atomics-2.ispc
@@ -6,7 +6,7 @@ uniform int64 s = 0;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
    float delta = 1;
-    float b = atomic_add_global(s, delta);
+    float b = atomic_add_global(&s, delta);
    RET[programIndex] = reduce_add(b);
 }
--- a/tests/atomics-3.ispc
+++ b/tests/atomics-3.ispc
@@ -6,7 +6,7 @@ uniform int32 s = 0xff;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
    int32 bits = 0xfffffff0;
-    float b = atomic_xor_global(s, bits);
+    float b = atomic_xor_global(&s, bits);
    RET[programIndex] = s;
 }
--- a/tests/atomics-4.ispc
+++ b/tests/atomics-4.ispc
@@ -5,7 +5,7 @@ uniform int32 s = 0;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
-    float b = atomic_or_global(s, (1<<programIndex));
+    float b = atomic_or_global(&s, (1<<programIndex));
    RET[programIndex] = s;
 }
--- a/tests/atomics-5.ispc
+++ b/tests/atomics-5.ispc
@@ -5,7 +5,7 @@ uniform int32 s = 0xbeef;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
-    float b = atomic_swap_global(s, programIndex);
+    float b = atomic_swap_global(&s, programIndex);
    RET[programIndex] = reduce_max(b);
 }
--- a/tests/atomics-6.ispc
+++ b/tests/atomics-6.ispc
@@ -5,7 +5,7 @@ uniform int32 s = 2;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
-    float b = atomic_compare_exchange_global(s, programIndex, a*1000);
+    float b = atomic_compare_exchange_global(&s, programIndex, a*1000);
    RET[programIndex] = s;
 }
--- a/tests/atomics-7.ispc
+++ b/tests/atomics-7.ispc
@@ -5,7 +5,7 @@ uniform int32 s = 0;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    int32 a = aFOO[programIndex]; 
-    float b = atomic_min_global(s, a);
+    float b = atomic_min_global(&s, a);
    RET[programIndex] = reduce_min(b);
 }
--- a/tests/atomics-8.ispc
+++ b/tests/atomics-8.ispc
@@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
    int32 a = aFOO[programIndex]; 
    int32 b = 0;
    if (programIndex & 1)
-        b = atomic_max_global(s, a);
+        b = atomic_max_global(&s, a);
    RET[programIndex] = s;
 }
--- a/tests/atomics-9.ispc
+++ b/tests/atomics-9.ispc
@@ -8,7 +8,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
    float b = 0;
    int32 delta = 1;
    if (programIndex < 2)
-        b = atomic_add_global(s, delta);
+        b = atomic_add_global(&s, delta);
    RET[programIndex] = reduce_add(b);
 }
--- a/tests/atomics-uniform-1.ispc
+++ b/tests/atomics-uniform-1.ispc
@@ -5,7 +5,7 @@ uniform unsigned int32 s = 10;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
-    uniform unsigned int32 b = atomic_add_global(s, 1);
+    uniform unsigned int32 b = atomic_add_global(&s, 1);
    RET[programIndex] = s;
 }
--- a/tests/atomics-uniform-2.ispc
+++ b/tests/atomics-uniform-2.ispc
@@ -5,7 +5,7 @@ uniform unsigned int32 s = 0b1010;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
-    uniform unsigned int32 b = atomic_or_global(s, 1);
+    uniform unsigned int32 b = atomic_or_global(&s, 1);
    RET[programIndex] = s;
 }
--- a/tests/atomics-uniform-3.ispc
+++ b/tests/atomics-uniform-3.ispc
@@ -5,7 +5,7 @@ uniform unsigned int32 s = 0b1010;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
-    uniform unsigned int32 b = atomic_or_global(s, 1);
+    uniform unsigned int32 b = atomic_or_global(&s, 1);
    RET[programIndex] = b;
 }
--- a/tests/atomics-uniform-4.ispc
+++ b/tests/atomics-uniform-4.ispc
@@ -5,7 +5,7 @@ uniform unsigned int32 s = 0xffff;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
-    uniform unsigned int32 b = atomic_min_global(s, 1);
+    uniform unsigned int32 b = atomic_min_global(&s, 1);
    RET[programIndex] = b;
 }
--- a/tests/atomics-uniform-5.ispc
+++ b/tests/atomics-uniform-5.ispc
@@ -5,7 +5,7 @@ uniform unsigned int32 s = 0xffff;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
-    uniform unsigned int32 b = atomic_min_global(s, 1);
+    uniform unsigned int32 b = atomic_min_global(&s, 1);
    RET[programIndex] = s;
 }
--- a/tests/atomics-uniform-6.ispc
+++ b/tests/atomics-uniform-6.ispc
@@ -5,7 +5,7 @@ uniform float s = 100.;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
-    uniform float b = atomic_swap_global(s, 1.);
+    uniform float b = atomic_swap_global(&s, 1.);
    RET[programIndex] = s;
 }
--- a/tests/atomics-uniform-7.ispc
+++ b/tests/atomics-uniform-7.ispc
@@ -5,7 +5,7 @@ uniform float s = 100.;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
-    uniform float b = atomic_swap_global(s, 1.);
+    uniform float b = atomic_swap_global(&s, 1.);
    RET[programIndex] = b;
 }
--- a/tests/atomics-uniform-8.ispc
+++ b/tests/atomics-uniform-8.ispc
@@ -5,7 +5,7 @@ uniform float s = 100.;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
-    uniform float b = atomic_compare_exchange_global(s, 1., -100.);
+    uniform float b = atomic_compare_exchange_global(&s, 1., -100.);
    RET[programIndex] = b;
 }
--- a/tests/atomics-uniform-9.ispc
+++ b/tests/atomics-uniform-9.ispc
@@ -5,7 +5,7 @@ uniform int64 s = 100.;
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex]; 
-    uniform int64 b = atomic_compare_exchange_global(s, 100, -100);
+    uniform int64 b = atomic_compare_exchange_global(&s, 100, -100);
    RET[programIndex] = s;
 }
--- a/tests/cfor-ref-5.ispc
+++ b/tests/cfor-ref-5.ispc
@@ -3,8 +3,8 @@ export uniform int width() { return programCount; }
-void foo(reference float a) {
+void foo(float * uniform a) {
-    a = 0;
+    *a = 0;
 }
 export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
@@ -13,7 +13,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
    uniform int i;
    cfor (i = 0; i < 10; ++i)
        x[i] = a*b;
-    foo(x[b]);
+    foo(&x[b]);
    RET[programIndex] = x[5] + x[9];
 }
--- a/tests/cfor-ref-6.ispc
+++ b/tests/cfor-ref-6.ispc
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }
-void foo(reference float a[10]) {
+void foo(float a[10]) {
    a[5] = 0;
 }
--- a/tests/cfor-ref-7.ispc
+++ b/tests/cfor-ref-7.ispc
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }
-void foo(reference float a[10]) {
+void foo(float a[10]) {
    a[5] = 0;
 }
--- a/tests/frexp-double-1.ispc
+++ b/tests/frexp-double-1.ispc
@@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
    if (programIndex & 1)
        a = -a;
    int exponent;
-    frexp(a, exponent);
+    frexp(a, &exponent);
    RET[programIndex] = exponent;
 }
--- a/tests/frexp-double.ispc
+++ b/tests/frexp-double.ispc
@@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
    if (programIndex & 1)
        a = -a;
    int exponent;
-    RET[programIndex] = frexp(a, exponent);
+    RET[programIndex] = frexp(a, &exponent);
 }
 export void result(uniform float RET[]) {
--- a/tests/frexp-float-1.ispc
+++ b/tests/frexp-float-1.ispc
@@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
    if (programIndex & 1)
        a = -a;
    int exponent;
-    frexp(a, exponent);
+    frexp(a, &exponent);
    RET[programIndex] = exponent;
 }
--- a/tests/frexp-float.ispc
+++ b/tests/frexp-float.ispc
@@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
    if (programIndex & 1)
        a = -a;
    int exponent;
-    RET[programIndex] = frexp(a, exponent);
+    RET[programIndex] = frexp(a, &exponent);
 }
 export void result(uniform float RET[]) {
--- a/tests/funcptr-null-3.ispc
+++ b/tests/funcptr-null-3.ispc
@@ -8,7 +8,7 @@ float foo(float a, float b) {
 }
 static float bar(float a, float b) {
-    return min(a, b);
+    return a < b ? a : b;
 }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
--- a/failing_tests/masked-scatter-vector.ispc
+++ b/failing_tests/masked-scatter-vector.ispc
@@ -14,10 +14,10 @@ export void f_fu(uniform float ret[], uniform float aa[], uniform float b) {
    varying int3 vv = array[a];
    ++vv.y;
    array[a] = vv;
 //CO    print("fin %\n", array[programIndex].y);
    ret[programIndex] = array[programIndex].y;
 }
 export void result(uniform float ret[]) {
-    ret[programIndex] = 100+programIndex;
+    ret[programIndex] = 101+programIndex;
    ret[0] = 100;
 }
--- a/tests/packed-load-1.ispc
+++ b/tests/packed-load-1.ispc
@@ -5,7 +5,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform unsigned int a[programCount];
    a[programIndex] = aFOO[programIndex];
    unsigned int aa;
-    packed_load_active(a, 0, aa);
+    packed_load_active(a, 0, &aa);
    RET[programIndex] = aa;
 }
--- a/tests/packed-load-2.ispc
+++ b/tests/packed-load-2.ispc
@@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
    int aa = 15;
    uniform int count = 0;
    if (programIndex < 2)
-        count += packed_load_active(a, 0, aa);
+        count += packed_load_active(a, 0, &aa);
    RET[programIndex] = aa;
 }
--- a/tests/packed-load-3.ispc
+++ b/tests/packed-load-3.ispc
@@ -7,7 +7,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
    int aa;
    uniform int count = 0;
    if (programIndex < 2)
-        count += packed_load_active(a, 0, aa);
+        count += packed_load_active(a, 0, &aa);
    RET[programIndex] = count;
 }
--- a/tests/packed-load-4.ispc
+++ b/tests/packed-load-4.ispc
@@ -8,7 +8,7 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
    int aa = 32;
    uniform int count = 0;
    if (programIndex < 2)
-        count += packed_load_active(a, 5, aa);
+        count += packed_load_active(a, 5, &aa);
    RET[programIndex] = aa;
 }
--- a/tests/packed-load-5.ispc
+++ b/tests/packed-load-5.ispc
@@ -8,9 +8,9 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
    int aa = 32;
    uniform int count = 0;
    if (programIndex & 1)
-        count += packed_load_active(a, 10, aa);
+        count += packed_load_active(a, 10, &aa);
    if (!(programIndex & 1))
-        count += packed_load_active(a, 10+count, aa);
+        count += packed_load_active(a, 10+count, &aa);
    RET[programIndex] = aa;
 }
--- a/tests/pass-varying-lvalue-to-ref.ispc
+++ b/tests/pass-varying-lvalue-to-ref.ispc
@@ -1,14 +1,14 @@
 export uniform int width() { return programCount; }
-void inc(reference float v) { ++v; }
+void inc(uniform float * varying v) { ++(*v); }
 export void f_fu(uniform float ret[], uniform float aa[], uniform float b) {
    uniform float foo[32];
    for (uniform int i = 0; i < 32; ++i)
        foo[i] = 10+i;
    int a = (int)aa[programIndex];
-    inc(foo[a]);
+    inc(&foo[a]);
    ret[programIndex] = foo[programIndex]-programIndex;
 }
--- a/tests/prefetch.ispc
+++ b/tests/prefetch.ispc
@@ -0,0 +1,17 @@
 export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    prefetch_l1(aFOO);
    prefetch_l2(aFOO);
    prefetch_l3(aFOO);
    prefetch_nt(aFOO);
    float a = aFOO[programIndex]; 
    float b = 0.; b = a; 
    RET[programIndex] = a+b; 
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 2 + 2*programIndex;
 }
--- a/tests/ptr-1.ispc
+++ b/tests/ptr-1.ispc
@@ -0,0 +1,13 @@
 export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int a = 1;
    uniform int * uniform b = &a;
    RET[programIndex] = *b;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 1;
 }
--- a/tests/ptr-10.ispc
+++ b/tests/ptr-10.ispc
@@ -0,0 +1,11 @@
 export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform float * uniform b = aFOO;
    RET[programIndex] = *b;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 1;
 }
--- a/tests/ptr-11.ispc
+++ b/tests/ptr-11.ispc
@@ -0,0 +1,16 @@
 export uniform int width() { return programCount; }
 void inc(int * uniform v) {
    ++*v;
 }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform float * uniform b = &aFOO[0];
    b = b + 3;
    RET[programIndex] = *b;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 4;
 }
--- a/tests/ptr-12.ispc
+++ b/tests/ptr-12.ispc
@@ -0,0 +1,17 @@
 export uniform int width() { return programCount; }
 void inc(int * uniform v) {
    ++*v;
 }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform float * uniform b = &aFOO[0];
    ++b;
    b++;
    RET[programIndex] = *b;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 3;
 }
--- a/tests/ptr-13.ispc
+++ b/tests/ptr-13.ispc
@@ -0,0 +1,17 @@
 export uniform int width() { return programCount; }
 void inc(int * uniform v) {
    ++*v;
 }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform float * uniform b = &aFOO[0];
    b += 3;
    b -= 1;
    RET[programIndex] = *b;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 3;
 }
--- a/tests/ptr-14.ispc
+++ b/tests/ptr-14.ispc
@@ -0,0 +1,15 @@
 export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float a = aFOO[programIndex];
    float * uniform pa = &a;
    int * uniform pb = (int *)pa;
    float *uniform pc = (float *)pb;
    *pc = programIndex;
    RET[programIndex] = *pc;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = programIndex;
 }
--- a/tests/ptr-15.ispc
+++ b/tests/ptr-15.ispc
@@ -0,0 +1,18 @@
 export uniform int width() { return programCount; }
 void foo(uniform float * uniform * ret) {
    uniform float *px = *ret;
    ++px;
    *ret = px;
 }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform float * uniform ptr = &aFOO[0];
    foo(&ptr);
    RET[programIndex] = *ptr;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 2;
 }
--- a/tests/ptr-16.ispc
+++ b/tests/ptr-16.ispc
@@ -0,0 +1,11 @@
 export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform float * uniform b = aFOO;
    RET[programIndex] = b[programIndex];
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 1+programIndex;
 }
--- a/tests/ptr-17.ispc
+++ b/tests/ptr-17.ispc
@@ -0,0 +1,16 @@
 export uniform int width() { return programCount; }
 void inc(int * uniform v) {
    ++*v;
 }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform float * uniform b = aFOO;
    b[programIndex] = programCount - programIndex;
    RET[programIndex] = aFOO[programIndex];
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = programCount - programIndex;
 }
--- a/tests/ptr-18.ispc
+++ b/tests/ptr-18.ispc
@@ -0,0 +1,12 @@
 export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform float * uniform b = aFOO;
    b += 10;
    RET[programIndex] = b[-5];
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 6;
 }
--- a/tests/ptr-19.ispc
+++ b/tests/ptr-19.ispc
@@ -0,0 +1,13 @@
 export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform float * uniform b = aFOO;
    b += 10;
    int8 index = -5;
    RET[programIndex] = b[index];
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 6;
 }
--- a/tests/ptr-2.ispc
+++ b/tests/ptr-2.ispc
@@ -0,0 +1,13 @@
 export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    int a = aFOO[programIndex];
    int * uniform b = &a;
    RET[programIndex] = *b;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 1+programIndex;
 }
--- a/tests/ptr-20.ispc
+++ b/tests/ptr-20.ispc
@@ -0,0 +1,13 @@
 export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform float * uniform b = aFOO;
    b += 10;
    uniform int8 index = -5;
    RET[programIndex] = b[index];
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 6;
 }
--- a/tests/ptr-21.ispc
+++ b/tests/ptr-21.ispc
@@ -0,0 +1,22 @@
 export uniform int width() { return programCount; }
 struct Foo {
    int a;
    uniform float b;
 };
 void update(Foo * uniform fp) {
    fp->a += 1;
    fp->b = 1;
 }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    Foo f = { aFOO[programIndex], 5 };
    update(&f);
    RET[programIndex] = f.b;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 1;
 }
--- a/tests/ptr-22.ispc
+++ b/tests/ptr-22.ispc
@@ -0,0 +1,23 @@
 export uniform int width() { return programCount; }
 struct Foo {
    int a;
    uniform float b;
 };
 void update(Foo * varying fp) {
    ++fp;
    fp->a -= 1;
    fp->b = 1;
 }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    Foo f[2] = { { 1234, 4321 }, { aFOO[programIndex], 5 } };
    update(f);
    RET[programIndex] = f[1].a;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = programIndex;
 }
--- a/tests/ptr-23.ispc
+++ b/tests/ptr-23.ispc
@@ -0,0 +1,21 @@
 export uniform int width() { return programCount; }
 struct Foo {
    int a;
    uniform float b;
 };
 void update(float<3> * uniform vp) {
    vp->x = 0;
 }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float<3> v = { 1, 2, 3 };
    update(&v);
    RET[programIndex] = v.x;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 0;
 }
--- a/tests/ptr-24.ispc
+++ b/tests/ptr-24.ispc
@@ -0,0 +1,24 @@
 export uniform int width() { return programCount; }
 void update(uniform float<2> * varying vp) {
    vp->y = 0;
 }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform float<2> v[programCount];
    for (uniform int i = 0; i < programCount; ++i) {
        v[i].x = 2*i;
        v[i].y = 2*i+1;
    }
    int index = aFOO[programIndex] - 1;
    update(&v[programIndex]);
 //CO    for (uniform int i = 0; i < programCount; ++i) 
 //CO        print("%: % %\n", i, v[i].x, v[i].y);
    RET[programIndex] = v[programIndex].x + v[programIndex].y;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 2*programIndex;
 }
--- a/tests/ptr-25.ispc
+++ b/tests/ptr-25.ispc
@@ -0,0 +1,24 @@
 export uniform int width() { return programCount; }
 void update(float<2> * varying vp) {
    vp->y = 0;
 }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    float<2> v[programCount];
    for (uniform int i = 0; i < programCount; ++i) {
        v[i].x = 2*i;
        v[i].y = 2*i+1;
    }
    int index = aFOO[programIndex] - 1;
    update(&v[programIndex]);
 //CO    for (uniform int i = 0; i < programCount; ++i) 
 //CO        print("%: % %\n", i, v[i].x, v[i].y);
    RET[programIndex] = v[programIndex].x;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 2*programIndex;
 }
--- a/tests/ptr-3.ispc
+++ b/tests/ptr-3.ispc
@@ -0,0 +1,14 @@
 export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    int a = aFOO[programIndex];
    int * uniform b = &a;
    *b = 2;
    RET[programIndex] = *b;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 2;
 }
--- a/tests/ptr-4.ispc
+++ b/tests/ptr-4.ispc
@@ -0,0 +1,14 @@
 export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    int a = aFOO[programIndex];
    int * uniform b = &a;
    ++*b;
    RET[programIndex] = *b;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 2+programIndex;
 }
--- a/tests/ptr-5.ispc
+++ b/tests/ptr-5.ispc
@@ -0,0 +1,14 @@
 export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    int a = aFOO[programIndex];
    int * uniform b = &a;
    (*b)++;
    RET[programIndex] = *b;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 2+programIndex;
 }
--- a/tests/ptr-6.ispc
+++ b/tests/ptr-6.ispc
@@ -0,0 +1,12 @@
 export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform float * uniform ap = &aFOO[0];
    RET[programIndex] = ap[programIndex];
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 1+programIndex;
 }
--- a/tests/ptr-7.ispc
+++ b/tests/ptr-7.ispc
@@ -0,0 +1,12 @@
 export uniform int width() { return programCount; }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform float * varying ap = &aFOO[programIndex];
    RET[programIndex] = *ap;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 1+programIndex;
 }
--- a/tests/ptr-8.ispc
+++ b/tests/ptr-8.ispc
@@ -0,0 +1,20 @@
 export uniform int width() { return programCount; }
 void inc(int * uniform v) {
    ++*v;
 }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    int a = aFOO[programIndex];
    int * uniform b = &a;
    if (a <= 2)
        inc(b);
    RET[programIndex] = a;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 1+programIndex;
    RET[0] = 2;
    RET[1] = 3;
 }
--- a/tests/ptr-9.ispc
+++ b/tests/ptr-9.ispc
@@ -0,0 +1,18 @@
 export uniform int width() { return programCount; }
 void inc(int * uniform v) {
    ++*v;
 }
 export void f_f(uniform float RET[], uniform float aFOO[]) {
    int a = aFOO[programIndex];
    int * uniform b = &a;
    void * uniform vp = b;
    int * uniform c = (int * uniform)vp;
    RET[programIndex] = *c;
 }
 export void result(uniform float RET[]) {
    RET[programIndex] = 1+programIndex;
 }
--- a/Show More
+++ b/Show More