Update release notes, doxygen version number

Update examples to use fpmath:fast and to enable intrinsics on Windows
Generalize FunctionEmitContext::PtrToIntInst and IntToPtrInst to
2011-07-01 05:12:57 +01:00 · 2011-06-30 13:17:14 -07:00 · 2011-06-29 12:38:12 +01:00 · 2011-06-29 12:26:44 +01:00 · 2011-06-29 09:32:31 +01:00 · 2011-06-29 07:59:43 +01:00
38 changed files with 881 additions and 348 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ depend
 ispc
 ispc_test
 objs
+docs/doxygen
--- a/4
+++ b/4
@@ -94,9 +94,7 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<

-$(STDLIB_SRC): stdlib.m4
-
-objs/stdlib-%.cpp: stdlib-%.ll
+objs/stdlib-%.cpp: stdlib-%.ll stdlib.m4 stdlib-sse.ll
 	@echo Creating C++ source from stdlib file $<
 	@m4 stdlib.m4 $< | ./bitcode2cpp.py $< > $@

--- a/ctx.cpp
+++ b/ctx.cpp
@@ -1109,7 +1109,7 @@ FunctionEmitContext::BitCastInst(llvm::Value *value, const llvm::Type *type,
 }


-llvm::Instruction *
+llvm::Value *
 FunctionEmitContext::PtrToIntInst(llvm::Value *value, const llvm::Type *type,
                                  const char *name) {
    if (value == NULL) {
@@ -1117,16 +1117,31 @@ FunctionEmitContext::PtrToIntInst(llvm::Value *value, const llvm::Type *type,
        return NULL;
    }

-    // TODO: we should probably handle the array case as in
-    // e.g. BitCastInst(), but we don't currently need that functionality
-    llvm::Instruction *inst = 
-        new llvm::PtrToIntInst(value, type, name ? name : "ptr2int", bblock);
-    AddDebugPos(inst);
-    return inst;
+    const llvm::Type *valType = value->getType();
+    const llvm::ArrayType *at = llvm::dyn_cast<const llvm::ArrayType>(valType);
+    if (at && llvm::isa<const llvm::PointerType>(at->getElementType())) {
+        // varying lvalue -> apply ptr to int to the individual pointers
+        assert((int)at->getNumElements() == g->target.vectorWidth);
+
+        llvm::Value *ret = 
+            llvm::UndefValue::get(llvm::ArrayType::get(type, g->target.vectorWidth));
+        for (int i = 0; i < g->target.vectorWidth; ++i) {
+            llvm::Value *elt = ExtractInst(value, i);
+            llvm::Value *p2i = PtrToIntInst(elt, type, name);
+            ret = InsertInst(ret, p2i, i);
+        }
+        return ret;
+    }
+    else {
+        llvm::Instruction *inst = 
+            new llvm::PtrToIntInst(value, type, name ? name : "ptr2int", bblock);
+        AddDebugPos(inst);
+        return inst;
+    }
 }


-llvm::Instruction *
+llvm::Value *
 FunctionEmitContext::IntToPtrInst(llvm::Value *value, const llvm::Type *type,
                                  const char *name) {
    if (value == NULL) {
@@ -1134,12 +1149,27 @@ FunctionEmitContext::IntToPtrInst(llvm::Value *value, const llvm::Type *type,
        return NULL;
    }

-    // TODO: we should probably handle the array case as in
-    // e.g. BitCastInst(), but we don't currently need that functionality
-    llvm::Instruction *inst = 
-        new llvm::IntToPtrInst(value, type, name ? name : "int2ptr", bblock);
-    AddDebugPos(inst);
-    return inst;
+    const llvm::Type *valType = value->getType();
+    const llvm::ArrayType *at = llvm::dyn_cast<const llvm::ArrayType>(valType);
+    if (at && llvm::isa<const llvm::PointerType>(at->getElementType())) {
+        // varying lvalue -> apply int to ptr to the individual pointers
+        assert((int)at->getNumElements() == g->target.vectorWidth);
+
+        llvm::Value *ret = 
+            llvm::UndefValue::get(llvm::ArrayType::get(type, g->target.vectorWidth));
+        for (int i = 0; i < g->target.vectorWidth; ++i) {
+            llvm::Value *elt = ExtractInst(value, i);
+            llvm::Value *i2p = IntToPtrInst(elt, type, name);
+            ret = InsertInst(ret, i2p, i);
+        }
+        return ret;
+    }
+    else {
+        llvm::Instruction *inst = 
+            new llvm::IntToPtrInst(value, type, name ? name : "int2ptr", bblock);
+        AddDebugPos(inst);
+        return inst;
+    }
 }


@@ -1359,10 +1389,10 @@ FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type,
        // If we're gathering structures, do an element-wise gather
        // recursively.
        llvm::Value *retValue = llvm::UndefValue::get(retType);
-        for (int i = 0; i < st->NumElements(); ++i) {
+        for (int i = 0; i < st->GetElementCount(); ++i) {
            llvm::Value *eltPtrs = GetElementPtrInst(lvalue, 0, i);
            // This in turn will be another gather
-            llvm::Value *eltValues = LoadInst(eltPtrs, st->GetMemberType(i), 
+            llvm::Value *eltValues = LoadInst(eltPtrs, st->GetElementType(i), 
                                              name);
            retValue = InsertInst(retValue, eltValues, i, "set_value");
        }
@@ -1482,6 +1512,16 @@ FunctionEmitContext::AllocaInst(const llvm::Type *llvmType, const char *name,
        // current basic block
        inst = new llvm::AllocaInst(llvmType, name ? name : "", bblock);

+    // If no alignment was specified but we have an array of a uniform
+    // type, then align it to 4 * the native vector width; it's not
+    // unlikely that this array will be loaded into varying variables with
+    // what will be aligned accesses if the uniform -> varying load is done
+    // in regular chunks.
+    const llvm::ArrayType *arrayType = llvm::dyn_cast<const llvm::ArrayType>(llvmType);
+    if (align == 0 && arrayType != NULL && 
+        !llvm::isa<const llvm::VectorType>(arrayType->getElementType()))
+        align = 4 * g->target.nativeVectorWidth;
+
    if (align != 0)
        inst->setAlignment(align);
    // Don't add debugging info to alloca instructions
@@ -1506,29 +1546,18 @@ FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue,

    assert(llvm::isa<const llvm::PointerType>(lvalue->getType()));
    
-    const StructType *structType = dynamic_cast<const StructType *>(rvalueType);
-    if (structType != NULL) {
-        // Assigning a structure
-        for (int i = 0; i < structType->NumElements(); ++i) {
+    const CollectionType *collectionType = 
+        dynamic_cast<const CollectionType *>(rvalueType);
+    if (collectionType != NULL) {
+        // Assigning a structure / array / vector. Handle each element
+        // individually with what turns into a recursive call to
+        // maskedStore()
+        for (int i = 0; i < collectionType->GetElementCount(); ++i) {
            llvm::Value *eltValue = ExtractInst(rvalue, i, "rvalue_member");
            llvm::Value *eltLValue = GetElementPtrInst(lvalue, 0, i, 
                                                       "struct_lvalue_ptr");
            StoreInst(eltValue, eltLValue, storeMask, 
-                      structType->GetMemberType(i));
-        }
-        return;
-    }
-
-    const SequentialType *sequentialType = 
-        dynamic_cast<const SequentialType *>(rvalueType);
-    if (sequentialType != NULL) {
-        // Assigning arrays and vectors. Handle each element individually
-        // with what turns into a recursive call to makedStore()
-        for (int i = 0; i < sequentialType->GetElementCount(); ++i) {
-            llvm::Value *eltLValue = GetElementPtrInst(lvalue, 0, i, "lval_i_ptr");
-            llvm::Value *eltValue = ExtractInst(rvalue, i, "array_i_val");
-            StoreInst(eltValue, eltLValue, storeMask, 
-                      sequentialType->GetElementType());
+                      collectionType->GetElementType(i));
        }
        return;
    }
@@ -1588,10 +1617,10 @@ FunctionEmitContext::scatter(llvm::Value *rvalue, llvm::Value *lvalue,
    const StructType *structType = dynamic_cast<const StructType *>(rvalueType);
    if (structType) {
        // Scatter the struct elements individually
-        for (int i = 0; i < structType->NumElements(); ++i) {
+        for (int i = 0; i < structType->GetElementCount(); ++i) {
            llvm::Value *lv = GetElementPtrInst(lvalue, 0, i);
            llvm::Value *rv = ExtractInst(rvalue, i);
-            scatter(rv, lv, storeMask, structType->GetMemberType(i));
+            scatter(rv, lv, storeMask, structType->GetElementType(i));
        }
        return;
    }
--- a/ctx.h
+++ b/ctx.h
@@ -305,10 +305,10 @@ public:

    llvm::Value *BitCastInst(llvm::Value *value, const llvm::Type *type,
                             const char *name = NULL);
-    llvm::Instruction *PtrToIntInst(llvm::Value *value, const llvm::Type *type,
-                                    const char *name = NULL);
-    llvm::Instruction *IntToPtrInst(llvm::Value *value, const llvm::Type *type,
-                                    const char *name = NULL);
+    llvm::Value *PtrToIntInst(llvm::Value *value, const llvm::Type *type,
+                              const char *name = NULL);
+    llvm::Value *IntToPtrInst(llvm::Value *value, const llvm::Type *type,
+                              const char *name = NULL);
    llvm::Instruction *TruncInst(llvm::Value *value, const llvm::Type *type,
                                 const char *name = NULL);
    llvm::Instruction *CastInst(llvm::Instruction::CastOps op, llvm::Value *value,
--- a/decl.cpp
+++ b/decl.cpp
@@ -318,9 +318,10 @@ Declaration::Print() const {
 ///////////////////////////////////////////////////////////////////////////

 void
-GetStructTypesAndNames(const std::vector<StructDeclaration *> &sd,
-                       std::vector<const Type *> *elementTypes,
-                       std::vector<std::string> *elementNames) {
+GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
+                             std::vector<const Type *> *elementTypes,
+                             std::vector<std::string> *elementNames,
+                             std::vector<SourcePos> *elementPositions) {
    for (unsigned int i = 0; i < sd.size(); ++i) {
        const Type *type = sd[i]->type;
        // FIXME: making this fake little DeclSpecs here is really
@@ -343,6 +344,7 @@ GetStructTypesAndNames(const std::vector<StructDeclaration *> &sd,

            elementTypes->push_back(d->sym->type);
            elementNames->push_back(d->sym->name);
+            elementPositions->push_back(d->sym->pos);
        }
    }
 }
--- a/decl.h
+++ b/decl.h
@@ -196,8 +196,9 @@ struct StructDeclaration {

 /** Given a set of StructDeclaration instances, this returns the types of
    the elements of the corresponding struct and their names. */
-extern void GetStructTypesAndNames(const std::vector<StructDeclaration *> &sd,
-                                   std::vector<const Type *> *elementTypes,
-                                   std::vector<std::string> *elementNames);
+extern void GetStructTypesNamesPositions(const std::vector<StructDeclaration *> &sd,
+                                         std::vector<const Type *> *elementTypes,
+                                         std::vector<std::string> *elementNames,
+                                         std::vector<SourcePos> *elementPositions);

 #endif // ISPC_DECL_H
--- a/docs/ReleaseNotes.txt
+++ b/docs/ReleaseNotes.txt
@@ -0,0 +1,26 @@
+=== v1.0.2 ===
+
+Floating-point hexadecimal constants are now parsed correctly on Windows
+(fixes issue #16).
+
+SSE2 is now the default target if --cpu=atom is given in the command line
+arguments and another target isn't explicitly specified.
+
+The standard library now provides broadcast(), rotate(), and shuffle()
+routines for efficient communication between program instances.
+
+The MSVC solution files to build the examples on Windows now use
+/fpmath:fast when building.
+
+=== v1.0.1 === (24 June 2011)
+
+ispc no longer requires that pointers to memory that are passed in to ispc
+have alignment equal to the target's vector width; now alignment just has to
+be the regular element alignment (e.g. 4 bytes for floats, etc.)  This
+change also fixed a number of cases where it previously incorrectly
+generated aligned load/store instructions in cases where the address wasn't
+actually aligned (even if the base address passed into ispc code was).
+
+=== v1.0 === (21 June 2011)
+
+Initial Release
--- a/docs/ispc.txt
+++ b/docs/ispc.txt
@@ -74,7 +74,8 @@ Contents:

  + `Math Functions`_
  + `Output Functions`_
-  + `Cross-Lane Operations`_
+  + `Cross-Program Instance Operations`_
+  + `Packed Load and Store Operations`_
  + `Low-Level Bits`_

 * `Interoperability with the Application`_
@@ -136,7 +137,7 @@ Linux\* and Mac OS\* available for download.  Alternatively, you can
 download the source code from that page and build it yourself; see see the
 `ispc wiki`_ for instructions about building ``ispc`` from source.

-.. _ispc downloads web page:downloads.html
+.. _ispc downloads web page: downloads.html
 .. _ispc wiki: http://github.com/ispc/ispc/wiki

 Once you have an executable for your system, copy it into a directory
@@ -340,7 +341,7 @@ before it's compiled.  On Windows®, pre-processor definitions should be
 provided to the ``cl`` call.

 By default, the compiler generates x86-64 Intel® SSE4 code.  To generate
-32-bit code, you can use the the ``--arch=x86`` command-line flag.  To
+32-bit code, you can use the ``--arch=x86`` command-line flag.  To
 select Intel® SSE2, use ``--target=sse2``.

 ``ispc`` supports an alternative method for generating Intel® SSE4 code,
@@ -1246,7 +1247,7 @@ section.)
 For ``if`` statements where the different running SPMD program instances
 don't have coherent values for the boolean ``if`` test, using ``cif``
 introduces some additional overhead from the ``all`` and ``any`` tests as
-well as the corresponding branches.  For cases where the the program
+well as the corresponding branches.  For cases where the program
 instances often do compute the same boolean value, this overhead is
 worthwhile.  If the control flow is in fact usually incoherent, this
 overhead only costs performance.
@@ -1659,14 +1660,14 @@ values for the inactive program instances aren't printed.  (In other cases,
 they may have garbage values or be otherwise undefined.)


-Cross-Lane Operations
---------------------
+Cross-Program Instance Operations
+---------------------------------

-Usually, ``ispc`` code expresses independent computation on separate data
-elements.  There are, however, a number of cases where it's useful for the
-program instances to be able to cooperate in computing results.  The
-cross-lane operations described in this section provide primitives for
-communication between the running program instances.
+Usually, ``ispc`` code expresses independent programs performing
+computation on separate data elements.  There are, however, a number of
+cases where it's useful for the program instances to be able to cooperate
+in computing results.  The cross-lane operations described in this section
+provide primitives for communication between the running program instances.
 
 A few routines that evaluate conditions across the running program
 instances.  For example, ``any()`` returns ``true`` if the given value
@@ -1678,6 +1679,47 @@ and ``all()`` returns ``true`` if it true for all of them.
    uniform bool any(bool v)
    uniform bool all(bool v)

+To broadcast a value from one program instance to all of the others, a
+``broadcast()`` function is available.  It broadcasts the value of the
+``value`` parameter for the program instance given by ``index`` to all of
+the running program instances.
+
+::
+
+    float broadcast(float value, uniform int index)
+    int32 broadcast(int32 value, uniform int index)
+    double broadcast(double value, uniform int index)
+    int64 broadcast(int64 value, uniform int index)
+
+The ``rotate()`` function allows each program instance to find the value of
+the given value that their neighbor ``offset`` steps away has.  For
+example, on an 8-wide target, if ``value`` has the values (1, 2, 3, 4, 5,
+6, 7, 8) across the running program instances, then ``rotate(value,
+-1)`` causes the first program instance to get the value 8, the second
+program instance to get the value 1, the third 2, and so forth.  The
+provided offset value can be positive or negative, and may be greater than
+``programCount`` (it is masked to ensure valid offsets).
+
+::
+
+    float rotate(float value, uniform int offset)
+    int32 rotate(int32 value, uniform int offset)
+    double rotate(double value, uniform int offset)
+    int64 rotate(int64 value, uniform int offset)
+
+
+Finally, ``shuffle()`` allows fully general shuffling of values among the
+program instances.  Each program instance's value of ``permutation`` gives the
+program instance from which to get the value of ``value``.  The provided
+values for ``permutation`` must all be between 0 and ``programCount-1``.
+
+::
+
+    float shuffle(float value, int permutation)
+    int32 shuffle(int32 value, int permutation)
+    double shuffle(double value, int permutation)
+    int64 shuffle(int64 value, int permutation)
+
 The various variants of ``popcnt()`` return the population count--the
 number of bits set in the given value.

@@ -1719,8 +1761,12 @@ given value across all of the currently-executing vector lanes.
    uniform unsigned int reduce_max(unsigned int a, unsigned int b)


-Finally, there are routines for writing out and reading in values from
-linear memory locations for the active program instances.
+
+Packed Load and Store Operations
+--------------------------------
+
+The standard library also offers routines for writing out and reading in
+values from linear memory locations for the active program instances.
 ``packed_load_active()`` loads consecutive values from the given array,
 starting at ``a[offset]``, loading one value for each currently-executing
 program instance and storing it into that program instance's ``val``
@@ -1797,14 +1843,15 @@ and this conversion step are necessary because ``ispc`` doesn't have native
    void store_to_int16(uniform int a[], uniform int offset, 
                        unsigned int val)

-There are two things to note in these functions.  First, note that these
+There are three things to note in these functions.  First, note that these
 functions take ``unsigned int`` arrays as parameters; you need
 to cast `the ``int8_t`` and ``int16_t`` pointers from the C/C++ side to
 ``unsigned int`` when passing them to ``ispc`` code.  Second, although the
 arrays are passed as ``unsigned int``, in the array indexing calculation,
 with the ``offset`` parameter, they are treated as if they were ``int8`` or
 ``int16`` types.  (i.e. the offset treated as being in terms of number of 8
-or 16-bit elements.)
+or 16-bit elements.) Third, note that ``programIndex`` is implicitly added
+to ``offset``.

 The ``intbits()`` and ``floatbits()`` functions can be used to implement
 low-level floating-point bit twiddling.  For example, ``intbits()`` returns
@@ -2279,21 +2326,11 @@ elements to work with and then proceeds with the computation.
 Communicating Between SPMD Program Instances
 --------------------------------------------

-The ``programIndex`` built-in variable (see `Mapping Data To Program
-Instances`_) can be used to communicate between the set of executing
-program instances.  Consider the following code, which shows all of the
-program instances writing into unique locations in an array.
-
-::
-
-    float x = ...;
-    uniform float allX[programCount];
-    allX[programIndex] = x;
-
-In this code, a program instance that reads ``allX[0]`` finds the value of
-``x`` that was computed by the first of the running program instances, and
-so forth.  Program instances can communicate with their neighbor instances
-with indexing like ``allX[(programIndex+1)%programCount]``.
+The ``broadcast()``, ``rotate()``, and ``shuffle()`` standard library
+routines provide a variety of mechanisms for the running program instances
+to communicate values to each other during execution.  See the section
+`Cross-Program Instance Operations`_ for more information about their
+operation.


 Gather and Scatter
--- a/doxygen.cfg
+++ b/doxygen.cfg
@@ -31,7 +31,7 @@ PROJECT_NAME           = "Intel SPMD Program Compiler"
 # This could be handy for archiving the generated documentation or
 # if some version control system is used.

-PROJECT_NUMBER         = 1.0
+PROJECT_NUMBER         = 1.0.2

 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
 # base path where the generated documentation will be put.
--- a/examples/aobench/aobench.vcxproj
+++ b/examples/aobench/aobench.vcxproj
@@ -102,6 +102,8 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -115,6 +117,8 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -130,6 +134,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -147,6 +152,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
--- a/examples/mandelbrot/mandelbrot.vcxproj
+++ b/examples/mandelbrot/mandelbrot.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -81,6 +81,8 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -94,6 +96,8 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -109,6 +113,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -126,6 +131,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -158,4 +164,4 @@
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
-</Project>
+</Project>
--- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
+++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj
@@ -81,6 +81,8 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -94,6 +96,8 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -109,6 +113,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -126,6 +131,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
--- a/examples/options/options.cpp
+++ b/examples/options/options.cpp
@@ -54,8 +54,6 @@ extern void binomial_put_serial(float Sa[], float Xa[], float Ta[],
                                float result[], int count);

 int main() {
-    // Pointers passed to ispc code must have alignment of the target's
-    // vector width at minimum.
    float *S = new float[N_OPTIONS];
    float *X = new float[N_OPTIONS];
    float *T = new float[N_OPTIONS];
--- a/examples/options/options.vcxproj
+++ b/examples/options/options.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -82,6 +82,8 @@
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -96,6 +98,8 @@
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -112,6 +116,7 @@
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -130,6 +135,7 @@
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <DisableSpecificWarnings>4305</DisableSpecificWarnings>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -165,4 +171,4 @@
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
-</Project>
+</Project>
--- a/examples/rt/rt.vcxproj
+++ b/examples/rt/rt.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -81,6 +81,8 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -94,6 +96,8 @@
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -109,6 +113,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -126,6 +131,7 @@
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <FloatingPointModel>Fast</FloatingPointModel>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
@@ -162,4 +168,4 @@ cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
-</Project>
+</Project>
--- a/expr.cpp
+++ b/expr.cpp
@@ -1526,7 +1526,7 @@ AssignExpr::GetValue(FunctionEmitContext *ctx) const {
        if (st != NULL) {
            bool anyUniform = false;
            for (int i = 0; i < st->NumElements(); ++i) {
-                if (st->GetMemberType(i)->IsUniformType())
+                if (st->GetElementType(i)->IsUniformType())
                    anyUniform = true;
            }

@@ -2489,71 +2489,57 @@ ExprList::TypeCheck() {

 llvm::Constant *
 ExprList::GetConstant(const Type *type) const {
-    const StructType *structType = dynamic_cast<const StructType *>(type);
-    const SequentialType *sequentialType = 
-        dynamic_cast<const SequentialType *>(type);
+    const CollectionType *collectionType = 
+        dynamic_cast<const CollectionType *>(type);
+    if (collectionType == NULL)
+        return NULL;

-    if (structType != NULL) {
-        // We can potentially return an llvm::ConstantStruct if we have the
-        // same number of elements in the ExprList as the struct has
-        // members (and the various elements line up with the shape of the
-        // corresponding struct elements).
-        if ((int)exprs.size() != structType->NumElements()) {
-            Error(pos, "Initializer list for struct \"%s\" must have %d "
-                  "elements (has %d).", structType->GetString().c_str(),
-                  (int)exprs.size(), structType->NumElements());
+    std::string name;
+    if (dynamic_cast<const StructType *>(type) != NULL)
+        name = "struct";
+    else if (dynamic_cast<const ArrayType *>(type) != NULL) 
+        name = "array";
+    else if (dynamic_cast<const VectorType *>(type) != NULL) 
+        name = "vector";
+    else 
+        FATAL("Unexpected CollectionType in ExprList::GetConstant()");
+
+    if ((int)exprs.size() != collectionType->GetElementCount()) {
+        Error(pos, "Initializer list for %s \"%s\" must have %d elements "
+              "(has %d).", name.c_str(), collectionType->GetString().c_str(),
+              collectionType->GetElementCount(), (int)exprs.size());
+        return NULL;
+    }
+
+    std::vector<llvm::Constant *> cv;
+    for (unsigned int i = 0; i < exprs.size(); ++i) {
+        if (exprs[i] == NULL)
            return NULL;
-        }
-
-        std::vector<llvm::Constant *> cv;
-        for (unsigned int i = 0; i < exprs.size(); ++i) {
-            if (exprs[i] == NULL)
-                return NULL;
-            const Type *elementType = structType->GetMemberType(i);
-            llvm::Constant *c = exprs[i]->GetConstant(elementType);
-            if (c == NULL)
-                // If this list element couldn't convert to the right
-                // constant type for the corresponding struct member, then
-                // give up
-                return NULL;
-            cv.push_back(c);
-        }
+        const Type *elementType = collectionType->GetElementType(i);
+        llvm::Constant *c = exprs[i]->GetConstant(elementType);
+        if (c == NULL)
+            // If this list element couldn't convert to the right constant
+            // type for the corresponding collection member, then give up.
+            return NULL;
+        cv.push_back(c);
+    }

+    if (dynamic_cast<const StructType *>(type) != NULL) {
 #if defined(LLVM_2_8) || defined(LLVM_2_9)
        return llvm::ConstantStruct::get(*g->ctx, cv, false);
 #else
        const llvm::StructType *llvmStructType =
-            llvm::dyn_cast<const llvm::StructType>(structType->LLVMType(g->ctx));
+            llvm::dyn_cast<const llvm::StructType>(collectionType->LLVMType(g->ctx));
        assert(llvmStructType != NULL);
        return llvm::ConstantStruct::get(llvmStructType, cv);
 #endif
    }
-    else if (sequentialType) {
-        // Similarly, if we have an array or vector type, we may be able to
-        // return the corresponding llvm constant value.
-        if ((int)exprs.size() != sequentialType->GetElementCount()) {
-            bool isArray = (dynamic_cast<const ArrayType *>(type) != NULL);
-            Error(pos, "Initializer list for %s \"%s\" must have %d elements (has %d).",
-                  isArray ? "array" : "vector", sequentialType->GetString().c_str(),
-                  (int)exprs.size(), sequentialType->GetElementCount());
-            return NULL;
-        }
-
-        std::vector<llvm::Constant *> cv;
-        for (unsigned int i = 0; i < exprs.size(); ++i) {
-            if (exprs[i] == NULL)
-                return NULL;
-            const Type *elementType = sequentialType->GetElementType();
-            llvm::Constant *c = exprs[i]->GetConstant(elementType);
-            if (c == NULL) 
-                return NULL;
-            cv.push_back(c);
-        }
-        
+    else {
        const llvm::Type *lt = type->LLVMType(g->ctx);
        const llvm::ArrayType *lat = llvm::dyn_cast<const llvm::ArrayType>(lt);
        // FIXME: should the assert below validly fail for uniform vectors
-        // now?
+        // now?  Need a test case to reproduce it and then to be sure we
+        // have the right fix; leave the assert until we can hit it...
        assert(lat != NULL);
        return llvm::ConstantArray::get(lat, cv);
    }
@@ -2832,7 +2818,7 @@ MemberExpr::GetType() const {
        // Otherwise it's a struct, and the result type is the element
        // type, possibly promoted to varying if the struct type / lvalue
        // is varying.
-        const Type *elementType = structType->GetMemberType(identifier);
+        const Type *elementType = structType->GetElementType(identifier);
        if (!elementType)
            Error(identifierPos, "Element name \"%s\" not present in struct type \"%s\".%s",
                  identifier.c_str(), structType->GetString().c_str(),
@@ -2912,7 +2898,7 @@ MemberExpr::getElementNumber() const {
        }
    }
    else {
-        elementNumber = structType->GetMemberNumber(identifier);
+        elementNumber = structType->GetElementNumber(identifier);
        if (elementNumber == -1)
            Error(identifierPos, "Element name \"%s\" not present in struct type \"%s\".%s",
                  identifier.c_str(), structType->GetString().c_str(),
@@ -3004,7 +2990,7 @@ MemberExpr::getCandidateNearMatches() const {
        return "";

    std::vector<std::string> elementNames;
-    for (int i = 0; i < structType->NumElements(); ++i)
+    for (int i = 0; i < structType->GetElementCount(); ++i)
        elementNames.push_back(structType->GetElementName(i));
    std::vector<std::string> alternates = MatchStrings(identifier, elementNames);
    if (!alternates.size())
@@ -3900,25 +3886,14 @@ lUniformValueToVarying(FunctionEmitContext *ctx, llvm::Value *value,
    const llvm::Type *llvmType = type->GetAsVaryingType()->LLVMType(g->ctx);
    llvm::Value *retValue = llvm::UndefValue::get(llvmType);

-    // for structs, just recursively make their elements varying (if
-    // needed) and populate the return struct
-    const StructType *structType = dynamic_cast<const StructType *>(type);
-    if (structType != NULL) {
-        for (int i = 0; i < structType->NumElements(); ++i) {
-            llvm::Value *v = ctx->ExtractInst(value, i, "struct_element");
-            v = lUniformValueToVarying(ctx, v, structType->GetMemberType(i));
-            retValue = ctx->InsertInst(retValue, v, i, "set_struct_element");
-        }
-        return retValue;
-    }
-
-    // And similarly do the elements of arrays and vectors individually
-    const SequentialType *sequentialType = 
-        dynamic_cast<const SequentialType *>(type);
-    if (sequentialType != NULL) {
-        for (int i = 0; i < sequentialType->GetElementCount(); ++i) {
+    // for structs/arrays/vectors, just recursively make their elements
+    // varying (if needed) and populate the return value.
+    const CollectionType *collectionType = 
+        dynamic_cast<const CollectionType *>(type);
+    if (collectionType != NULL) {
+        for (int i = 0; i < collectionType->GetElementCount(); ++i) {
            llvm::Value *v = ctx->ExtractInst(value, i, "get_element");
-            v = lUniformValueToVarying(ctx, v, sequentialType->GetElementType());
+            v = lUniformValueToVarying(ctx, v, collectionType->GetElementType(i));
            retValue = ctx->InsertInst(retValue, v, i, "set_element");
        }
        return retValue;
--- a/lex.ll
+++ b/lex.ll
@@ -45,6 +45,7 @@ static void lCComment(SourcePos *);
 static void lCppComment(SourcePos *);
 static void lHandleCppHash(SourcePos *);
 static void lStringConst(YYSTYPE *, SourcePos *);
+static double lParseHexFloat(const char *ptr);

 #define YY_USER_ACTION \
    yylloc->first_line = yylloc->last_line; \
@@ -65,7 +66,8 @@ inline int isatty(int) { return 0; }

 WHITESPACE [ \t\r]+
 INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))
-FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?)|([-]?0x[01]\.?[0-9a-fA-F]+p[-+]?[0-9]+[fF]?)
+FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?)
+HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?)

 IDENT [a-zA-Z_][a-zA-Z_0-9]*

@@ -182,13 +184,15 @@ L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL;
 }

 {FLOAT_NUMBER} { 
-    /* FIXME: need to implement a hex float constant parser so that we can 
-       support them on Windows (which doesn't handle them in its atof()
-       implementation... */
    yylval->floatVal = atof(yytext); 
    return TOKEN_FLOAT_CONSTANT; 
 }

+{HEX_FLOAT_NUMBER} {
+    yylval->floatVal = lParseHexFloat(yytext); 
+    return TOKEN_FLOAT_CONSTANT; 
+}
+
 "++" { return TOKEN_INC_OP; }
 "--" { return TOKEN_DEC_OP; }
 "<<" { return TOKEN_LEFT_OP; }
@@ -424,3 +428,82 @@ lStringConst(YYSTYPE *yylval, SourcePos *pos)
    } 
    yylval->stringVal = new std::string(str);
 }
+
+
+/** Compute the value 2^n, where the exponent is given as an integer.
+    Repeatedly multiplies by exact powers of two (stepping downward for
+    negative n, rather than computing 1 / 2^-n) so that results in the
+    denormal range aren't lost when 2^-n would overflow to infinity.
+*/
+static double
+ipow2(int exponent) {
+    const bool negative = (exponent < 0);
+    // negate as unsigned so that INT_MIN doesn't overflow
+    unsigned int steps = negative ? 0u - (unsigned int)exponent : (unsigned int)exponent;
+    const double step16 = negative ? (1. / 65536.) : 65536.;
+    const double step1 = negative ? .5 : 2.;
+    double ret = 1.;
+    for (; steps > 16; steps -= 16)
+        ret *= step16;
+    for (; steps > 0; --steps)
+        ret *= step1;
+    return ret;
+}
+
+
+/** Parse a hexadecimal-formatted floating-point number (C99 hex float
+    constant-style). 
+*/
+static double
+lParseHexFloat(const char *ptr) {
+    assert(ptr != NULL);
+
+    assert(ptr[0] == '0' && ptr[1] == 'x');
+    ptr += 2;
+
+    // Start initializing the mantissa
+    assert(*ptr == '0' || *ptr == '1');
+    double mantissa = (*ptr == '1') ? 1. : 0.;
+    ++ptr;
+
+    if (*ptr == '.') {
+        // Is there a fraction part?  If so, the i'th digit we encounter
+        // gives the 1/(16^i) component of the mantissa.
+        ++ptr;
+
+        double scale = 1. / 16.;
+        // Keep going until we come to the 'p', which indicates that we've
+        // come to the exponent
+        while (*ptr != 'p') {
+            // Figure out the raw value from 0-15
+            int digit;
+            if (*ptr >= '0' && *ptr <= '9')
+                digit = *ptr - '0';
+            else if (*ptr >= 'a' && *ptr <= 'f')
+                digit = 10 + *ptr - 'a';
+            else {
+                assert(*ptr >= 'A' && *ptr <= 'F');
+                digit = 10 + *ptr - 'A';
+            }
+
+            // And add its contribution to the mantissa
+            mantissa += scale * digit;
+            scale /= 16.;
+            ++ptr;
+        }
+    }
+    else
+        // If there's not a '.', then we better be going straight to the
+        // exponent
+        assert(*ptr == 'p');
+
+    ++ptr; // skip the 'p'
+
+    // interestingly enough, the exponent is provided base 10..
+    int exponent = (int)strtol(ptr, (char **)NULL, 10);
+
+    // Does stdlib exp2() guarantee exact results for integer n where 2^n
+    // can be represented exactly as a double?  I would hope so but am not
+    // sure, so use our own ipow2() to be certain.
+    return mantissa * ipow2(exponent);
+}
--- a/main.cpp
+++ b/main.cpp
@@ -91,7 +91,7 @@ static void usage(int ret) {
    printf("        disable-gather-scatter-flattening\tDisable flattening when all lanes are on\n");
    printf("        disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n");
    printf("        disable-masked-store-optimizations\tDisable lowering to regular stores when possible\n");
-    printf("    [--target={sse2,sse4,sse4x2,avx}] Select target ISA (SSE4 is default)\n");
+    printf("    [--target={sse2,sse4,sse4x2,avx}] Select target ISA (SSE4 is default unless compiling for atom; then SSE2 is.)\n");
    printf("    [--version]\t\t\t\tPrint ispc version\n");
    printf("    [--woff]\t\t\t\tDisable warnings\n");
    printf("    [--wno-perf]\t\t\tDon't issue warnings related to performance-related issues\n");
@@ -192,7 +192,7 @@ int main(int Argc, char *Argv[]) {
    // as we're parsing below
    g = new Globals;

-    bool debugSet = false, optSet = false;
+    bool debugSet = false, optSet = false, targetSet = false;
    Module::OutputType ot = Module::Object;

    for (int i = 1; i < argc; ++i) {
@@ -226,6 +226,7 @@ int main(int Argc, char *Argv[]) {
        else if (!strcmp(argv[i], "--target")) {
            if (++i == argc) usage(1);
            lDoTarget(argv[i]);
+            targetSet = true;
        }
        else if (!strncmp(argv[i], "--target=", 9)) {
            const char *target = argv[i] + 9;
@@ -315,6 +316,11 @@ int main(int Argc, char *Argv[]) {
    if (debugSet && !optSet)
        g->opt.level = 0;

+    // Make SSE2 the default target on atom unless the target has been set
+    // explicitly.
+    if (!targetSet && (g->target.cpu == "atom"))
+        lDoTarget("sse2");
+
    m = new Module(file);
    if (m->CompileFile() == 0) {
        if (outFileName != NULL)
--- a/module.cpp
+++ b/module.cpp
@@ -248,8 +248,8 @@ lRecursiveCheckVarying(const Type *t) {

    const StructType *st = dynamic_cast<const StructType *>(t);
    if (st) {
-        for (int i = 0; i < st->NumElements(); ++i)
-            if (lRecursiveCheckVarying(st->GetMemberType(i)))
+        for (int i = 0; i < st->GetElementCount(); ++i)
+            if (lRecursiveCheckVarying(st->GetElementType(i)))
                return true;
    }
    return false;
@@ -1041,8 +1041,8 @@ Module::writeObjectFileOrAssembly(OutputType outputType, const char *outFileName
 static void
 lRecursiveAddStructs(const StructType *structType,
                     std::vector<const StructType *> &structParamTypes) {
-    for (int i = 0; i < structType->NumElements(); ++i) {
-        const Type *elementBaseType = structType->GetMemberType(i)->GetBaseType();
+    for (int i = 0; i < structType->GetElementCount(); ++i) {
+        const Type *elementBaseType = structType->GetElementType(i)->GetBaseType();
        const StructType *elementStructType = 
            dynamic_cast<const StructType *>(elementBaseType);
        if (elementStructType != NULL) {
@@ -1112,9 +1112,9 @@ lEmitStructDecls(std::vector<const StructType *> &structTypes, FILE *file) {
        StructDAGNode *node = new StructDAGNode;
        structToNode[st] = node;

-        for (int j = 0; j < st->NumElements(); ++j) {
+        for (int j = 0; j < st->GetElementCount(); ++j) {
            const StructType *elementStructType = 
-                dynamic_cast<const StructType *>(st->GetMemberType(j));
+                dynamic_cast<const StructType *>(st->GetElementType(j));
            // If this element is a struct type and we haven't already
            // processed it for the current struct type, then upate th
            // dependencies and record that this element type has other
@@ -1144,8 +1144,8 @@ lEmitStructDecls(std::vector<const StructType *> &structTypes, FILE *file) {
    for (unsigned int i = 0; i < sortedTypes.size(); ++i) {
        const StructType *st = sortedTypes[i];
        fprintf(file, "struct %s {\n", st->GetStructName().c_str());
-        for (int j = 0; j < st->NumElements(); ++j) {
-            const Type *type = st->GetMemberType(j)->GetAsNonConstType();
+        for (int j = 0; j < st->GetElementCount(); ++j) {
+            const Type *type = st->GetElementType(j)->GetAsNonConstType();
            std::string d = type->GetCDeclaration(st->GetElementName(j));
            fprintf(file, "    %s;\n", d.c_str());
        }
@@ -1210,8 +1210,8 @@ lGetVectorsFromStructs(const std::vector<const StructType *> &structParamTypes,
                       std::vector<const VectorType *> *vectorParamTypes) {
    for (unsigned int i = 0; i < structParamTypes.size(); ++i) {
        const StructType *structType = structParamTypes[i];
-        for (int j = 0; j < structType->NumElements(); ++j) {
-            const Type *elementType = structType->GetMemberType(j);
+        for (int j = 0; j < structType->GetElementCount(); ++j) {
+            const Type *elementType = structType->GetElementType(j);

            const ArrayType *at = dynamic_cast<const ArrayType *>(elementType);
            if (at)
--- a/opt.cpp
+++ b/opt.cpp
@@ -2116,11 +2116,12 @@ CreateLowerGatherScatterPass() {
 // IsCompileTimeConstantPass

 /** LLVM IR implementations of target-specific functions may include calls
-    to a function "bool __is_compile_time_constant_mask(mask type)"; this
-    allows them to have specialied code paths for where the mask is known
-    at compile time but not incurring the cost of a MOVMSK call at runtime
-    to compute its value in cases where the mask value isn't known until
-    runtime.
+    to the functions "bool __is_compile_time_constant_mask(mask type)" and
+    "bool __is_compile_time_constant_int32(i32)"; these allow them to have
+    specialized code paths for when the corresponding value is known at
+    compile time.  For masks, for example, this allows them to not incur
+    the cost of a MOVMSK call at runtime to compute its value in cases
+    where the mask value isn't known until runtime.

    This pass resolves these calls into either 'true' or 'false' values so
    that later optimization passes can operate with these as constants.
@@ -2148,17 +2149,17 @@ llvm::RegisterPass<IsCompileTimeConstantPass>

 bool
 IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) {
-    llvm::Function *func = m->module->getFunction("__is_compile_time_constant_mask");
-    if (!func)
-        return false;
+    llvm::Function *maskFunc = m->module->getFunction("__is_compile_time_constant_mask");
+    llvm::Function *int32Func = m->module->getFunction("__is_compile_time_constant_int32");

    bool modifiedAny = false;
 restart:
    for (llvm::BasicBlock::iterator i = bb.begin(), e = bb.end(); i != e; ++i) {
-        // Iterate through the instructions looking for calls to
-        // __is_compile_time_constant_mask().
+        // Look for direct calls to the __is_compile_time_constant_*() functions;
+        // a NULL callee (indirect call) mustn't match a missing maskFunc/int32Func.
        llvm::CallInst *callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
-        if (!callInst || callInst->getCalledFunction() != func)
+        llvm::Function *callee = callInst ? callInst->getCalledFunction() : NULL;
+        if (callee == NULL || (callee != maskFunc && callee != int32Func))
            continue;

        // This optimization pass can be disabled with the (poorly named)
@@ -2171,8 +2172,8 @@ IsCompileTimeConstantPass::runOnBasicBlock(llvm::BasicBlock &bb) {

        // Is it a constant?  Bingo, turn the call's value into a constant
        // true value.
-        llvm::Value *mask = callInst->getArgOperand(0);
-        if (llvm::isa<llvm::Constant>(mask)) {
+        llvm::Value *operand = callInst->getArgOperand(0);
+        if (llvm::isa<llvm::Constant>(operand)) {
            llvm::ReplaceInstWithValue(i->getParent()->getInstList(), i, LLVMTrue);
            modifiedAny = true;
            goto restart;
--- a/parse.yy
+++ b/parse.yy
@@ -564,9 +564,11 @@ struct_or_union_specifier
      { 
          std::vector<const Type *> elementTypes;
          std::vector<std::string> elementNames;
-          GetStructTypesAndNames(*$4, &elementTypes, &elementNames);
+          std::vector<SourcePos> elementPositions;
+          GetStructTypesNamesPositions(*$4, &elementTypes, &elementNames,
+                                       &elementPositions);
          StructType *st = new StructType($2, elementTypes, elementNames,
-                                          false, true, @2);
+                                          elementPositions, false, true, @2);
          m->symbolTable->AddType($2, st, @2);
          $$ = st;
      }
@@ -574,8 +576,11 @@ struct_or_union_specifier
      {
          std::vector<const Type *> elementTypes;
          std::vector<std::string> elementNames;
-          GetStructTypesAndNames(*$3, &elementTypes, &elementNames);
-          $$ = new StructType("", elementTypes, elementNames, false, true, @1);
+          std::vector<SourcePos> elementPositions;
+          GetStructTypesNamesPositions(*$3, &elementTypes, &elementNames,
+                                       &elementPositions);
+          $$ = new StructType("", elementTypes, elementNames, elementPositions,
+                              false, true, @1);
      }
    | struct_or_union '{' '}' 
      {
--- a/run_tests.sh
+++ b/run_tests.sh
@@ -1,40 +1,86 @@
-#!/bin/zsh
+#!/bin/bash

 surprises=0
+verbose=false
+number=$(ls -1 tests/*.ispc|wc -l)
+counter=1
+target=sse4

-echo Running correctness tests
+# Parse options; "t:" makes -t consume its argument (the target name).
+while getopts ":vt:h" opt;do
+    case $opt in
+        v) verbose=true ;;
+        t) target=$OPTARG ;;
+        h) cat <<EOF
+           usage: run_tests.sh [-v] [-t target] [filenames]
+                  -v           # verbose output
+                  -t           # specify compilation target (SSE4 is the default).
+                  [filenames]  # (optional) files to run through testing infrastructure
+                               # if none are provided, all in tests/ will be run.
+EOF
+            exit 1 ;;
+        *) echo "run_tests.sh: invalid option or missing argument: -$OPTARG" >&2; exit 1 ;;
+    esac
+done

-for i in tests/*.ispc; do
-    bc=${i%%ispc}bc
-    ispc -O2 $i -woff -o $bc --emit-llvm --target=sse4
-    if [[ $? != 0 ]]; then
-        surprises=1
-        echo Test $i FAILED ispc compile
-        echo
-    else
-        ispc_test $bc
+shift $(( $OPTIND - 1 ))
+# Any remaining (non-empty) arguments name the specific tests to run.
+if [[ -n "$1" ]]; then
+    while [[ -n "$1" ]]; do
+        i=$1; shift
+        echo Running test $i
+
+        bc=${i%%ispc}bc
+        ispc -O2 $i -woff -o $bc --emit-llvm --target=$target
        if [[ $? != 0 ]]; then
            surprises=1
-            echo Test $i FAILED ispc_test
+            echo Test $i FAILED ispc compile
+            echo
+        else
+            ispc_test $bc
+            if [[ $? != 0 ]]; then
+                surprises=1
+                echo Test $i FAILED ispc_test
+                echo
+            fi
+        fi
+        /bin/rm $bc
+    done
+else
+    echo Running all correctness tests
+
+    for i in tests/*.ispc; do
+        if $verbose; then
+            echo -en "Running test $counter of $number.\r"
+        fi
+        (( counter++ ))
+        bc=${i%%ispc}bc
+        ispc -O2 $i -woff -o $bc --emit-llvm --target=$target
+        if [[ $? != 0 ]]; then
+            surprises=1
+            echo Test $i FAILED ispc compile
+            echo
+        else
+            ispc_test $bc
+            if [[ $? != 0 ]]; then
+                surprises=1
+                echo Test $i FAILED ispc_test
+                echo
+            fi
+        fi
+        /bin/rm $bc
+    done
+
+    echo Running failing tests
+    for i in failing_tests/*.ispc; do
+        (ispc -O2 $i -woff -o - --emit-llvm | ispc_test -) 2>/dev/null 1>/dev/null
+        if [[ $? == 0 ]]; then
+            surprises=1
+            echo Test $i UNEXPECTEDLY PASSED
            echo
        fi
-#        cmp $bc tests_bitcode${bc##tests}
-#        if [[ $? == 0 ]]; then
-#            /bin/rm $bc
-#        fi
-    fi
-    /bin/rm $bc
-done
-
-echo Running failing tests
-for i in failing_tests/*.ispc; do
-    (ispc -O2 $i -woff -o - --emit-llvm | ispc_test -) 2>/dev/null 1>/dev/null
-    if [[ $? == 0 ]]; then
-        surprises=1
-        echo Test $i UNEXPECTEDLY PASSED
-        echo
-    fi
-done
+    done
+fi

 if [[ $surprises == 0 ]]; then
    echo No surprises.
--- a/stdlib-avx.ll
+++ b/stdlib-avx.ll
@@ -525,12 +525,53 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
 }


-define void @__masked_store_blend_64(<8 x i64>* nocapture, <8 x i64>,
-                                     <8 x i32>) nounwind alwaysinline {
-  ; always just serialize it
-  ; FIXME: should implement the "do two 32-bit masked stores" stuff that
-  ; other targets do...
-  call void @__masked_store_64(<8 x i64>* nocapture %0, <8 x i64> %1, <8 x i32> %2)
+define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
+                                     <8 x i32> %i32mask) nounwind alwaysinline {
+  %oldValue = load <8 x i64>* %ptr, align 8
+  %mask = bitcast <8 x i32> %i32mask to <8 x float>
+
+  ; Do the 8x64-bit blend as two 4x64-bit blends, each performed as an
+  ; <8 x float> blend whose operands are actually bitcast <4 x i64> values
+  ;
+  ; set up the first four 64-bit values
+  %old01  = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
+                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %old01f = bitcast <4 x i64> %old01 to <8 x float>
+  %new01  = shufflevector <8 x i64> %new, <8 x i64> undef,
+                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %new01f = bitcast <4 x i64> %new01 to <8 x float>
+  ; compute mask--note that the indices are all doubled-up
+  %mask01 = shufflevector <8 x float> %mask, <8 x float> undef,
+                          <8 x i32> <i32 0, i32 0, i32 1, i32 1,
+                                     i32 2, i32 2, i32 3, i32 3>
+  ; and blend them
+  %result01f = call <8 x float> @llvm.x86.avx.blendvps(<8 x float> %old01f,
+                                                       <8 x float> %new01f,
+                                                       <8 x float> %mask01)
+  %result01 = bitcast <8 x float> %result01f to <4 x i64>
+
+  ; and again
+  %old23  = shufflevector <8 x i64> %oldValue, <8 x i64> undef,
+                          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %old23f = bitcast <4 x i64> %old23 to <8 x float>
+  %new23  = shufflevector <8 x i64> %new, <8 x i64> undef,
+                          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %new23f = bitcast <4 x i64> %new23 to <8 x float>
+  ; compute mask--note that the values are doubled-up...
+  %mask23 = shufflevector <8 x float> %mask, <8 x float> undef,
+                          <8 x i32> <i32 4, i32 4, i32 5, i32 5,
+                                     i32 6, i32 6, i32 7, i32 7>
+  ; and blend them
+  %result23f = call <8 x float> @llvm.x86.avx.blendvps(<8 x float> %old23f,
+                                                       <8 x float> %new23f,
+                                                       <8 x float> %mask23)
+  %result23 = bitcast <8 x float> %result23f to <4 x i64>
+
+  ; reconstruct the final <8 x i64> vector
+  %final = shufflevector <4 x i64> %result01, <4 x i64> %result23,
+                         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+                                    i32 4, i32 5, i32 6, i32 7>
+  store <8 x i64> %final, <8 x i64> * %ptr, align 8
  ret void
 }

--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -81,6 +81,54 @@ static inline uniform unsigned int64 intbits(uniform double d) {
    return __intbits_uniform_double(d);
 }

+static inline float broadcast(float v, uniform int i) {
+    return __broadcast_float(v, i);
+}
+
+static inline int32 broadcast(int32 v, uniform int i) {
+    return __broadcast_int32(v, i);
+}
+
+static inline double broadcast(double v, uniform int i) {
+    return __broadcast_double(v, i);
+}
+
+static inline int64 broadcast(int64 v, uniform int i) {
+    return __broadcast_int64(v, i);
+}
+
+static inline float rotate(float v, uniform int i) {
+    return __rotate_float(v, i);
+}
+
+static inline int32 rotate(int32 v, uniform int i) {
+    return __rotate_int32(v, i);
+}
+
+static inline double rotate(double v, uniform int i) {
+    return __rotate_double(v, i);
+}
+
+static inline int64 rotate(int64 v, uniform int i) {
+    return __rotate_int64(v, i);
+}
+
+static inline float shuffle(float v, int i) {
+    return __shuffle_float(v, i);
+}
+
+static inline int32 shuffle(int32 v, int i) {
+    return __shuffle_int32(v, i);
+}
+
+static inline double shuffle(double v, int i) {
+    return __shuffle_double(v, i);
+}
+
+static inline int64 shuffle(int64 v, int i) {
+    return __shuffle_int64(v, i);
+}
+
 // x[i]
 static inline uniform float extract(float x, uniform int i) {
    return __extract(x, i);
--- a/stdlib.m4
+++ b/stdlib.m4
@@ -34,6 +34,8 @@
 ;; builtins for various targets can use macros from this file to simplify
 ;; generating code for their implementations of those builtins.

+declare i1 @__is_compile_time_constant_int32(i32)
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


@@ -284,6 +286,22 @@ ret <8 x float> %ret
 '
 )

+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; forloop macro
+
+divert(`-1')
+# forloop(var, from, to, stmt) - improved version:
+#   works even if VAR is not a strict macro name
+#   performs sanity check that TO is not smaller than FROM
+#   allows complex numerical expressions in TO and FROM
+define(`forloop', `ifelse(eval(`($3) >= ($2)'), `1',
+  `pushdef(`$1', eval(`$2'))_$0(`$1',
+    eval(`$3'), `$4')popdef(`$1')')')
+define(`_forloop',
+  `$3`'ifelse(indir(`$1'), `$2', `',
+    `define(`$1', incr(indir(`$1')))$0($@)')')
+divert`'dnl
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; stdlib_core
 ;;
@@ -291,8 +309,67 @@ ret <8 x float> %ret
 ;; target's vector width, which it takes as its first parameter.
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

+define(`shuffles', `
+define internal <$1 x $2> @__broadcast_$3(<$1 x $2>, i32) nounwind readnone alwaysinline {
+  %v = extractelement <$1 x $2> %0, i32 %1
+  %r_0 = insertelement <$1 x $2> undef, $2 %v, i32 0
+forloop(i, 1, eval($1-1), `  %r_`'i = insertelement <$1 x $2> %r_`'eval(i-1), $2 %v, i32 i
+')
+  ret <$1 x $2> %r_`'eval($1-1)
+}
+
+define internal <$1 x $2> @__rotate_$3(<$1 x $2>, i32) nounwind readnone alwaysinline {
+  %isc = call i1 @__is_compile_time_constant_int32(i32 %1)
+  br i1 %isc, label %is_const, label %not_const
+
+is_const:
+  ; though verbose, this turns into tight code if %1 is a constant
+forloop(i, 0, eval($1-1), `  
+  %delta_`'i = add i32 %1, i
+  %delta_clamped_`'i = and i32 %delta_`'i, eval($1-1)
+  %v_`'i = extractelement <$1 x $2> %0, i32 %delta_clamped_`'i')
+
+  %ret_0 = insertelement <$1 x $2> undef, $2 %v_0, i32 0
+forloop(i, 1, eval($1-1), `  %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1), $2 %v_`'i, i32 i
+')
+  ret <$1 x $2> %ret_`'eval($1-1)
+
+not_const:
+  ; store two instances of the vector into memory
+  %ptr = alloca <$1 x $2>, i32 2
+  %ptr0 = getelementptr <$1 x $2> * %ptr, i32 0
+  store <$1 x $2> %0, <$1 x $2> * %ptr0
+  %ptr1 = getelementptr <$1 x $2> * %ptr, i32 1
+  store <$1 x $2> %0, <$1 x $2> * %ptr1
+
+  ; compute offset in [0,vectorwidth-1], then index into the doubled-up vector
+  %offset = and i32 %1, eval($1-1)
+  %ptr_as_elt_array = bitcast <$1 x $2> * %ptr to [eval(2*$1) x $2] *
+  %load_ptr = getelementptr [eval(2*$1) x $2] * %ptr_as_elt_array, i32 0, i32 %offset
+  %load_ptr_vec = bitcast $2 * %load_ptr to <$1 x $2> *
+  %result = load <$1 x $2> * %load_ptr_vec, align $4
+  ret <$1 x $2> %result
+}
+
+define internal <$1 x $2> @__shuffle_$3(<$1 x $2>, <$1 x i32>) nounwind readnone alwaysinline {
+forloop(i, 0, eval($1-1), `  
+  %index_`'i = extractelement <$1 x i32> %1, i32 i')
+forloop(i, 0, eval($1-1), `  
+  %v_`'i = extractelement <$1 x $2> %0, i32 %index_`'i')
+
+  %ret_0 = insertelement <$1 x $2> undef, $2 %v_0, i32 0
+forloop(i, 1, eval($1-1), `  %ret_`'i = insertelement <$1 x $2> %ret_`'eval(i-1), $2 %v_`'i, i32 i
+')
+  ret <$1 x $2> %ret_`'eval($1-1)
+}
+
+')
+
+
 define(`stdlib_core', `

+declare i1 @__is_compile_time_constant_mask(<$1 x i32> %mask)
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; vector ops

@@ -307,6 +384,10 @@ define internal <$1 x float> @__insert(<$1 x float>, i32,
  ret <$1 x float> %insert
 }

+shuffles($1, float, float, 4)
+shuffles($1, i32, int32, 4)
+shuffles($1, double, double, 8)
+shuffles($1, i64, int64, 8)

 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; various bitcasts from one type to another
@@ -524,7 +605,6 @@ define internal void @__store_uint16([0 x i32] *, i32 %offset, <$1 x i32> %val32
 ;; FIXME: use the per_lane macro, defined below, to implement these!

 define(`packed_load_and_store', `
-declare i1 @__is_compile_time_constant_mask(<$1 x i32> %mask)

 define i32 @__packed_load_active([0 x i32] *, i32 %start_offset, <$1 x i32> * %val_ptr,
                                 <$1 x i32> %full_mask) nounwind alwaysinline {
@@ -661,19 +741,6 @@ done:
 ;;       Inside this code, any instances of the text "LANE" are replaced
 ;;       with an i32 value that represents the current lane number

-divert(`-1')
-# forloop(var, from, to, stmt) - improved version:
-#   works even if VAR is not a strict macro name
-#   performs sanity check that FROM is larger than TO
-#   allows complex numerical expressions in TO and FROM
-define(`forloop', `ifelse(eval(`($3) >= ($2)'), `1',
-  `pushdef(`$1', eval(`$2'))_$0(`$1',
-    eval(`$3'), `$4')popdef(`$1')')')
-define(`_forloop',
-  `$3`'ifelse(indir(`$1'), `$2', `',
-    `define(`$1', incr(indir(`$1')))$0($@)')')
-divert`'dnl
-
 ; num lanes, mask, code block to do per lane
 define(`per_lane', `
  br label %pl_entry
--- a/stmt.cpp
+++ b/stmt.cpp
@@ -178,88 +178,59 @@ lInitSymbol(llvm::Value *lvalue, const char *symName, const Type *type,
        return;
    }

-    // There are two cases for initializing arrays and vectors; either a single
-    // initializer may be provided (float foo[3] = 0;), in which case all
-    // of the array elements are initialized to the given value, or an
-    // initializer list may be provided (float foo[3] = { 1,2,3 }), in
-    // which case the array elements are initialized with the corresponding
+    // There are two cases for initializing structs, arrays and vectors;
+    // either a single initializer may be provided (float foo[3] = 0;), in
+    // which case all of the elements are initialized to the given value,
+    // or an initializer list may be provided (float foo[3] = { 1,2,3 }),
+    // in which case the elements are initialized with the corresponding
    // values.
-    const SequentialType *seqType = dynamic_cast<const SequentialType *>(type);
-    if (seqType != NULL) {
-        ExprList *exprList = dynamic_cast<ExprList *>(initExpr);
-        if (exprList == NULL) {
-            // We have single expression; loop over the elements of the
-            // array/vector and initialize each of them with it
-            // individually.
-            for (int i = 0; i < seqType->GetElementCount(); ++i) {
-                llvm::Value *ptr = ctx->GetElementPtrInst(lvalue, 0, i, "offset");
-                lInitSymbol(ptr, symName, seqType->GetElementType(), initExpr, 
-                            ctx, pos);
-            }
-        }
-        else {
-            // Otherwise make sure that we have the same number of elements
-            // in the { } initializer expression as we have in the
-            // array/vector
-            int nInits = exprList->exprs.size();
-            if (nInits != seqType->GetElementCount()) {
-                const char *actualType = dynamic_cast<const ArrayType *>(type) ? 
-                    "Array" : "Vector";
-                Error(initExpr->pos, "%s initializer for variable \"%s\" requires "
-                      "%d values; %d provided.", actualType, symName, 
-                      seqType->GetElementCount(), nInits);
-            }
-            else {
-                // And initialize each of the array/vector elements with
-                // the corresponding expression from the ExprList
-                for (int i = 0; i < nInits; ++i) {
-                    llvm::Value *ptr = ctx->GetElementPtrInst(lvalue, 0, i, "offset");
-                    lInitSymbol(ptr, symName, seqType->GetElementType(), 
-                                exprList->exprs[i], ctx, pos);
-                }
-            }
-        }
-        return;
-    }
+    const CollectionType *collectionType = 
+        dynamic_cast<const CollectionType *>(type);
+    if (collectionType != NULL) {
+        std::string name;
+        if (dynamic_cast<const StructType *>(type) != NULL)
+            name = "struct";
+        else if (dynamic_cast<const ArrayType *>(type) != NULL) 
+            name = "array";
+        else if (dynamic_cast<const VectorType *>(type) != NULL) 
+            name = "vector";
+        else 
+            FATAL("Unexpected CollectionType in lInitSymbol()");

-    // Structs can similarly be initialized in one of two ways; either with
-    // a list of expressions in braces, one expression per struct member,
-    // or with a single expression that is used to initialize all struct
-    // members.
-    const StructType *st = dynamic_cast<const StructType *>(type);
-    if (st) {
        ExprList *exprList = dynamic_cast<ExprList *>(initExpr);
        if (exprList != NULL) {
            // The { ... } case; make sure we have the same number of
            // expressions in the ExprList as we have struct members
            int nInits = exprList->exprs.size();
-            if (nInits != st->NumElements())
-                Error(initExpr->pos, 
-                      "Initializer for struct \"%s\" requires %d values; %d provided.",
-                      symName, st->NumElements(), nInits);
-            else {
-                // Initialize each struct member with the corresponding
-                // value from the ExprList
-                for (int i = 0; i < nInits; ++i) {
-                    llvm::Value *ep = ctx->GetElementPtrInst(lvalue, 0, i, "structelement");
-                    lInitSymbol(ep, symName, st->GetMemberType(i), exprList->exprs[i],
-                                ctx, pos);
-                }
+            if (nInits != collectionType->GetElementCount()) {
+                Error(initExpr->pos, "Initializer for %s \"%s\" requires "
+                      "%d values; %d provided.", name.c_str(), symName, 
+                      collectionType->GetElementCount(), nInits);
+                return;
+            }
+
+            // Initialize each element with the corresponding value from
+            // the ExprList
+            for (int i = 0; i < nInits; ++i) {
+                llvm::Value *ep = ctx->GetElementPtrInst(lvalue, 0, i, "element");
+                lInitSymbol(ep, symName, collectionType->GetElementType(i), 
+                            exprList->exprs[i], ctx, pos);
            }
        }
        else if (initExpr->GetType()->IsNumericType() ||
                 initExpr->GetType()->IsBoolType()) {
-            // Otherwise initialize all of the struct elements in turn with
-            // the initExpr.
-            for (int i = 0; i < st->NumElements(); ++i) {
-                llvm::Value *ep = ctx->GetElementPtrInst(lvalue, 0, i, "structelement");
-                lInitSymbol(ep, symName, st->GetMemberType(i), initExpr, ctx, pos);
+            // Otherwise initialize all of the elements in turn with the
+            // initExpr.
+            for (int i = 0; i < collectionType->GetElementCount(); ++i) {
+                llvm::Value *ep = ctx->GetElementPtrInst(lvalue, 0, i, "element");
+                lInitSymbol(ep, symName, collectionType->GetElementType(i), 
+                            initExpr, ctx, pos);
            }
        }
        else {
            Error(initExpr->pos, "Can't assign type \"%s\" to \"%s\".",
                  initExpr->GetType()->GetString().c_str(),
-                  st->GetString().c_str());
+                  collectionType->GetString().c_str());
        }
        return;
    }
--- a/tests/broadcast-1.ispc
+++ b/tests/broadcast-1.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int a = aFOO[programIndex]; 
+    int br = broadcast(a, (uniform int)b-2);
+    RET[programIndex] = br;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 4;
+}
--- a/tests/broadcast.ispc
+++ b/tests/broadcast.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    float a = aFOO[programIndex]; 
+    float b = broadcast(a, 2);
+    RET[programIndex] = b;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 3;
+}
--- a/tests/rotate-1.ispc
+++ b/tests/rotate-1.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int a = aFOO[programIndex]; 
+    int rot = rotate(a, -1);
+    RET[programIndex] = rot;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + (programIndex + programCount - 1) % programCount;
+}
--- a/tests/rotate-2.ispc
+++ b/tests/rotate-2.ispc
@@ -0,0 +1,13 @@
+
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int a = aFOO[programIndex]; 
+    uniform int delta = b - 6; // -1
+    int rot = rotate(a, delta);
+    RET[programIndex] = rot;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + (programIndex + programCount - 1) % programCount;
+}
--- a/tests/rotate-3.ispc
+++ b/tests/rotate-3.ispc
@@ -0,0 +1,13 @@
+
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int64 a = aFOO[programIndex]; 
+    uniform int delta = b - 6; // -1
+    int64 rot = rotate(a, delta);
+    RET[programIndex] = rot;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + (programIndex + programCount - 1) % programCount;
+}
--- a/tests/rotate-4.ispc
+++ b/tests/rotate-4.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int64 a = aFOO[programIndex]; 
+    int64 rot = rotate(a, -1);
+    RET[programIndex] = rot;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + (programIndex + programCount - 1) % programCount;
+}
--- a/tests/rotate.ispc
+++ b/tests/rotate.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int a = aFOO[programIndex]; 
+    int rot = rotate(a, 2);
+    RET[programIndex] = rot;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + (programIndex + 2) % programCount;
+}
--- a/tests/shuffle-1.ispc
+++ b/tests/shuffle-1.ispc
@@ -0,0 +1,13 @@
+
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    int reverse = programCount - 1 - programIndex;
+    float shuf = shuffle(a, reverse);
+    RET[programIndex] = shuf;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = programCount - programIndex;
+}
--- a/tests/shuffle-2.ispc
+++ b/tests/shuffle-2.ispc
@@ -0,0 +1,13 @@
+
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float a = aFOO[programIndex]; 
+    int reverse = programCount - 1 - programIndex + (int)b - 5;
+    float shuf = shuffle(a, reverse);
+    RET[programIndex] = shuf;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = programCount - programIndex;
+}
--- a/tests/shuffle.ispc
+++ b/tests/shuffle.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    int32 a = aFOO[programIndex]; 
+    int32 shuf = shuffle(a, 1);
+    RET[programIndex] = shuf;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 2;
+}
--- a/type.cpp
+++ b/type.cpp
@@ -410,6 +410,14 @@ AtomicType::GetDIType(llvm::DIDescriptor scope) const {
 }


+///////////////////////////////////////////////////////////////////////////
+// SequentialType
+
+const Type *SequentialType::GetElementType(int index) const {
+    return GetElementType();
+}
+
+
 ///////////////////////////////////////////////////////////////////////////
 // ArrayType

@@ -961,9 +969,10 @@ VectorType::getVectorMemoryCount() const {

 StructType::StructType(const std::string &n, const std::vector<const Type *> &elts, 
                       const std::vector<std::string> &en,
+                       const std::vector<SourcePos> &ep,
                       bool ic, bool iu, SourcePos p) 
-    : name(n), elementTypes(elts), elementNames(en), isUniform(iu), isConst(ic), 
-      pos(p) {
+    : name(n), elementTypes(elts), elementNames(en), elementPositions(ep),
+      isUniform(iu), isConst(ic), pos(p) {
 }


@@ -1014,8 +1023,8 @@ StructType::GetAsVaryingType() const {
    if (IsVaryingType()) 
        return this;
    else
-        return new StructType(name, elementTypes, elementNames, isConst,
-                              false, pos);
+        return new StructType(name, elementTypes, elementNames, elementPositions,
+                              isConst, false, pos);
 }


@@ -1024,8 +1033,8 @@ StructType::GetAsUniformType() const {
    if (IsUniformType()) 
        return this;
    else
-        return new StructType(name, elementTypes, elementNames, isConst,
-                              true, pos);
+        return new StructType(name, elementTypes, elementNames, elementPositions,
+                              isConst, true, pos);
 }


@@ -1034,11 +1043,12 @@ StructType::GetSOAType(int width) const {
    std::vector<const Type *> et;
    // The SOA version of a structure is just a structure that holds SOAed
    // versions of its elements
-    for (int i = 0; i < NumElements(); ++i) {
-        const Type *t = GetMemberType(i);
+    for (int i = 0; i < GetElementCount(); ++i) {
+        const Type *t = GetElementType(i);
        et.push_back(t->GetSOAType(width));
    }
-    return new StructType(name, et, elementNames, isConst, isUniform, pos);
+    return new StructType(name, et, elementNames, elementPositions,
+                          isConst, isUniform, pos);
 }


@@ -1047,8 +1057,8 @@ StructType::GetAsConstType() const {
    if (IsConstType()) 
        return this;
    else
-        return new StructType(name, elementTypes, elementNames, true,
-                              isUniform, pos);
+        return new StructType(name, elementTypes, elementNames, 
+                              elementPositions, true, isUniform, pos);
 }


@@ -1057,8 +1067,8 @@ StructType::GetAsNonConstType() const {
    if (!IsConstType()) 
        return this;
    else
-        return new StructType(name, elementTypes, elementNames, false,
-                              isUniform, pos);
+        return new StructType(name, elementTypes, elementNames, elementPositions,
+                              false, isUniform, pos);
 }


@@ -1123,8 +1133,8 @@ StructType::GetCDeclaration(const std::string &n) const {
 const llvm::Type *
 StructType::LLVMType(llvm::LLVMContext *ctx) const {
    std::vector<const llvm::Type *> llvmTypes;
-    for (int i = 0; i < NumElements(); ++i) {
-        const Type *type = GetMemberType(i);
+    for (int i = 0; i < GetElementCount(); ++i) {
+        const Type *type = GetElementType(i);
        llvmTypes.push_back(type->LLVMType(ctx));
    }
    return llvm::StructType::get(*ctx, llvmTypes);
@@ -1138,14 +1148,13 @@ StructType::GetDIType(llvm::DIDescriptor scope) const {
    return llvm::DIType();
 #else
    uint64_t currentSize = 0, align = 0;
-    llvm::DIFile diFile = pos.GetDIFile();

    std::vector<llvm::Value *> elementLLVMTypes;
    // Walk through the elements of the struct; for each one figure out its
    // alignment and size, using that to figure out its offset w.r.t. the
    // start of the structure.
    for (unsigned int i = 0; i < elementTypes.size(); ++i) {
-        llvm::DIType eltType = GetMemberType(i)->GetDIType(scope);
+        llvm::DIType eltType = GetElementType(i)->GetDIType(scope);
        uint64_t eltAlign = eltType.getAlignInBits();
        uint64_t eltSize = eltType.getSizeInBits();

@@ -1159,12 +1168,19 @@ StructType::GetDIType(llvm::DIDescriptor scope) const {
            currentSize += eltAlign - (currentSize % eltAlign);
        assert((currentSize == 0) || (currentSize % eltAlign) == 0);

-        // FIXME: we should pass this actual file/line number for the
-        // member, not the position of the struct declaration
+        llvm::DIFile diFile = elementPositions[i].GetDIFile();
+        int line = elementPositions[i].first_line;
+#ifdef LLVM_2_9
        llvm::DIType fieldType = 
-            m->diBuilder->createMemberType(elementNames[i], diFile, pos.first_line,
+            m->diBuilder->createMemberType(elementNames[i], diFile, line,
                                           eltSize, eltAlign, currentSize, 0,
                                           eltType);
+#else
+        llvm::DIType fieldType = 
+            m->diBuilder->createMemberType(scope, elementNames[i], diFile, 
+                                           line, eltSize, eltAlign, 
+                                           currentSize, 0, eltType);
+#endif // LLVM_2_9
        elementLLVMTypes.push_back(fieldType);

        currentSize += eltSize;
@@ -1181,6 +1197,7 @@ StructType::GetDIType(llvm::DIDescriptor scope) const {
 #else
    llvm::DIArray elements = m->diBuilder->getOrCreateArray(elementLLVMTypes);
 #endif
+    llvm::DIFile diFile = pos.GetDIFile();
    return m->diBuilder->createStructType(scope, name, diFile, pos.first_line, currentSize, 
                                          align, 0, elements);
 #endif // LLVM_2_8
@@ -1188,7 +1205,7 @@ StructType::GetDIType(llvm::DIDescriptor scope) const {


 const Type *
-StructType::GetMemberType(int i) const {
+StructType::GetElementType(int i) const {
    assert(i < (int)elementTypes.size());
    // If the struct is uniform qualified, then each member comes out with
    // the same type as in the original source file.  If it's varying, then
@@ -1200,7 +1217,7 @@ StructType::GetMemberType(int i) const {


 const Type *
-StructType::GetMemberType(const std::string &n) const {
+StructType::GetElementType(const std::string &n) const {
    for (unsigned int i = 0; i < elementNames.size(); ++i)
        if (elementNames[i] == n) {
            const Type *ret = isUniform ? elementTypes[i] : 
@@ -1212,7 +1229,7 @@ StructType::GetMemberType(const std::string &n) const {


 int
-StructType::GetMemberNumber(const std::string &n) const {
+StructType::GetElementNumber(const std::string &n) const {
    for (unsigned int i = 0; i < elementNames.size(); ++i)
        if (elementNames[i] == n)
            return i;
@@ -1766,10 +1783,10 @@ Type::Equal(const Type *a, const Type *b) {
    const StructType *sta = dynamic_cast<const StructType *>(a);
    const StructType *stb = dynamic_cast<const StructType *>(b);
    if (sta && stb) {
-        if (sta->NumElements() != stb->NumElements())
+        if (sta->GetElementCount() != stb->GetElementCount())
            return false;
-        for (int i = 0; i < sta->NumElements(); ++i)
-            if (!Equal(sta->GetMemberType(i), stb->GetMemberType(i)))
+        for (int i = 0; i < sta->GetElementCount(); ++i)
+            if (!Equal(sta->GetElementType(i), stb->GetElementType(i)))
                return false;
        return true;
    }
--- a/type.h
+++ b/type.h
@@ -243,19 +243,42 @@ private:
 };


-/** @brief Abstract base class for tpyes that represent sequences
+/** @brief Abstract base class for types that represent collections of
+    other types.
+
+    This is a common base class that StructTypes, ArrayTypes, and
+    VectorTypes all inherit from.
+*/ 
+class CollectionType : public Type {
+public:
+    /** Returns the total number of elements in the collection. */
+    virtual int GetElementCount() const = 0;
+
+    /** Returns the type of the element given by index.  (The value of
+        index must be between 0 and GetElementCount()-1.)
+     */
+    virtual const Type *GetElementType(int index) const = 0;
+};
+
+
+/** @brief Abstract base class for types that represent sequences

    SequentialType is an abstract base class that adds interface routines
    for types that represent linear sequences of other types (i.e., arrays
    and vectors).
 */
-class SequentialType : public Type {
+class SequentialType : public CollectionType {
 public:
-    /** Returns the total number of elements in the sequence. */
-    virtual int GetElementCount() const = 0;
-
-    /** Returns the Type of the elements that the sequence stores. */
+    /** Returns the Type of the elements that the sequence stores; for
+        SequentialTypes, all elements have the same type. */
    virtual const Type *GetElementType() const = 0;
+
+    /** SequentialType provides an implementation of this CollectionType
+        method, just passing the query on to the GetElementType(void)
+        implementation, since all of the elements of a SequentialType have
+        the same type.
+     */
+    const Type *GetElementType(int index) const;
 };


@@ -439,10 +462,11 @@ private:

 /** @brief Representation of a structure holding a number of members.
 */
-class StructType : public Type {
+class StructType : public CollectionType {
 public:
    StructType(const std::string &name, const std::vector<const Type *> &elts, 
-               const std::vector<std::string> &eltNames, bool isConst, 
+               const std::vector<std::string> &eltNames, 
+               const std::vector<SourcePos> &eltPositions, bool isConst, 
               bool isUniform, SourcePos pos);

    bool IsUniformType() const;
@@ -468,21 +492,21 @@ public:

    /** Returns the type of the structure element with the given name (if any).
        Returns NULL if there is no such named element. */
-    const Type *GetMemberType(const std::string &name) const;
+    const Type *GetElementType(const std::string &name) const;

    /** Returns the type of the i'th structure element.  The value of \c i must
        be between 0 and NumElements()-1. */
-    const Type *GetMemberType(int i) const;
+    const Type *GetElementType(int i) const;

    /** Returns which structure element number (starting from zero) that
        has the given name.  If there is no such element, return -1. */
-    int GetMemberNumber(const std::string &name) const;
+    int GetElementNumber(const std::string &name) const;

    /** Returns the name of the i'th element of the structure. */
    const std::string GetElementName(int i) const { return elementNames[i]; }
    
    /** Returns the total number of elements in the structure. */
-    int NumElements() const { return int(elementTypes.size()); }
+    int GetElementCount() const { return int(elementTypes.size()); }

    /** Returns the name of the structure type.  (e.g. struct Foo -> "Foo".) */
    const std::string &GetStructName() const { return name; }
@@ -501,6 +525,9 @@ private:
     */
    const std::vector<const Type *> elementTypes;
    const std::vector<std::string> elementNames;
+    /** Source file position at which each structure element declaration
+        appeared. */
+    const std::vector<SourcePos> elementPositions;
    const bool isUniform;
    const bool isConst;
    const SourcePos pos;
Author	SHA1	Message	Date
Matt Pharr	32764e7639	Update release notes, doxygen version number	2011-07-01 05:12:57 +01:00
Matt Pharr	bcae21dbca	Update examples to use fpmath:fast and to enable intrinsics on Windows	2011-06-30 13:17:14 -07:00
Matt Pharr	eb22fa6173	Generalize FunctionEmitContext::PtrToIntInst and IntToPtrInst to do the right thing if given a varying lvalue (i.e. an array of pointers). Fixes issue #34.	2011-06-29 12:38:12 +01:00
Matt Pharr	5f7e61f9b5	Another stdlib dependency improvement	2011-06-29 12:26:44 +01:00
Matt Pharr	28a68e3c1f	More code simplifications from using CollectionType. Finishes Issue #37	2011-06-29 09:32:31 +01:00
Matt Pharr	6b153566f3	Simplify a bunch of code by using CollectionType to collect struct codepaths in with array/vector codepaths. (Issue #37).	2011-06-29 07:59:43 +01:00
Matt Pharr	214fb3197a	Initial plumbing to add CollectionType base-class as common ancestor to StructTypes, ArrayTypes, and VectorTypes. Issue #37.	2011-06-29 07:42:09 +01:00
Matt Pharr	b4068efcfb	Fixes to run_tests.sh script - Use bash, not zsh (don't make people install zsh for no good reason) - Print help if -h command line option is given - Allow specifying the compilation target to use on the command line - If one or more filenames are provided, just run those tests. Otherwise, run everything in the tests/ directory.	2011-06-29 07:25:01 +01:00
Matt Pharr	24216d841f	Update release notes for 1.0.2 stuff so far	2011-06-29 07:00:17 +01:00
Matt Pharr	be45beb54b	Implement our own routine to turn C99-style hexadecimal float constants in strong form into floating-point values. With this, we can correctly handle hex float constants on Windows, where the builtin atof() routine just returns zero for them. Fixes issue #16 .	2011-06-29 06:57:39 +01:00
Matt Pharr	cb58c78c1a	Pipe through source file locations of structure element declarations; these are now supplied to the llvm::DIBuilder::createMemberType() method rather than giving it the position of the overall struct declaration for each one. Fixes issue #31	2011-06-29 05:38:42 +01:00
Matt Pharr	86de910ecd	Improve implementation of __masked_store_blend_64() for AVX target by doing two 8-wide 32-bit blends rather than serializing. Fixes issue #29	2011-06-28 20:52:06 -07:00
Matt Pharr	ce7978ae74	Align stack-allocated arrays of uniform types to the target vector alignment (they will often be accessed in programCount-sized chunks and this should make that a bit more efficient in the common case). Fixes issue #15	2011-06-28 20:42:18 -07:00
Matt Pharr	7aec7486f8	Make SSE2 the default target on Atom CPUs unless explicitly overridden. (Fixes issue #45	2011-06-28 08:32:58 -07:00
Daniel Schubert	b6d6ee6fc2	Fixed typos.	2011-06-28 07:38:00 -07:00
Matt Pharr	cb74346d36	Fix typo (thx jsimmons)	2011-06-27 19:51:46 -07:00
Matt Pharr	2709c354d7	Add support for broadcast(), rotate(), and shuffle() stdlib routines	2011-06-27 17:31:44 -07:00
Matt Pharr	36063bae79	Update call to llvm::DIBuilder::createMemberType to fix building with LLVM dev TOT	2011-06-26 08:00:00 -07:00
Matt Pharr	e6d6a82484	Merge pull request #41 from benharper123/master Update docs on store/load int8/16	2011-06-25 17:28:21 -07:00
Ben Harper	f830e21cfa	Updated docs for store/load int8/int16	2011-06-26 02:02:18 +02:00
Matt Pharr	ae2c24c3c1	Merge branch 'master' of github.com:ispc/ispc	2011-06-24 17:06:08 -07:00
Andreas Wendleder	6dfd74c74c	Add verbose flag and report progress.	2011-06-24 17:05:24 -07:00
Matt Pharr	7055888cb7	Merge branch 'master' of github.com:ispc/ispc	2011-06-24 16:21:54 -07:00
Matt Pharr	7854a71ea9	Merge branch 'master' of github.com:ispc/ispc	2011-06-24 16:21:06 -07:00
Matt Pharr	b7519d1268	fix date in ReleaseNotes.txt	2011-06-24 16:20:36 -07:00
Matt Pharr	f2758f0831	Merge branch 'master' of github.com:ispc/ispc	2011-06-24 16:20:06 -07:00
Matt Pharr	ff76c2334e	small doc fix, removed incorrect comment from example	2011-06-24 16:19:51 -07:00
Matt Pharr	9b6bf5dabc	Add release notes doc	2011-06-24 16:11:46 -07:00
Matt Pharr	ab33afaea4	Merge branch 'master' of home:/Users/mmp/git/ispc	2011-06-23 18:54:14 -07:00
Matt Pharr	fab5794faf	Merge branch 'master' of github.com:ispc/ispc	2011-06-23 18:25:44 -07:00
Matt Pharr	3c3cd88692	initial alignment work	2011-06-23 17:36:44 -07:00