Merge branch 'master' of /Users/mmp/git/ispc

Finished updating alignment issues for vector types; don't assume pointers
are aligned to the natural vector width.
2011-06-24 05:11:06 -07:00 · 2011-06-23 18:51:15 -07:00 · 2011-06-23 18:21:02 -07:00 · 2011-06-23 18:18:33 -07:00 · 2011-06-23 16:10:03 -07:00 · 2011-06-23 16:06:38 -07:00
33 changed files with 146 additions and 234 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,6 @@
 *.pyc
 *~
+depend
+ispc
+ispc_test
+objs
--- a/13
+++ b/13
@@ -43,7 +43,7 @@ OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(STDLIB_SRC:.ll=.o) stdlib-c.o stdli

 default: ispc ispc_test

-.PHONY: dirs clean depend doxygen
+.PHONY: dirs clean depend doxygen print_llvm_src
 .PRECIOUS: objs/stdlib-%.cpp

 depend: $(CXX_SRC) $(HEADERS)
@@ -56,6 +56,9 @@ dirs:
 	@echo Creating objs/ directory
 	@/bin/mkdir -p objs

+print_llvm_src:
+	@echo Using LLVM `llvm-config --version` from `llvm-config --libdir`
+
 clean:
 	/bin/rm -rf objs ispc ispc_test

@@ -63,7 +66,7 @@ doxygen:
 	/bin/rm -rf docs/doxygen
 	doxygen doxygen.cfg

-ispc: dirs $(OBJS)
+ispc: print_llvm_src dirs $(OBJS)
 	@echo Creating ispc executable
 	@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(LLVM_LIBS)

@@ -83,11 +86,11 @@ objs/parse.o: objs/parse.cc $(HEADERS)
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<

-objs/lex.cpp: lex.ll
+objs/lex.cpp: lex.ll 
 	@echo Running flex on $<
 	@$(LEX) -o $@ $<

-objs/lex.o: objs/lex.cpp $(HEADERS)
+objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<

@@ -111,7 +114,7 @@ objs/stdlib-c.o: objs/stdlib-c.cpp

 objs/stdlib_ispc.cpp: stdlib.ispc
 	@echo Creating C++ source from $<
-	@$(CPP) -DISPC=1 -DPI=3.1415936535 $< | ./stdlib2cpp.py > $@
+	@$(CPP) -DISPC=1 -DPI=3.1415926536 $< | ./stdlib2cpp.py > $@

 objs/stdlib_ispc.o: objs/stdlib_ispc.cpp
 	@echo Compiling $<
--- a/READMErst.txt
+++ b/READMErst.txt
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -1315,8 +1315,21 @@ FunctionEmitContext::LoadInst(llvm::Value *lvalue, const Type *type,

    if (llvm::isa<const llvm::PointerType>(lvalue->getType())) {
        // If the lvalue is a straight up regular pointer, then just issue
-        // a regular load
-        llvm::Instruction *inst = new llvm::LoadInst(lvalue, name ? name : "load", bblock);
+        // a regular load.  First figure out the alignment; in general we
+        // can just assume the natural alignment (0 here), but for varying
+        // atomic types, we need to make sure that the compiler emits
+        // unaligned vector loads, so we specify a reduced alignment here.
+        int align = 0;
+        const AtomicType *atomicType = dynamic_cast<const AtomicType *>(type);
+        if (atomicType != NULL && atomicType->IsVaryingType())
+            // We actually just want to align to the vector element
+            // alignment, but can't easily get that here, so just tell LLVM
+            // it's totally unaligned.  (This shouldn't make any difference
+            // vs the proper alignment in practice.)
+            align = 1;
+        llvm::Instruction *inst = new llvm::LoadInst(lvalue, name ? name : "load",
+                                                     false /* not volatile */,
+                                                     align, bblock);
        AddDebugPos(inst);
        return inst;
    }
@@ -1437,7 +1450,7 @@ FunctionEmitContext::addGSMetadata(llvm::Instruction *inst, SourcePos pos) {

    llvm::Value *line = LLVMInt32(pos.first_line);
 #ifdef LLVM_2_8
-    md = llvm::MDNode::get(*g->ctx, &first_line, 1);
+    md = llvm::MDNode::get(*g->ctx, &line, 1);
 #else
    md = llvm::MDNode::get(*g->ctx, line);
 #endif
@@ -1445,7 +1458,7 @@ FunctionEmitContext::addGSMetadata(llvm::Instruction *inst, SourcePos pos) {

    llvm::Value *column = LLVMInt32(pos.first_column);
 #ifdef LLVM_2_8
-    md = llvm::MDNode::get(*g->ctx, &first_column, 1);
+    md = llvm::MDNode::get(*g->ctx, &column, 1);
 #else
    md = llvm::MDNode::get(*g->ctx, column);
 #endif
@@ -1644,7 +1657,16 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
        return;
    }

-    llvm::Instruction *inst = new llvm::StoreInst(rvalue, lvalue, name, bblock);
+    llvm::Instruction *inst;
+    if (llvm::isa<llvm::VectorType>(rvalue->getType()))
+        // Specify an unaligned store, since we don't know that the lvalue
+        // will in fact be aligned to a vector width here.  (Actually
+        // should be aligned to the alignment of the vector element type...)
+        inst = new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
+                                   1, bblock);
+    else
+        inst = new llvm::StoreInst(rvalue, lvalue, bblock);
+
    AddDebugPos(inst);
 }

@@ -1661,8 +1683,8 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,

    // Figure out what kind of store we're doing here
    if (rvalueType->IsUniformType()) {
-        // The easy case; a regular store
-        llvm::Instruction *si = new llvm::StoreInst(rvalue, lvalue, name, bblock);
+        // The easy case; a regular store, natural alignment is fine
+        llvm::Instruction *si = new llvm::StoreInst(rvalue, lvalue, bblock);
        AddDebugPos(si);
    }
    else if (llvm::isa<const llvm::ArrayType>(lvalue->getType()))
@@ -1672,9 +1694,7 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
    else if (storeMask == LLVMMaskAllOn) {
        // Otherwise it is a masked store unless we can determine that the
        // mask is all on...
-        llvm::Instruction *si = 
-            new llvm::StoreInst(rvalue, lvalue, name, bblock);
-        AddDebugPos(si);
+        StoreInst(rvalue, lvalue, name);
    }
    else
        maskedStore(rvalue, lvalue, rvalueType, storeMask);
--- a/docs/ispc.txt
+++ b/docs/ispc.txt
@@ -1970,7 +1970,7 @@ Data Layout

 In general, ``ispc`` tries to ensure that ``struct`` s and other complex
 datatypes are laid out in the same way in memory as they are in C/C++.
-Matching alignment is important for easy interoperability between C/C++
+Matching structure layout is important for easy interoperability between C/C++
 code and ``ispc`` code.

 The main complexity in sharing data between ``ispc`` and C/C++ often comes
@@ -2023,11 +2023,6 @@ It can pass ``array`` to a ``ispc`` function defined as:

   export void foo(uniform float array[], uniform int count)

-(Though the pointer must be aligned to the compilation target's natural
-vector width; see the discussion of alignment restrictions in `Data
-Alignment and Aliasing`_ and the aligned allocation routines in
-``examples/options/options.cpp`` for example.)
-
 Similarly, ``struct`` s from the application can have embedded pointers.
 This is handled with similar ``[]`` syntax:

@@ -2062,55 +2057,20 @@ vector types from C/C++ application code if possible.
 Data Alignment and Aliasing
 ---------------------------

-There are two important constraints that must be adhered to when passing
-pointers from the application to ``ispc`` programs.
+There are two important constraints that must be adhered to when
+passing pointers from the application to ``ispc`` programs.

-The first constraint is alignment: any pointers from the host program that
-are passed to ``ispc`` must be aligned to natural vector alignment of
-system--for example, 16 byte alignment on a target that supports Intel®
-SSE, 32-byte on an Intel® AVX target.  If this constraint isn't met, the
-program may abort at runtime with an unaligned memory access error.
+The first is that it is required that it be valid to read memory at the
+first element of any array that is passed to ``ispc``.  In practice, this
+should just happen naturally, but it does mean that it is illegal to pass a
+``NULL`` pointer as a parameter to a ``ispc`` function called from the
+application.

-For example, in a ``ispc`` function with the following declaration:
-
-::
-
-    export void foo(uniform float in[], uniform float out[],
-                    int count);
-
-If the application is passing stack-allocated arrays for ``in`` and
-``out``, these C/C++ compiler must be told to align these arrays.
-
-::
-
-    // MSVC, SSE target
-    __declspec(align(16)) float in[16], out[16];
-    foo(in, out, 16);
-
-With the gcc/clang compilers, the syntax for providing alignment is
-slightly different:
-
-::
-
-    float x[16] __attribute__ ((__align__(16)));
-    foo(in, out, 16);
-
-If the data being passed is dynamically allocated, the appropriate system
-aligned memory allocation routine should be used to allocate it (for
-example, ``_aligned_malloc()`` with Windows\*, ``memalign()`` with
-Linux\*; see the ``AllocAligned()`` function in ``examples/rt/rt.cpp`` for
-an example.)
-
-It is also required that it be valid to read memory at the first element of
-any array that is passed to ``ispc``.  In practice, this should just
-happen naturally, but it does mean that it is illegal to pass a ``NULL``
-pointer as a parameter to a ``ispc`` function called from the application.
-
-The second key constraint is that pointers and references in ``ispc``
-programs must not alias.  The ``ispc`` compiler assumes that different
-pointers can't end up pointing to the same memory location, either due to
-having the same initial value, or through array indexing in the program as
-it executed.
+The second constraint is that pointers and references in ``ispc`` programs
+must not alias.  The ``ispc`` compiler assumes that different pointers
+can't end up pointing to the same memory location, either due to having the
+same initial value, or through array indexing in the program as it
+executes.

 This aliasing constraint also applies to ``reference`` parameters to
 functions.  Given a function like:
@@ -2127,8 +2087,8 @@ another case of aliasing, and if the caller calls the function as ``func(x,
 x)``, it's not guaranteed that the ``if`` test will evaluate to true, due
 to the compiler's requirement of no aliasing.

-(In the future, ``ispc`` will have the ability to work with unaligned
-memory as well as have a mechanism to indicate that pointers may alias.)
+(In the future, ``ispc`` will have a mechanism to indicate that pointers
+may alias.)

 Using ISPC Effectively
 ======================
--- a/examples/aobench/Makefile
+++ b/examples/aobench/Makefile
@@ -1,8 +1,8 @@

-CXX=g++
+CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -O3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --fast-math
+ISPCFLAGS=-O2 --fast-math --arch=x86-64

 default: ao

--- a/examples/aobench/ao.cpp
+++ b/examples/aobench/ao.cpp
@@ -103,24 +103,6 @@ savePPM(const char *fname, int w, int h)
 }


-// Allocate memory with 64-byte alignment.
-float *
-AllocAligned(int size) {
-#if defined(_WIN32) || defined(_WIN64)
-    return (float *)_aligned_malloc(size, 64);
-#elif defined (__APPLE__)
-    // Allocate excess memory to ensure an aligned pointer can be returned
-    void *mem = malloc(size + (64-1) + sizeof(void*));
-    char *amem = ((char*)mem) + sizeof(void*);
-    amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
-    ((void**)amem)[-1] = mem;
-    return (float *)amem;
-#else
-    return (float *)memalign(64, size);
-#endif
-}
-
-
 int main(int argc, char **argv)
 {
    if (argc != 4) {
@@ -136,8 +118,8 @@ int main(int argc, char **argv)
    }

    // Allocate space for output images
-    img = (unsigned char *)AllocAligned(width * height * 3);
-    fimg = (float *)AllocAligned(sizeof(float) * width * height * 3);
+    img = new unsigned char[width * height * 3];
+    fimg = new float[width * height * 3];

    //
    // Run the ispc path, test_iterations times, and report the minimum
--- a/examples/aobench_instrumented/Makefile
+++ b/examples/aobench_instrumented/Makefile
@@ -1,8 +1,8 @@

-CXX=g++
+CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -g3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --fast-math --instrument
+ISPCFLAGS=-O2 --fast-math --instrument --arch=x86-64

 default: ao

--- a/examples/aobench_instrumented/ao.cpp
+++ b/examples/aobench_instrumented/ao.cpp
@@ -102,24 +102,6 @@ savePPM(const char *fname, int w, int h)
 }


-// Allocate memory with 64-byte alignment.
-float *
-AllocAligned(int size) {
-#if defined(_WIN32) || defined(_WIN64)
-    return (float *)_aligned_malloc(size, 64);
-#elif defined (__APPLE__)
-    // Allocate excess memory to ensure an aligned pointer can be returned
-    void *mem = malloc(size + (64-1) + sizeof(void*));
-    char *amem = ((char*)mem) + sizeof(void*);
-    amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
-    ((void**)amem)[-1] = mem;
-    return (float *)amem;
-#else
-    return (float *)memalign(64, size);
-#endif
-}
-
-
 int main(int argc, char **argv)
 {
    if (argc != 4) {
@@ -135,8 +117,8 @@ int main(int argc, char **argv)
    }

    // Allocate space for output images
-    img = (unsigned char *)AllocAligned(width * height * 3);
-    fimg = (float *)AllocAligned(sizeof(float) * width * height * 3);
+    img = new unsigned char[width * height * 3];
+    fimg = new float[width * height * 3];

    ao_ispc(width, height, NSUBSAMPLES, fimg);

--- a/examples/mandelbrot/.gitignore
+++ b/examples/mandelbrot/.gitignore
@@ -0,0 +1,3 @@
+mandelbrot
+*.ppm
+objs
--- a/examples/mandelbrot/Makefile
+++ b/examples/mandelbrot/Makefile
@@ -1,8 +1,8 @@

-CXX=g++
+CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -O3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse4x2
+ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64

 default: mandelbrot

--- a/examples/mandelbrot_tasks/Makefile
+++ b/examples/mandelbrot_tasks/Makefile
@@ -11,10 +11,10 @@ endif

 TASK_OBJ=$(addprefix objs/, $(TASK_CXX:.cpp=.o))

-CXX=g++
+CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -O3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse4x2
+ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64

 default: mandelbrot

--- a/examples/options/Makefile
+++ b/examples/options/Makefile
@@ -1,8 +1,8 @@

-CXX=g++
+CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -g -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse4x2
+ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64

 default: options

--- a/examples/options/options.cpp
+++ b/examples/options/options.cpp
@@ -37,9 +37,6 @@
 #include <assert.h>
 #include <math.h>
 #include <algorithm>
-#ifndef __APPLE__
-#include <malloc.h>
-#endif // !__APPLE__
 using std::max;

 #include "options_defs.h"
@@ -48,23 +45,6 @@ using std::max;
 #include "options_ispc.h"
 using namespace ispc;

-// Allocate memory with 64-byte alignment.
-float *AllocFloats(int count) {
-    int size = count * sizeof(float);
-#if defined(_WIN32) || defined(_WIN64)
-    return (float *)_aligned_malloc(size, 64);
-#elif defined (__APPLE__)
-    // Allocate excess memory to ensure an aligned pointer can be returned
-    void *mem = malloc(size + (64-1) + sizeof(void*));
-    char *amem = ((char*)mem) + sizeof(void*);
-    amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
-    ((void**)amem)[-1] = mem;
-    return (float *)amem;
-#else
-    return (float *)memalign(64, size);
-#endif
-}
-
 extern void black_scholes_serial(float Sa[], float Xa[], float Ta[], 
                                 float ra[], float va[], 
                                 float result[], int count);
@@ -76,12 +56,12 @@ extern void binomial_put_serial(float Sa[], float Xa[], float Ta[],
 int main() {
    // Pointers passed to ispc code must have alignment of the target's
    // vector width at minimum.
-    float *S = AllocFloats(N_OPTIONS);
-    float *X = AllocFloats(N_OPTIONS);
-    float *T = AllocFloats(N_OPTIONS);
-    float *r = AllocFloats(N_OPTIONS);
-    float *v = AllocFloats(N_OPTIONS);
-    float *result = AllocFloats(N_OPTIONS);
+    float *S = new float[N_OPTIONS];
+    float *X = new float[N_OPTIONS];
+    float *T = new float[N_OPTIONS];
+    float *r = new float[N_OPTIONS];
+    float *v = new float[N_OPTIONS];
+    float *result = new float[N_OPTIONS];

    for (int i = 0; i < N_OPTIONS; ++i) {
        S[i] = 100;  // stock price
--- a/examples/rt/Makefile
+++ b/examples/rt/Makefile
@@ -1,8 +1,8 @@

-CXX=g++
+CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -O3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse4x2
+ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64

 default: rt

--- a/examples/rt/rt.cpp
+++ b/examples/rt/rt.cpp
@@ -43,9 +43,6 @@
 #include <algorithm>
 #include <assert.h>
 #include <sys/types.h>
-#ifndef __APPLE__
-#include <malloc.h>
-#endif
 #include "../timing.h"
 #include "rt_ispc.h"

@@ -53,23 +50,6 @@ using namespace ispc;

 typedef unsigned int uint;

-template <typename T> 
-T *AllocAligned(int count) {
-    int size = count * sizeof(T);
-#if defined(_WIN32) || defined(_WIN64)
-    return (T *)_aligned_malloc(size, 64);
-#elif defined (__APPLE__)
-    // Allocate excess memory to ensure an aligned pointer can be returned
-    void *mem = malloc(size + (64-1) + sizeof(void*));
-    char *amem = ((char*)mem) + sizeof(void*);
-    amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
-    ((void**)amem)[-1] = mem;
-    return (T *)amem;
-#else
-    return (T *)memalign(64, size);
-#endif
-}
-
 extern void raytrace_serial(int width, int height, const float raster2camera[4][4], 
                            const float camera2world[4][4], float image[],
                            int id[], const LinearBVHNode nodes[],
@@ -161,7 +141,7 @@ int main(int argc, char *argv[]) {
    uint nNodes;
    READ(nNodes, 1);

-    LinearBVHNode *nodes = AllocAligned<LinearBVHNode>(nNodes);
+    LinearBVHNode *nodes = new LinearBVHNode[nNodes];
    for (unsigned int i = 0; i < nNodes; ++i) {
        // Each node is 6x floats for a boox, then an integer for an offset
        // to the second child node, then an integer that encodes the type
@@ -181,7 +161,7 @@ int main(int argc, char *argv[]) {
    // And then read the triangles 
    uint nTris;
    READ(nTris, 1);
-    Triangle *triangles = AllocAligned<Triangle>(nTris);
+    Triangle *triangles = new Triangle[nTris];
    for (uint i = 0; i < nTris; ++i) {
        // 9x floats for the 3 vertices
        float v[9];
--- a/examples/simple/.gitignore
+++ b/examples/simple/.gitignore
@@ -0,0 +1,2 @@
+simple
+objs
--- a/examples/simple/Makefile
+++ b/examples/simple/Makefile
@@ -1,8 +1,8 @@

-CXX=g++
+CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -O3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2
+ISPCFLAGS=-O2 --arch=x86-64

 default: simple

--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -38,15 +38,7 @@
 using namespace ispc;

 int main() {
-    // Pointers passed to ispc-compiled code are currently required to have
-    // alignment equal to the target's native vector size.  Here we align
-    // to 32 bytes to be safe for both SSE and AVX targets.
-#ifdef _MSC_VER
-    __declspec(align(32)) float vin[16], vout[16];
-#else
-    float vin[16] __attribute__((aligned(32)));
-    float vout[16] __attribute__((aligned(32)));
-#endif
+    float vin[16], vout[16];

    // Initialize input buffer
    for (int i = 0; i < 16; ++i)
--- a/expr.cpp
+++ b/expr.cpp
@@ -380,7 +380,7 @@ lLLVMConstantValue(const Type *type, llvm::LLVMContext *ctx, double value) {
        std::vector<llvm::Constant *> vals;
        for (unsigned int i = 0; i < lvt->getNumElements(); ++i)
            vals.push_back(constElement);
-        return llvm::ConstantVector::get(lvt, vals);
+	return llvm::ConstantVector::get(vals);
    }
    else {
        const llvm::ArrayType *lat = 
@@ -3203,6 +3203,7 @@ ConstExpr::ConstExpr(ConstExpr *old, double *v)
        break;
    case AtomicType::TYPE_INT64:
    case AtomicType::TYPE_UINT64:
+        // For now, this should never be reached 
        FATAL("fixme; we need another constructor so that we're not trying to pass "
               "double values to init an int64 type...");
    default:
--- a/failing_tests/masked-scatter-vector.ispc
+++ b/failing_tests/masked-scatter-vector.ispc
--- a/failing_tests/scatter-vector.ispc
+++ b/failing_tests/scatter-vector.ispc
--- a/ispc.vcxproj
+++ b/ispc.vcxproj
@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
@@ -179,7 +179,7 @@
      <PrecompiledHeader>NotUsing</PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>NOMINMAX</PreprocessorDefinitions>
+      <PreprocessorDefinitions>NOMINMAX;LLVM_2_9</PreprocessorDefinitions>
      <AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
      <DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
    </ClCompile>
@@ -197,7 +197,7 @@
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>NOMINMAX</PreprocessorDefinitions>
+      <PreprocessorDefinitions>NOMINMAX;LLVM_2_9</PreprocessorDefinitions>
      <AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
      <DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
    </ClCompile>
@@ -213,4 +213,4 @@
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
-</Project>
+</Project>
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -116,7 +116,7 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {

    for (int i = 0; i < target.vectorWidth; ++i)
        maskOnes.push_back(onMask);
-    LLVMMaskAllOn = llvm::ConstantVector::get(LLVMTypes::MaskType, maskOnes);
+    LLVMMaskAllOn = llvm::ConstantVector::get(maskOnes);

    std::vector<llvm::Constant *> maskZeros;
    llvm::Constant *offMask = NULL;
@@ -125,7 +125,7 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {

    for (int i = 0; i < target.vectorWidth; ++i)
        maskZeros.push_back(offMask);
-    LLVMMaskAllOff = llvm::ConstantVector::get(LLVMTypes::MaskType, maskZeros);
+    LLVMMaskAllOff = llvm::ConstantVector::get(maskZeros);
 }


@@ -174,7 +174,7 @@ LLVMInt32Vector(int32_t ival) {
    std::vector<llvm::Constant *> vals;
    for (int i = 0; i < g->target.vectorWidth; ++i)
        vals.push_back(v);
-    return llvm::ConstantVector::get(LLVMTypes::Int32VectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }


@@ -183,7 +183,7 @@ LLVMInt32Vector(const int32_t *ivec) {
    std::vector<llvm::Constant *> vals;
    for (int i = 0; i < g->target.vectorWidth; ++i)
        vals.push_back(LLVMInt32(ivec[i]));
-    return llvm::ConstantVector::get(LLVMTypes::Int32VectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }


@@ -193,7 +193,7 @@ LLVMUInt32Vector(uint32_t ival) {
    std::vector<llvm::Constant *> vals;
    for (int i = 0; i < g->target.vectorWidth; ++i)
        vals.push_back(v);
-    return llvm::ConstantVector::get(LLVMTypes::Int32VectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }


@@ -202,7 +202,7 @@ LLVMUInt32Vector(const uint32_t *ivec) {
    std::vector<llvm::Constant *> vals;
    for (int i = 0; i < g->target.vectorWidth; ++i)
        vals.push_back(LLVMUInt32(ivec[i]));
-    return llvm::ConstantVector::get(LLVMTypes::Int32VectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }


@@ -212,7 +212,7 @@ LLVMFloatVector(float fval) {
    std::vector<llvm::Constant *> vals;
    for (int i = 0; i < g->target.vectorWidth; ++i)
        vals.push_back(v);
-    return llvm::ConstantVector::get(LLVMTypes::FloatVectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }


@@ -221,7 +221,7 @@ LLVMFloatVector(const float *fvec) {
    std::vector<llvm::Constant *> vals;
    for (int i = 0; i < g->target.vectorWidth; ++i)
        vals.push_back(LLVMFloat(fvec[i]));
-    return llvm::ConstantVector::get(LLVMTypes::FloatVectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }


@@ -231,7 +231,7 @@ LLVMDoubleVector(double dval) {
    std::vector<llvm::Constant *> vals;
    for (int i = 0; i < g->target.vectorWidth; ++i)
        vals.push_back(v);
-    return llvm::ConstantVector::get(LLVMTypes::DoubleVectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }


@@ -240,7 +240,7 @@ LLVMDoubleVector(const double *dvec) {
    std::vector<llvm::Constant *> vals;
    for (int i = 0; i < g->target.vectorWidth; ++i)
        vals.push_back(LLVMDouble(dvec[i]));
-    return llvm::ConstantVector::get(LLVMTypes::DoubleVectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }


@@ -250,7 +250,7 @@ LLVMInt64Vector(int64_t ival) {
    std::vector<llvm::Constant *> vals;
    for (int i = 0; i < g->target.vectorWidth; ++i)
        vals.push_back(v);
-    return llvm::ConstantVector::get(LLVMTypes::Int64VectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }


@@ -259,7 +259,7 @@ LLVMInt64Vector(const int64_t *ivec) {
    std::vector<llvm::Constant *> vals;
    for (int i = 0; i < g->target.vectorWidth; ++i)
        vals.push_back(LLVMInt64(ivec[i]));
-    return llvm::ConstantVector::get(LLVMTypes::Int64VectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }


@@ -269,7 +269,7 @@ LLVMUInt64Vector(uint64_t ival) {
    std::vector<llvm::Constant *> vals;
    for (int i = 0; i < g->target.vectorWidth; ++i)
        vals.push_back(v);
-    return llvm::ConstantVector::get(LLVMTypes::Int64VectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }


@@ -278,7 +278,7 @@ LLVMUInt64Vector(const uint64_t *ivec) {
    std::vector<llvm::Constant *> vals;
    for (int i = 0; i < g->target.vectorWidth; ++i)
        vals.push_back(LLVMUInt64(ivec[i]));
-    return llvm::ConstantVector::get(LLVMTypes::Int64VectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }


@@ -297,7 +297,7 @@ LLVMBoolVector(bool b) {
    std::vector<llvm::Constant *> vals;
    for (int i = 0; i < g->target.vectorWidth; ++i)
        vals.push_back(v);
-    return llvm::ConstantVector::get(LLVMTypes::BoolVectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }


@@ -317,7 +317,7 @@ LLVMBoolVector(const bool *bvec) {

        vals.push_back(v);
    }
-    return llvm::ConstantVector::get(LLVMTypes::BoolVectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }


--- a/main.cpp
+++ b/main.cpp
@@ -38,6 +38,7 @@
 #include "ispc.h"
 #include "module.h"
 #include <stdio.h>
+#include <stdlib.h>
 #include <llvm/Support/PrettyStackTrace.h>
 #ifdef LLVM_2_8
 #include <llvm/System/Signals.h>
--- a/module.cpp
+++ b/module.cpp
@@ -82,7 +82,9 @@
 #ifndef LLVM_2_8
 #include <llvm/Support/ToolOutputFile.h>
 #include <llvm/Support/Host.h>
-#endif // !LLVM_2_8
+#else // !LLVM_2_8
+#include <llvm/System/Host.h>
+#endif // LLVM_2_8
 #include <llvm/Assembly/PrintModulePass.h>
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Bitcode/ReaderWriter.h>
@@ -184,7 +186,7 @@ Module::CompileFile() {
        FATAL("Need to implement code to run the preprocessor for windows"); 
 #else // ISPC_IS_WINDOWS
        char *cmd = NULL;
-        if (asprintf(&cmd, "/usr/bin/cpp -DISPC=1 -DPI=3.1415936535 %s %s", 
+        if (asprintf(&cmd, "/usr/bin/cpp -DISPC=1 -DPI=3.1415926536 %s %s", 
                     cppDefs.c_str(), filename ? filename : "-") == -1) {
            fprintf(stderr, "Unable to allocate memory in asprintf()?!\n");
            exit(1);
@@ -663,19 +665,12 @@ lEmitFunctionCode(FunctionEmitContext *ctx, llvm::Function *function,
                lCopyInTaskParameter(i, structParamPtr, decl, ctx);

        // Copy in the mask as well.
-        // FIXME: we may probably to check the mask at runtime and emit an
-        // 'all on' code path if it is all on, since that should be a
-        // common case.
-#if 1
        int nArgs = decl->functionArgs ? decl->functionArgs->size() : 0;
        // The mask is the last parameter in the argument structure
        llvm::Value *ptr = ctx->GetElementPtrInst(structParamPtr, 0, nArgs,
                                                  "task_struct_mask");
        llvm::Value *ptrval = ctx->LoadInst(ptr, NULL, "mask");
        ctx->SetEntryMask(ptrval);
-#else
-        Warning(funSym->pos, "Running task with all-on mask to start.");
-#endif

        // Copy threadIndex and threadCount into stack-allocated storage so
        // that their symbols point to something reasonable.
--- a/opt.cpp
+++ b/opt.cpp
@@ -1131,10 +1131,17 @@ MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
        }
        else if (maskAsInt == allOnMask) {
            // The mask is all on, so turn this into a regular store
-            const llvm::Type *ptrType = llvm::PointerType::get(rvalue->getType(), 0);
+            const llvm::Type *rvalueType = rvalue->getType();
+            const llvm::Type *ptrType = llvm::PointerType::get(rvalueType, 0);
+            // Need to update this when int8/int16 are added
+            int align = (called == pms32Func || called == pms64Func ||
+                         called == msb32Func) ? 4 : 8;
+
            lvalue = new llvm::BitCastInst(lvalue, ptrType, "lvalue_to_ptr_type", callInst);
            lCopyMetadata(lvalue, callInst);
-            llvm::Instruction *store = new llvm::StoreInst(rvalue, lvalue);
+            llvm::Instruction *store = 
+                new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
+                                    align);
            lCopyMetadata(store, callInst);
            llvm::ReplaceInstWithInst(callInst, store);

--- a/stdlib-avx.ll
+++ b/stdlib-avx.ll
@@ -513,14 +513,14 @@ declare <8 x float> @llvm.x86.avx.blendvps(<8 x float>, <8 x float>,
 define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
                                           <8 x i32>) nounwind alwaysinline {
  %mask_as_float = bitcast <8 x i32> %2 to <8 x float>
-  %oldValue = load <8 x i32>* %0
+  %oldValue = load <8 x i32>* %0, align 4
  %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
  %newAsFloat = bitcast <8 x i32> %1 to <8 x float>
  %blend = call <8 x float> @llvm.x86.avx.blendvps(<8 x float> %oldAsFloat,
                                                   <8 x float> %newAsFloat,
                                                   <8 x float> %mask_as_float)
  %blendAsInt = bitcast <8 x float> %blend to <8 x i32>
-  store <8 x i32> %blendAsInt, <8 x i32>* %0
+  store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4
  ret void
 }

--- a/stdlib-sse2.ll
+++ b/stdlib-sse2.ll
@@ -278,15 +278,15 @@ define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwa

 define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>, 
                                     <4 x i32> %mask) nounwind alwaysinline {
-  %val = load <4 x i32> * %0
+  %val = load <4 x i32> * %0, align 4
  %newval = call <4 x i32> @__vselect_i32(<4 x i32> %val, <4 x i32> %1, <4 x i32> %mask) 
-  store <4 x i32> %newval, <4 x i32> * %0
+  store <4 x i32> %newval, <4 x i32> * %0, align 4
  ret void
 }

 define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
                                     <4 x i32> %mask) nounwind alwaysinline {
-  %oldValue = load <4 x i64>* %ptr
+  %oldValue = load <4 x i64>* %ptr, align 8

  ; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
  ; are actually bitcast <2 x i64> values
@@ -322,7 +322,7 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
  ; reconstruct the final <4 x i64> vector
  %final = shufflevector <2 x i64> %result01, <2 x i64> %result23,
                         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  store <4 x i64> %final, <4 x i64> * %ptr
+  store <4 x i64> %final, <4 x i64> * %ptr, align 8
  ret void
 }

--- a/stdlib-sse4.ll
+++ b/stdlib-sse4.ll
@@ -188,21 +188,21 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
 define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>, 
                                     <4 x i32> %mask) nounwind alwaysinline {
  %mask_as_float = bitcast <4 x i32> %mask to <4 x float>
-  %oldValue = load <4 x i32>* %0
+  %oldValue = load <4 x i32>* %0, align 4
  %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
  %newAsFloat = bitcast <4 x i32> %1 to <4 x float>
  %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
                                                     <4 x float> %newAsFloat,
                                                     <4 x float> %mask_as_float)
  %blendAsInt = bitcast <4 x float> %blend to <4 x i32>
-  store <4 x i32> %blendAsInt, <4 x i32>* %0
+  store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4
  ret void
 }


 define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
                                     <4 x i32> %i32mask) nounwind alwaysinline {
-  %oldValue = load <4 x i64>* %ptr
+  %oldValue = load <4 x i64>* %ptr, align 8
  %mask = bitcast <4 x i32> %i32mask to <4 x float>

  ; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
@@ -243,6 +243,6 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
  ; reconstruct the final <4 x i64> vector
  %final = shufflevector <2 x i64> %result01, <2 x i64> %result23,
                         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  store <4 x i64> %final, <4 x i64> * %ptr
+  store <4 x i64> %final, <4 x i64> * %ptr, align 8
  ret void
 }
--- a/stdlib-sse4x2.ll
+++ b/stdlib-sse4x2.ll
@@ -566,7 +566,7 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
                <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %mask_b = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
                <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %oldValue = load <8 x i32>* %0
+  %oldValue = load <8 x i32>* %0, align 4
  %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
  %newAsFloat = bitcast <8 x i32> %1 to <8 x float>
  %old_a = shufflevector <8 x float> %oldAsFloat, <8 x float> undef,
@@ -584,7 +584,7 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
  %blend = shufflevector <4 x float> %blend_a, <4 x float> %blend_b,
               <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %blendAsInt = bitcast <8 x float> %blend to <8 x i32>
-  store <8 x i32> %blendAsInt, <8 x i32>* %0
+  store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4
  ret void
 }

@@ -595,7 +595,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,

  %mask_as_float = bitcast <8 x i32> %mask to <8 x float>

-  %old = load <8 x i64>* %ptr
+  %old = load <8 x i64>* %ptr, align 8

  ; set up the first two 64-bit values
  %old01 = shufflevector <8 x i64> %old, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
@@ -651,7 +651,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
       <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %final = shufflevector <4 x i64> %final0123, <4 x i64> %final4567,
       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  store <8 x i64> %final, <8 x i64> * %ptr
+  store <8 x i64> %final, <8 x i64> * %ptr, align 8
  ret void
 }

--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -1087,8 +1087,8 @@ static inline float atan2(float y, float x) {
    }
    else if (__math_lib == __math_lib_ispc || 
             __math_lib == __math_lib_ispc_fast) {
-        const float pi_vec = 3.1415927410125732421875;
-        const float pi_over_two_vec = 1.57079637050628662109375;
+        const float pi_vec = 3.1415926536;
+        const float pi_over_two_vec = 1.5707963267;
        // atan2(y, x) =
        //
        // atan2(y > 0, x = +-0) ->  Pi/2
--- a/stdlib.m4
+++ b/stdlib.m4
@@ -452,7 +452,7 @@ define internal <$1 x i32> @__load_uint16([0 x i32] *, i32 %offset) nounwind alw
  %ptr16 = bitcast [0 x i32] *%0 to i16 *
  %ptr = getelementptr i16 * %ptr16, i32 %offset
  %ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) *
-  %val = load i`'eval(16*$1) * %ptr64, align 1
+  %val = load i`'eval(16*$1) * %ptr64, align 2

  %vval = bitcast i`'eval(16*$1) %val to <$1 x i16>
  ; unsigned, so use zero-extent...
@@ -479,7 +479,7 @@ define internal void @__store_uint8([0 x i32] *, i32 %offset, <$1 x i32> %val32,
  %oldmasked = and i`'eval(8*$1) %old, %notmask
  %newmasked = and i`'eval(8*$1) %val64, %mask64
  %final = or i`'eval(8*$1) %oldmasked, %newmasked
-  store i`'eval(8*$1) %final, i`'eval(8*$1) * %ptr64
+  store i`'eval(8*$1) %final, i`'eval(8*$1) * %ptr64, align 1

  ret void
 }
@@ -498,11 +498,11 @@ define internal void @__store_uint16([0 x i32] *, i32 %offset, <$1 x i32> %val32
  %ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) *

  ;; as above, use mask to do blending with logical ops...
-  %old = load i`'eval(16*$1) * %ptr64, align 1
+  %old = load i`'eval(16*$1) * %ptr64, align 2
  %oldmasked = and i`'eval(16*$1) %old, %notmask
  %newmasked = and i`'eval(16*$1) %val64, %mask64
  %final = or i`'eval(16*$1) %oldmasked, %newmasked
-  store i`'eval(16*$1) %final, i`'eval(16*$1) * %ptr64
+  store i`'eval(16*$1) %final, i`'eval(16*$1) * %ptr64, align 2

  ret void
 }
@@ -544,7 +544,7 @@ all_on:
  ;; vector load
  %vecptr = bitcast i32 *%startptr to <$1 x i32> *
  %vec_load = load <$1 x i32> *%vecptr, align 4
-  store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr
+  store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr, align 4
  ret i32 $1

 not_all_on:
Author	SHA1	Message	Date
Matt Pharr	6cf4d7e216	Merge branch 'master' of /Users/mmp/git/ispc	2011-06-24 05:11:06 -07:00
Matt Pharr	865e430b56	Finished updating alignment issues for vector types; don't assume pointers are aligned to the natural vector width.	2011-06-23 18:51:15 -07:00
Matt Pharr	990bee5a86	Merge branch 'master' of github.com:ispc/ispc	2011-06-23 18:21:02 -07:00
Matt Pharr	b84167dddd	Fixed a number of issues related to memory alignment; a number of places were expecting vector-width-aligned pointers where in point of fact, there's no guarantee that they would have been in general. Removed the aligned memory allocation routines from some of the examples; they're no longer needed. No perf. difference on Core2/Core i5 CPUs; older CPUs may see some regressions. Still need to update the documentation for this change and finish reviewing alignment issues in Load/Store instructions generated by .cpp files.	2011-06-23 18:18:33 -07:00
Andreas Wendleder	f39d31174e	Follow LLVM API change.	2011-06-23 16:10:03 -07:00
Andreas Wendleder	39542f420a	Ignore built files.	2011-06-23 16:06:38 -07:00
Matt Pharr	d340dcbfcc	Modify makefile to print out llvm version and install directory it's using	2011-06-23 16:02:09 -07:00
Matt Pharr	e5bc6cd67c	Update examples/ Makefiles to make x86-64 explicit in compiler flags	2011-06-23 10:00:07 -07:00
Matt Pharr	40bd133dec	Add dependency to make sure that bison runs (to generate parse.hh) before we try to compile lex.cpp	2011-06-22 14:47:03 -07:00
Matt Pharr	2ced56736e	small comment changes, remove dead code	2011-06-22 14:38:49 -07:00
Matt Pharr	bf74a3360f	Merge pull request #24 from petecoup/master LLVM 2.8 mods	2011-06-22 12:03:19 -07:00
Matt Pharr	aaafdf80f2	Move two tests that are currently failing into failing_tests/	2011-06-22 05:28:23 -07:00
Matt Pharr	6086d3597c	Fix more instances of incorrect PI constants	2011-06-22 05:27:56 -07:00
ispc	a3fbb098ad	Merge pull request #1 from superoptimizer/master The value of pi used in Makefile was wrong in the 7th and 11th significant digits.	2011-06-22 05:18:32 -07:00
Mark Lacey	38d4ecccf4	Fix pi in two places 3.14159<2>653<6>.	2011-06-21 23:55:44 -07:00
Pete Couperus	af435e52c1	Minor mods to build on Fedora 15, LLVM 2.8	2011-06-21 22:57:36 -07:00
Matt Pharr	8b7522e98b	Always #define LLVM_2_9 on Windows builds	2011-06-21 15:02:44 -07:00
Matt Pharr	bffb380677	Rename readme file	2011-06-21 13:24:25 -07:00