18 Commits
v1.0 ... v1.0.1

Author SHA1 Message Date
Matt Pharr
6cf4d7e216 Merge branch 'master' of /Users/mmp/git/ispc 2011-06-24 05:11:06 -07:00
Matt Pharr
865e430b56 Finished updating alignment issues for vector types; don't assume pointers
are aligned to the natural vector width.
2011-06-23 18:51:15 -07:00
Matt Pharr
990bee5a86 Merge branch 'master' of github.com:ispc/ispc 2011-06-23 18:21:02 -07:00
Matt Pharr
b84167dddd Fixed a number of issues related to memory alignment; a number of places
were expecting vector-width-aligned pointers where in point of fact,
there's no guarantee that they would have been in general.

Removed the aligned memory allocation routines from some of the examples;
they're no longer needed.

No perf. difference on Core2/Core i5 CPUs; older CPUs may see some
regressions.

Still need to update the documentation for this change and finish reviewing
alignment issues in Load/Store instructions generated by .cpp files.
2011-06-23 18:18:33 -07:00
Andreas Wendleder
f39d31174e Follow LLVM API change. 2011-06-23 16:10:03 -07:00
Andreas Wendleder
39542f420a Ignore built files. 2011-06-23 16:06:38 -07:00
Matt Pharr
d340dcbfcc Modify makefile to print out llvm version and install directory it's using 2011-06-23 16:02:09 -07:00
Matt Pharr
e5bc6cd67c Update examples/ Makefiles to make x86-64 explicit in compiler flags 2011-06-23 10:00:07 -07:00
Matt Pharr
40bd133dec Add dependency to make sure that bison runs (to generate parse.hh) before we try to compile lex.cpp 2011-06-22 14:47:03 -07:00
Matt Pharr
2ced56736e small comment changes, remove dead code 2011-06-22 14:38:49 -07:00
Matt Pharr
bf74a3360f Merge pull request #24 from petecoup/master
LLVM 2.8 mods
2011-06-22 12:03:19 -07:00
Matt Pharr
aaafdf80f2 Move two tests that are currently failing into failing_tests/ 2011-06-22 05:28:23 -07:00
Matt Pharr
6086d3597c Fix more instances of incorrect PI constants 2011-06-22 05:27:56 -07:00
ispc
a3fbb098ad Merge pull request #1 from superoptimizer/master
The value of pi used in Makefile was wrong in the 7th and 11th significant digits.
2011-06-22 05:18:32 -07:00
Mark Lacey
38d4ecccf4 Fix pi in two places 3.14159<2>653<6>. 2011-06-21 23:55:44 -07:00
Pete Couperus
af435e52c1 Minor mods to build on Fedora 15, LLVM 2.8 2011-06-21 22:57:36 -07:00
Matt Pharr
8b7522e98b Always #define LLVM_2_9 on Windows builds 2011-06-21 15:02:44 -07:00
Matt Pharr
bffb380677 Rename readme file 2011-06-21 13:24:25 -07:00
33 changed files with 146 additions and 234 deletions

.gitignore (4 changes)

@@ -1,2 +1,6 @@
 *.pyc
 *~
+depend
+ispc
+ispc_test
+objs

Makefile

@@ -43,7 +43,7 @@ OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(STDLIB_SRC:.ll=.o) stdlib-c.o stdli
 default: ispc ispc_test
 
-.PHONY: dirs clean depend doxygen
+.PHONY: dirs clean depend doxygen print_llvm_src
 .PRECIOUS: objs/stdlib-%.cpp
 
 depend: $(CXX_SRC) $(HEADERS)
@@ -56,6 +56,9 @@ dirs:
 	@echo Creating objs/ directory
 	@/bin/mkdir -p objs
 
+print_llvm_src:
+	@echo Using LLVM `llvm-config --version` from `llvm-config --libdir`
+
 clean:
 	/bin/rm -rf objs ispc ispc_test
@@ -63,7 +66,7 @@ doxygen:
 	/bin/rm -rf docs/doxygen
 	doxygen doxygen.cfg
 
-ispc: dirs $(OBJS)
+ispc: print_llvm_src dirs $(OBJS)
 	@echo Creating ispc executable
 	@$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(LLVM_LIBS)
@@ -87,7 +90,7 @@ objs/lex.cpp: lex.ll
 	@echo Running flex on $<
 	@$(LEX) -o $@ $<
 
-objs/lex.o: objs/lex.cpp $(HEADERS)
+objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<
@@ -111,7 +114,7 @@ objs/stdlib-c.o: objs/stdlib-c.cpp
 objs/stdlib_ispc.cpp: stdlib.ispc
 	@echo Creating C++ source from $<
-	@$(CPP) -DISPC=1 -DPI=3.1415936535 $< | ./stdlib2cpp.py > $@
+	@$(CPP) -DISPC=1 -DPI=3.1415926536 $< | ./stdlib2cpp.py > $@
 
 objs/stdlib_ispc.o: objs/stdlib_ispc.cpp
 	@echo Compiling $<
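As an aside, the new print_llvm_src prerequisite makes builds self-describing: before linking ispc, make now reports which LLVM installation it found. The output is a single line of the form below; the version and path here are illustrative, not captured from a real build:

    Using LLVM 2.9 from /usr/local/lib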

ctx.cpp (40 changes)

@@ -1315,8 +1315,21 @@ FunctionEmitContext::LoadInst(llvm::Value *lvalue, const Type *type,
     if (llvm::isa<const llvm::PointerType>(lvalue->getType())) {
         // If the lvalue is a straight up regular pointer, then just issue
-        // a regular load
-        llvm::Instruction *inst = new llvm::LoadInst(lvalue, name ? name : "load", bblock);
+        // a regular load.  First figure out the alignment; in general we
+        // can just assume the natural alignment (0 here), but for varying
+        // atomic types, we need to make sure that the compiler emits
+        // unaligned vector loads, so we specify a reduced alignment here.
+        int align = 0;
+        const AtomicType *atomicType = dynamic_cast<const AtomicType *>(type);
+        if (atomicType != NULL && atomicType->IsVaryingType())
+            // We actually just want to align to the vector element
+            // alignment, but can't easily get that here, so just tell LLVM
+            // it's totally unaligned.  (This shouldn't make any difference
+            // vs the proper alignment in practice.)
+            align = 1;
+        llvm::Instruction *inst = new llvm::LoadInst(lvalue, name ? name : "load",
+                                                     false /* not volatile */,
+                                                     align, bblock);
         AddDebugPos(inst);
         return inst;
     }
@@ -1437,7 +1450,7 @@ FunctionEmitContext::addGSMetadata(llvm::Instruction *inst, SourcePos pos) {
     llvm::Value *line = LLVMInt32(pos.first_line);
 #ifdef LLVM_2_8
-    md = llvm::MDNode::get(*g->ctx, &first_line, 1);
+    md = llvm::MDNode::get(*g->ctx, &line, 1);
 #else
     md = llvm::MDNode::get(*g->ctx, line);
 #endif
@@ -1445,7 +1458,7 @@ FunctionEmitContext::addGSMetadata(llvm::Instruction *inst, SourcePos pos) {
     llvm::Value *column = LLVMInt32(pos.first_column);
 #ifdef LLVM_2_8
-    md = llvm::MDNode::get(*g->ctx, &first_column, 1);
+    md = llvm::MDNode::get(*g->ctx, &column, 1);
 #else
     md = llvm::MDNode::get(*g->ctx, column);
 #endif
@@ -1644,7 +1657,16 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
         return;
     }
 
-    llvm::Instruction *inst = new llvm::StoreInst(rvalue, lvalue, name, bblock);
+    llvm::Instruction *inst;
+    if (llvm::isa<llvm::VectorType>(rvalue->getType()))
+        // Specify an unaligned store, since we don't know that the lvalue
+        // will in fact be aligned to a vector width here.  (Actually
+        // should be aligned to the alignment of the vector elment type...)
+        inst = new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
+                                   1, bblock);
+    else
+        inst = new llvm::StoreInst(rvalue, lvalue, bblock);
     AddDebugPos(inst);
 }
@@ -1661,8 +1683,8 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
     // Figure out what kind of store we're doing here
     if (rvalueType->IsUniformType()) {
-        // The easy case; a regular store
-        llvm::Instruction *si = new llvm::StoreInst(rvalue, lvalue, name, bblock);
+        // The easy case; a regular store, natural alignment is fine
+        llvm::Instruction *si = new llvm::StoreInst(rvalue, lvalue, bblock);
         AddDebugPos(si);
     }
     else if (llvm::isa<const llvm::ArrayType>(lvalue->getType()))
@@ -1672,9 +1694,7 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
     else if (storeMask == LLVMMaskAllOn) {
         // Otherwise it is a masked store unless we can determine that the
         // mask is all on...
-        llvm::Instruction *si =
-            new llvm::StoreInst(rvalue, lvalue, name, bblock);
-        AddDebugPos(si);
+        StoreInst(rvalue, lvalue, name);
     }
     else
         maskedStore(rvalue, lvalue, rvalueType, storeMask);
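For readers unfamiliar with this corner of the LLVM 2.9-era API: the LoadInst/StoreInst constructors used above take an explicit alignment, where 0 means "use the type's natural alignment" and 1 means "assume no alignment at all", which is what forces unaligned vector instructions (movups rather than movaps on x86, for example). A minimal sketch of the convention, with a hypothetical helper name rather than ispc source:

    // Sketch only; emitLoad is not an ispc function.
    #include <llvm/Instructions.h>

    llvm::LoadInst *
    emitLoad(llvm::Value *ptr, bool maybeUnaligned, llvm::BasicBlock *bb) {
        // align == 0: natural alignment for the loaded type
        // align == 1: no alignment assumption; LLVM must emit unaligned loads
        unsigned align = maybeUnaligned ? 1 : 0;
        return new llvm::LoadInst(ptr, "load", /* isVolatile */ false, align, bb);
    }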


@@ -1970,7 +1970,7 @@ Data Layout
 In general, ``ispc`` tries to ensure that ``struct`` s and other complex
 datatypes are laid out in the same way in memory as they are in C/C++.
-Matching alignment is important for easy interoperability between C/C++
+Matching structure layout is important for easy interoperability between C/C++
 code and ``ispc`` code.
 
 The main complexity in sharing data between ``ispc`` and C/C++ often comes
@@ -2023,11 +2023,6 @@ It can pass ``array`` to a ``ispc`` function defined as:
     export void foo(uniform float array[], uniform int count)
 
-(Though the pointer must be aligned to the compilation target's natural
-vector width; see the discussion of alignment restrictions in `Data
-Alignment and Aliasing`_ and the aligned allocation routines in
-``examples/options/options.cpp`` for example.)
-
 Similarly, ``struct`` s from the application can have embedded pointers.
 This is handled with similar ``[]`` syntax:
@@ -2062,55 +2057,20 @@ vector types from C/C++ application code if possible.
 Data Alignment and Aliasing
 ---------------------------
 
-There are two important constraints that must be adhered to when passing
-pointers from the application to ``ispc`` programs.
+There are are two important constraints that must be adhered to when
+passing pointers from the application to ``ispc`` programs.
 
-The first constraint is alignment: any pointers from the host program that
-are passed to ``ispc`` must be aligned to natural vector alignment of
-system--for example, 16 byte alignment on a target that supports Intel®
-SSE, 32-byte on an Intel® AVX target.  If this constraint isn't met, the
-program may abort at runtime with an unaligned memory access error.
-
-For example, in a ``ispc`` function with the following declaration:
-
-::
-
-    export void foo(uniform float in[], uniform float out[],
-                    int count);
-
-If the application is passing stack-allocated arrays for ``in`` and
-``out``, these C/C++ compiler must be told to align these arrays.
-
-::
-
-    // MSVC, SSE target
-    __declspec(align(16)) float in[16], out[16];
-    foo(in, out, 16);
-
-With the gcc/clang compilers, the syntax for providing alignment is
-slightly different:
-
-::
-
-    float x[16] __attribute__ ((__align__(16)));
-    foo(in, out, 16);
-
-If the data being passed is dynamically allocated, the appropriate system
-aligned memory allocation routine should be used to allocate it (for
-example, ``_aligned_malloc()`` with Windows\*, ``memalign()`` with
-Linux\*; see the ``AllocAligned()`` function in ``examples/rt/rt.cpp`` for
-an example.)
-
-It is also required that it be valid to read memory at the first element of
-any array that is passed to ``ispc``.  In practice, this should just
-happen naturally, but it does mean that it is illegal to pass a ``NULL``
-pointer as a parameter to a ``ispc`` function called from the application.
-
-The second key constraint is that pointers and references in ``ispc``
-programs must not alias.  The ``ispc`` compiler assumes that different
-pointers can't end up pointing to the same memory location, either due to
-having the same initial value, or through array indexing in the program as
-it executed.
+The first is that it is required that it be valid to read memory at the
+first element of any array that is passed to ``ispc``.  In practice, this
+should just happen naturally, but it does mean that it is illegal to pass a
+``NULL`` pointer as a parameter to a ``ispc`` function called from the
+application.
+
+The second constraint is that pointers and references in ``ispc`` programs
+must not alias.  The ``ispc`` compiler assumes that different pointers
+can't end up pointing to the same memory location, either due to having the
+same initial value, or through array indexing in the program as it
+executed.
 
 This aliasing constraint also applies to ``reference`` parameters to
 functions.  Given a function like:
@@ -2127,8 +2087,8 @@ another case of aliasing, and if the caller calls the function as ``func(x,
 x)``, it's not guaranteed that the ``if`` test will evaluate to true, due
 to the compiler's requirement of no aliasing.
 
-(In the future, ``ispc`` will have the ability to work with unaligned
-memory as well as have a mechanism to indicate that pointers may alias.)
+(In the future, ``ispc`` will have a mechanism to indicate that pointers
+may alias.)
 
 Using ISPC Effectively
 ======================
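To make the aliasing rule concrete, here is a hypothetical C++ analogue (ours, not from the documentation): ispc in effect treats distinct pointer and reference parameters the way C compilers treat restrict-qualified pointers, so passing the same object twice invalidates the compiler's assumptions:

    #include <cstdio>

    // ispc would assume &a != &b, as if both were 'restrict'.
    void func(float &a, float &b) {
        a = 0.f;
        b = 1.f;
        // Under the no-aliasing assumption this test is always true;
        // with func(x, x) it need not be.
        if (a == 0.f)
            std::printf("no aliasing observed\n");
    }

    int main() {
        float x = 5.f, y = 7.f;
        func(x, y);    // fine: distinct storage
        // func(x, x); // would violate the constraint described above
        return 0;
    }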


@@ -1,8 +1,8 @@
-CXX=g++
+CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -O3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --fast-math
+ISPCFLAGS=-O2 --fast-math --arch=x86-64
 
 default: ao


@@ -103,24 +103,6 @@ savePPM(const char *fname, int w, int h)
 }
 
-// Allocate memory with 64-byte alignment.
-float *
-AllocAligned(int size) {
-#if defined(_WIN32) || defined(_WIN64)
-    return (float *)_aligned_malloc(size, 64);
-#elif defined (__APPLE__)
-    // Allocate excess memory to ensure an aligned pointer can be returned
-    void *mem = malloc(size + (64-1) + sizeof(void*));
-    char *amem = ((char*)mem) + sizeof(void*);
-    amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
-    ((void**)amem)[-1] = mem;
-    return (float *)amem;
-#else
-    return (float *)memalign(64, size);
-#endif
-}
-
 int main(int argc, char **argv)
 {
     if (argc != 4) {
@@ -136,8 +118,8 @@ int main(int argc, char **argv)
     }
 
     // Allocate space for output images
-    img = (unsigned char *)AllocAligned(width * height * 3);
-    fimg = (float *)AllocAligned(sizeof(float) * width * height * 3);
+    img = new unsigned char[width * height * 3];
+    fimg = new float[width * height * 3];
 
     //
     // Run the ispc path, test_iterations times, and report the minimum
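With the compiler now emitting unaligned vector loads and stores, plain new[] suffices here. If you are tuning for older CPUs where unaligned access is slow (the possible regression the commit message mentions), you can still over-align allocations yourself; a minimal sketch using POSIX posix_memalign, with a helper name of our own rather than the removed AllocAligned():

    #include <cstdlib>

    // Optional over-alignment; ispc no longer requires it.
    static float *allocFloats64(size_t count) {
        void *p = NULL;
        // 64 bytes matches what the deleted AllocAligned() provided.
        if (posix_memalign(&p, 64, count * sizeof(float)) != 0)
            return NULL;
        return static_cast<float *>(p);  // release with free(), not delete[]
    }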


@@ -1,8 +1,8 @@
-CXX=g++
+CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -g3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --fast-math --instrument
+ISPCFLAGS=-O2 --fast-math --instrument --arch=x86-64
 
 default: ao


@@ -102,24 +102,6 @@ savePPM(const char *fname, int w, int h)
 }
 
-// Allocate memory with 64-byte alignment.
-float *
-AllocAligned(int size) {
-#if defined(_WIN32) || defined(_WIN64)
-    return (float *)_aligned_malloc(size, 64);
-#elif defined (__APPLE__)
-    // Allocate excess memory to ensure an aligned pointer can be returned
-    void *mem = malloc(size + (64-1) + sizeof(void*));
-    char *amem = ((char*)mem) + sizeof(void*);
-    amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
-    ((void**)amem)[-1] = mem;
-    return (float *)amem;
-#else
-    return (float *)memalign(64, size);
-#endif
-}
-
 int main(int argc, char **argv)
 {
     if (argc != 4) {
@@ -135,8 +117,8 @@ int main(int argc, char **argv)
     }
 
     // Allocate space for output images
-    img = (unsigned char *)AllocAligned(width * height * 3);
-    fimg = (float *)AllocAligned(sizeof(float) * width * height * 3);
+    img = new unsigned char[width * height * 3];
+    fimg = new float[width * height * 3];
 
     ao_ispc(width, height, NSUBSAMPLES, fimg);

examples/mandelbrot/.gitignore (new file, 3 changes)

@@ -0,0 +1,3 @@
+mandelbrot
+*.ppm
+objs


@@ -1,8 +1,8 @@
-CXX=g++
+CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -O3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse4x2
+ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
 
 default: mandelbrot


@@ -11,10 +11,10 @@ endif
 TASK_OBJ=$(addprefix objs/, $(TASK_CXX:.cpp=.o))
 
-CXX=g++
+CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -O3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse4x2
+ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
 
 default: mandelbrot


@@ -1,8 +1,8 @@
-CXX=g++
+CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -g -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse4x2
+ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
 
 default: options


@@ -37,9 +37,6 @@
 #include <assert.h>
 #include <math.h>
 #include <algorithm>
-#ifndef __APPLE__
-#include <malloc.h>
-#endif // !__APPLE__
 
 using std::max;
 
 #include "options_defs.h"
@@ -48,23 +45,6 @@ using std::max;
 #include "options_ispc.h"
 using namespace ispc;
 
-// Allocate memory with 64-byte alignment.
-float *AllocFloats(int count) {
-    int size = count * sizeof(float);
-#if defined(_WIN32) || defined(_WIN64)
-    return (float *)_aligned_malloc(size, 64);
-#elif defined (__APPLE__)
-    // Allocate excess memory to ensure an aligned pointer can be returned
-    void *mem = malloc(size + (64-1) + sizeof(void*));
-    char *amem = ((char*)mem) + sizeof(void*);
-    amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
-    ((void**)amem)[-1] = mem;
-    return (float *)amem;
-#else
-    return (float *)memalign(64, size);
-#endif
-}
-
 extern void black_scholes_serial(float Sa[], float Xa[], float Ta[],
                                  float ra[], float va[],
                                  float result[], int count);
@@ -76,12 +56,12 @@ extern void binomial_put_serial(float Sa[], float Xa[], float Ta[],
 int main() {
     // Pointers passed to ispc code must have alignment of the target's
     // vector width at minimum.
-    float *S = AllocFloats(N_OPTIONS);
-    float *X = AllocFloats(N_OPTIONS);
-    float *T = AllocFloats(N_OPTIONS);
-    float *r = AllocFloats(N_OPTIONS);
-    float *v = AllocFloats(N_OPTIONS);
-    float *result = AllocFloats(N_OPTIONS);
+    float *S = new float[N_OPTIONS];
+    float *X = new float[N_OPTIONS];
+    float *T = new float[N_OPTIONS];
+    float *r = new float[N_OPTIONS];
+    float *v = new float[N_OPTIONS];
+    float *result = new float[N_OPTIONS];
 
     for (int i = 0; i < N_OPTIONS; ++i) {
         S[i] = 100;  // stock price


@@ -1,8 +1,8 @@
-CXX=g++
+CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -O3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2 --target=sse4x2
+ISPCFLAGS=-O2 --target=sse4x2 --arch=x86-64
 
 default: rt


@@ -43,9 +43,6 @@
 #include <algorithm>
 #include <assert.h>
 #include <sys/types.h>
-#ifndef __APPLE__
-#include <malloc.h>
-#endif
 
 #include "../timing.h"
 #include "rt_ispc.h"
@@ -53,23 +50,6 @@ using namespace ispc;
 typedef unsigned int uint;
 
-template <typename T>
-T *AllocAligned(int count) {
-    int size = count * sizeof(T);
-#if defined(_WIN32) || defined(_WIN64)
-    return (T *)_aligned_malloc(size, 64);
-#elif defined (__APPLE__)
-    // Allocate excess memory to ensure an aligned pointer can be returned
-    void *mem = malloc(size + (64-1) + sizeof(void*));
-    char *amem = ((char*)mem) + sizeof(void*);
-    amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
-    ((void**)amem)[-1] = mem;
-    return (T *)amem;
-#else
-    return (T *)memalign(64, size);
-#endif
-}
-
 extern void raytrace_serial(int width, int height, const float raster2camera[4][4],
                             const float camera2world[4][4], float image[],
                             int id[], const LinearBVHNode nodes[],
@@ -161,7 +141,7 @@ int main(int argc, char *argv[]) {
     uint nNodes;
     READ(nNodes, 1);
-    LinearBVHNode *nodes = AllocAligned<LinearBVHNode>(nNodes);
+    LinearBVHNode *nodes = new LinearBVHNode[nNodes];
     for (unsigned int i = 0; i < nNodes; ++i) {
         // Each node is 6x floats for a boox, then an integer for an offset
         // to the second child node, then an integer that encodes the type
@@ -181,7 +161,7 @@ int main(int argc, char *argv[]) {
     // And then read the triangles
     uint nTris;
     READ(nTris, 1);
-    Triangle *triangles = AllocAligned<Triangle>(nTris);
+    Triangle *triangles = new Triangle[nTris];
     for (uint i = 0; i < nTris; ++i) {
         // 9x floats for the 3 vertices
         float v[9];

examples/simple/.gitignore (new file, 2 changes)

@@ -0,0 +1,2 @@
+simple
+objs


@@ -1,8 +1,8 @@
-CXX=g++
+CXX=g++ -m64
 CXXFLAGS=-Iobjs/ -O3 -Wall
 ISPC=ispc
-ISPCFLAGS=-O2
+ISPCFLAGS=-O2 --arch=x86-64
 
 default: simple


@@ -38,15 +38,7 @@
 using namespace ispc;
 
 int main() {
-    // Pointers passed to ispc-compiled code are currently required to have
-    // alignment equal to the target's native vector size.  Here we align
-    // to 32 bytes to be safe for both SSE and AVX targets.
-#ifdef _MSC_VER
-    __declspec(align(32)) float vin[16], vout[16];
-#else
-    float vin[16] __attribute__((aligned(32)));
-    float vout[16] __attribute__((aligned(32)));
-#endif
+    float vin[16], vout[16];
 
     // Initialize input buffer
     for (int i = 0; i < 16; ++i)


@@ -380,7 +380,7 @@ lLLVMConstantValue(const Type *type, llvm::LLVMContext *ctx, double value) {
         std::vector<llvm::Constant *> vals;
         for (unsigned int i = 0; i < lvt->getNumElements(); ++i)
             vals.push_back(constElement);
-        return llvm::ConstantVector::get(lvt, vals);
+        return llvm::ConstantVector::get(vals);
     }
     else {
         const llvm::ArrayType *lat =
@@ -3203,6 +3203,7 @@ ConstExpr::ConstExpr(ConstExpr *old, double *v)
         break;
     case AtomicType::TYPE_INT64:
     case AtomicType::TYPE_UINT64:
+        // For now, this should never be reached
         FATAL("fixme; we need another constructor so that we're not trying to pass "
               "double values to init an int64 type...");
     default:


@@ -1,4 +1,4 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <ItemGroup Label="ProjectConfigurations">
     <ProjectConfiguration Include="Debug|Win32">
@@ -179,7 +179,7 @@
       <PrecompiledHeader>NotUsing</PrecompiledHeader>
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>NOMINMAX</PreprocessorDefinitions>
+      <PreprocessorDefinitions>NOMINMAX;LLVM_2_9</PreprocessorDefinitions>
       <AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
     </ClCompile>
@@ -197,7 +197,7 @@
       <Optimization>MaxSpeed</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>NOMINMAX</PreprocessorDefinitions>
+      <PreprocessorDefinitions>NOMINMAX;LLVM_2_9</PreprocessorDefinitions>
       <AdditionalIncludeDirectories>$(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
       <DisableSpecificWarnings>4146;4800;4996;4355;4624</DisableSpecificWarnings>
     </ClCompile>


@@ -116,7 +116,7 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
     for (int i = 0; i < target.vectorWidth; ++i)
         maskOnes.push_back(onMask);
-    LLVMMaskAllOn = llvm::ConstantVector::get(LLVMTypes::MaskType, maskOnes);
+    LLVMMaskAllOn = llvm::ConstantVector::get(maskOnes);
 
     std::vector<llvm::Constant *> maskZeros;
     llvm::Constant *offMask = NULL;
@@ -125,7 +125,7 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
     for (int i = 0; i < target.vectorWidth; ++i)
         maskZeros.push_back(offMask);
-    LLVMMaskAllOff = llvm::ConstantVector::get(LLVMTypes::MaskType, maskZeros);
+    LLVMMaskAllOff = llvm::ConstantVector::get(maskZeros);
 }
@@ -174,7 +174,7 @@ LLVMInt32Vector(int32_t ival) {
     std::vector<llvm::Constant *> vals;
     for (int i = 0; i < g->target.vectorWidth; ++i)
         vals.push_back(v);
-    return llvm::ConstantVector::get(LLVMTypes::Int32VectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }
@@ -183,7 +183,7 @@ LLVMInt32Vector(const int32_t *ivec) {
     std::vector<llvm::Constant *> vals;
     for (int i = 0; i < g->target.vectorWidth; ++i)
         vals.push_back(LLVMInt32(ivec[i]));
-    return llvm::ConstantVector::get(LLVMTypes::Int32VectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }
@@ -193,7 +193,7 @@ LLVMUInt32Vector(uint32_t ival) {
     std::vector<llvm::Constant *> vals;
     for (int i = 0; i < g->target.vectorWidth; ++i)
         vals.push_back(v);
-    return llvm::ConstantVector::get(LLVMTypes::Int32VectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }
@@ -202,7 +202,7 @@ LLVMUInt32Vector(const uint32_t *ivec) {
     std::vector<llvm::Constant *> vals;
     for (int i = 0; i < g->target.vectorWidth; ++i)
         vals.push_back(LLVMUInt32(ivec[i]));
-    return llvm::ConstantVector::get(LLVMTypes::Int32VectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }
@@ -212,7 +212,7 @@ LLVMFloatVector(float fval) {
     std::vector<llvm::Constant *> vals;
    for (int i = 0; i < g->target.vectorWidth; ++i)
         vals.push_back(v);
-    return llvm::ConstantVector::get(LLVMTypes::FloatVectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }
@@ -221,7 +221,7 @@ LLVMFloatVector(const float *fvec) {
     std::vector<llvm::Constant *> vals;
     for (int i = 0; i < g->target.vectorWidth; ++i)
         vals.push_back(LLVMFloat(fvec[i]));
-    return llvm::ConstantVector::get(LLVMTypes::FloatVectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }
@@ -231,7 +231,7 @@ LLVMDoubleVector(double dval) {
     std::vector<llvm::Constant *> vals;
     for (int i = 0; i < g->target.vectorWidth; ++i)
         vals.push_back(v);
-    return llvm::ConstantVector::get(LLVMTypes::DoubleVectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }
@@ -240,7 +240,7 @@ LLVMDoubleVector(const double *dvec) {
     std::vector<llvm::Constant *> vals;
     for (int i = 0; i < g->target.vectorWidth; ++i)
         vals.push_back(LLVMDouble(dvec[i]));
-    return llvm::ConstantVector::get(LLVMTypes::DoubleVectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }
@@ -250,7 +250,7 @@ LLVMInt64Vector(int64_t ival) {
     std::vector<llvm::Constant *> vals;
     for (int i = 0; i < g->target.vectorWidth; ++i)
         vals.push_back(v);
-    return llvm::ConstantVector::get(LLVMTypes::Int64VectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }
@@ -259,7 +259,7 @@ LLVMInt64Vector(const int64_t *ivec) {
     std::vector<llvm::Constant *> vals;
     for (int i = 0; i < g->target.vectorWidth; ++i)
         vals.push_back(LLVMInt64(ivec[i]));
-    return llvm::ConstantVector::get(LLVMTypes::Int64VectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }
@@ -269,7 +269,7 @@ LLVMUInt64Vector(uint64_t ival) {
     std::vector<llvm::Constant *> vals;
     for (int i = 0; i < g->target.vectorWidth; ++i)
         vals.push_back(v);
-    return llvm::ConstantVector::get(LLVMTypes::Int64VectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }
@@ -278,7 +278,7 @@ LLVMUInt64Vector(const uint64_t *ivec) {
     std::vector<llvm::Constant *> vals;
     for (int i = 0; i < g->target.vectorWidth; ++i)
         vals.push_back(LLVMUInt64(ivec[i]));
-    return llvm::ConstantVector::get(LLVMTypes::Int64VectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }
@@ -297,7 +297,7 @@ LLVMBoolVector(bool b) {
     std::vector<llvm::Constant *> vals;
     for (int i = 0; i < g->target.vectorWidth; ++i)
         vals.push_back(v);
-    return llvm::ConstantVector::get(LLVMTypes::BoolVectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }
@@ -317,7 +317,7 @@ LLVMBoolVector(const bool *bvec) {
         vals.push_back(v);
     }
 
-    return llvm::ConstantVector::get(LLVMTypes::BoolVectorType, vals);
+    return llvm::ConstantVector::get(vals);
 }
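Every hunk in this file follows from a single LLVM API change: in 2.9, ConstantVector::get() dropped its explicit VectorType parameter and infers the result type from the element constants instead. A compressed before/after sketch (helper name ours, header path as of LLVM 2.9):

    #include <vector>
    #include <llvm/Constants.h>

    // Build a width-wide splat of 'element'.
    static llvm::Constant *makeSplat(llvm::Constant *element, int width) {
        std::vector<llvm::Constant *> vals;
        for (int i = 0; i < width; ++i)
            vals.push_back(element);
        // LLVM 2.8: llvm::ConstantVector::get(vecType, vals);
        return llvm::ConstantVector::get(vals);  // LLVM 2.9: type is inferred
    }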


@@ -38,6 +38,7 @@
 #include "ispc.h"
 #include "module.h"
 #include <stdio.h>
+#include <stdlib.h>
 #include <llvm/Support/PrettyStackTrace.h>
 #ifdef LLVM_2_8
 #include <llvm/System/Signals.h>


@@ -82,7 +82,9 @@
 #ifndef LLVM_2_8
 #include <llvm/Support/ToolOutputFile.h>
 #include <llvm/Support/Host.h>
-#endif // !LLVM_2_8
+#else // !LLVM_2_8
+#include <llvm/System/Host.h>
+#endif // LLVM_2_8
 #include <llvm/Assembly/PrintModulePass.h>
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Bitcode/ReaderWriter.h>
@@ -184,7 +186,7 @@ Module::CompileFile() {
     FATAL("Need to implement code to run the preprocessor for windows");
 #else // ISPC_IS_WINDOWS
     char *cmd = NULL;
-    if (asprintf(&cmd, "/usr/bin/cpp -DISPC=1 -DPI=3.1415936535 %s %s",
+    if (asprintf(&cmd, "/usr/bin/cpp -DISPC=1 -DPI=3.1415926536 %s %s",
                  cppDefs.c_str(), filename ? filename : "-") == -1) {
         fprintf(stderr, "Unable to allocate memory in asprintf()?!\n");
         exit(1);
@@ -663,19 +665,12 @@ lEmitFunctionCode(FunctionEmitContext *ctx, llvm::Function *function,
         lCopyInTaskParameter(i, structParamPtr, decl, ctx);
 
     // Copy in the mask as well.
-    // FIXME: we may probably to check the mask at runtime and emit an
-    // 'all on' code path if it is all on, since that should be a
-    // common case.
-#if 1
     int nArgs = decl->functionArgs ? decl->functionArgs->size() : 0;
     // The mask is the last parameter in the argument structure
     llvm::Value *ptr = ctx->GetElementPtrInst(structParamPtr, 0, nArgs,
                                               "task_struct_mask");
     llvm::Value *ptrval = ctx->LoadInst(ptr, NULL, "mask");
     ctx->SetEntryMask(ptrval);
-#else
-    Warning(funSym->pos, "Running task with all-on mask to start.");
-#endif
 
     // Copy threadIndex and threadCount into stack-allocated storage so
     // that their symbols point to something reasonable.

opt.cpp (11 changes)

@@ -1131,10 +1131,17 @@ MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
         }
         else if (maskAsInt == allOnMask) {
             // The mask is all on, so turn this into a regular store
-            const llvm::Type *ptrType = llvm::PointerType::get(rvalue->getType(), 0);
+            const llvm::Type *rvalueType = rvalue->getType();
+            const llvm::Type *ptrType = llvm::PointerType::get(rvalueType, 0);
+            // Need to update this when int8/int16 are added
+            int align = (called == pms32Func || called == pms64Func ||
+                         called == msb32Func) ? 4 : 8;
             lvalue = new llvm::BitCastInst(lvalue, ptrType, "lvalue_to_ptr_type", callInst);
             lCopyMetadata(lvalue, callInst);
-            llvm::Instruction *store = new llvm::StoreInst(rvalue, lvalue);
+            llvm::Instruction *store =
+                new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
+                                    align);
             lCopyMetadata(store, callInst);
             llvm::ReplaceInstWithInst(callInst, store);
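The transformation is easier to see in scalar terms: a masked store writes only the lanes whose mask bit is set, so a mask that is statically all-on degenerates to a plain store with the element type's alignment. A reference model of the semantics in C++ (ours, not ispc code):

    #include <cstdint>

    // Reference semantics of a 32-bit masked store over 'width' lanes.
    static void maskedStore32(int32_t *dst, const int32_t *src,
                              uint32_t mask, int width) {
        for (int i = 0; i < width; ++i)
            if (mask & (1u << i))
                dst[i] = src[i];
        // If mask == (1u << width) - 1, every lane is written, which is
        // exactly the regular StoreInst the pass substitutes above.
    }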


@@ -513,14 +513,14 @@ declare <8 x float> @llvm.x86.avx.blendvps(<8 x float>, <8 x float>,
 define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
                                      <8 x i32>) nounwind alwaysinline {
   %mask_as_float = bitcast <8 x i32> %2 to <8 x float>
-  %oldValue = load <8 x i32>* %0
+  %oldValue = load <8 x i32>* %0, align 4
   %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
   %newAsFloat = bitcast <8 x i32> %1 to <8 x float>
   %blend = call <8 x float> @llvm.x86.avx.blendvps(<8 x float> %oldAsFloat,
                                                    <8 x float> %newAsFloat,
                                                    <8 x float> %mask_as_float)
   %blendAsInt = bitcast <8 x float> %blend to <8 x i32>
-  store <8 x i32> %blendAsInt, <8 x i32>* %0
+  store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4
   ret void
 }


@@ -278,15 +278,15 @@ define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwa
 define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
                                      <4 x i32> %mask) nounwind alwaysinline {
-  %val = load <4 x i32> * %0
+  %val = load <4 x i32> * %0, align 4
   %newval = call <4 x i32> @__vselect_i32(<4 x i32> %val, <4 x i32> %1, <4 x i32> %mask)
-  store <4 x i32> %newval, <4 x i32> * %0
+  store <4 x i32> %newval, <4 x i32> * %0, align 4
   ret void
 }
 
 define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
                                      <4 x i32> %mask) nounwind alwaysinline {
-  %oldValue = load <4 x i64>* %ptr
+  %oldValue = load <4 x i64>* %ptr, align 8
 
   ; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
   ; are actually bitcast <2 x i64> values
@@ -322,7 +322,7 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
   ; reconstruct the final <4 x i64> vector
   %final = shufflevector <2 x i64> %result01, <2 x i64> %result23,
                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  store <4 x i64> %final, <4 x i64> * %ptr
+  store <4 x i64> %final, <4 x i64> * %ptr, align 8
   ret void
 }


@@ -188,21 +188,21 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
 define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
                                      <4 x i32> %mask) nounwind alwaysinline {
   %mask_as_float = bitcast <4 x i32> %mask to <4 x float>
-  %oldValue = load <4 x i32>* %0
+  %oldValue = load <4 x i32>* %0, align 4
   %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
   %newAsFloat = bitcast <4 x i32> %1 to <4 x float>
   %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
                                                      <4 x float> %newAsFloat,
                                                      <4 x float> %mask_as_float)
   %blendAsInt = bitcast <4 x float> %blend to <4 x i32>
-  store <4 x i32> %blendAsInt, <4 x i32>* %0
+  store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4
   ret void
 }
 
 define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
                                      <4 x i32> %i32mask) nounwind alwaysinline {
-  %oldValue = load <4 x i64>* %ptr
+  %oldValue = load <4 x i64>* %ptr, align 8
   %mask = bitcast <4 x i32> %i32mask to <4 x float>
 
   ; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
@@ -243,6 +243,6 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
   ; reconstruct the final <4 x i64> vector
   %final = shufflevector <2 x i64> %result01, <2 x i64> %result23,
                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  store <4 x i64> %final, <4 x i64> * %ptr
+  store <4 x i64> %final, <4 x i64> * %ptr, align 8
   ret void
 }


@@ -566,7 +566,7 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %mask_b = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
                          <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %oldValue = load <8 x i32>* %0
+  %oldValue = load <8 x i32>* %0, align 4
   %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
   %newAsFloat = bitcast <8 x i32> %1 to <8 x float>
   %old_a = shufflevector <8 x float> %oldAsFloat, <8 x float> undef,
@@ -584,7 +584,7 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
   %blend = shufflevector <4 x float> %blend_a, <4 x float> %blend_b,
                          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %blendAsInt = bitcast <8 x float> %blend to <8 x i32>
-  store <8 x i32> %blendAsInt, <8 x i32>* %0
+  store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4
   ret void
 }
@@ -595,7 +595,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
   %mask_as_float = bitcast <8 x i32> %mask to <8 x float>
 
-  %old = load <8 x i64>* %ptr
+  %old = load <8 x i64>* %ptr, align 8
 
   ; set up the first two 64-bit values
   %old01 = shufflevector <8 x i64> %old, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
@@ -651,7 +651,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %final = shufflevector <4 x i64> %final0123, <4 x i64> %final4567,
                          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  store <8 x i64> %final, <8 x i64> * %ptr
+  store <8 x i64> %final, <8 x i64> * %ptr, align 8
   ret void
 }


@@ -1087,8 +1087,8 @@ static inline float atan2(float y, float x) {
     }
     else if (__math_lib == __math_lib_ispc ||
              __math_lib == __math_lib_ispc_fast) {
-        const float pi_vec = 3.1415927410125732421875;
-        const float pi_over_two_vec = 1.57079637050628662109375;
+        const float pi_vec = 3.1415926536;
+        const float pi_over_two_vec = 1.5707963267;
 
         // atan2(y, x) =
         //
         //    atan2(y > 0, x = +-0)  -> Pi/2
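For the record, the pi bug fixed across this release was a digit transposition: the -DPI define in the Makefile and module.cpp read 3.1415936535 instead of 3.1415926536 (pi correctly rounded to ten decimals), an error of roughly 1e-6, i.e. in the 7th significant digit, as the pull request noted; the atan2 constants above were switched to the same rounded value. A standalone check, independent of ispc:

    #include <cstdio>
    #include <cmath>

    int main() {
        std::printf("%.10f\n", M_PI);               // 3.1415926536
        std::printf("%g\n", 3.1415936535 - M_PI);   // ~1e-06 (the old define)
        std::printf("%g\n", 3.1415926536 - M_PI);   // ~1e-11 (the fix)
        return 0;
    }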


@@ -452,7 +452,7 @@ define internal <$1 x i32> @__load_uint16([0 x i32] *, i32 %offset) nounwind alw
   %ptr16 = bitcast [0 x i32] *%0 to i16 *
   %ptr = getelementptr i16 * %ptr16, i32 %offset
   %ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) *
-  %val = load i`'eval(16*$1) * %ptr64, align 1
+  %val = load i`'eval(16*$1) * %ptr64, align 2
   %vval = bitcast i`'eval(16*$1) %val to <$1 x i16>
 
   ; unsigned, so use zero-extent...
@@ -479,7 +479,7 @@ define internal void @__store_uint8([0 x i32] *, i32 %offset, <$1 x i32> %val32,
   %oldmasked = and i`'eval(8*$1) %old, %notmask
   %newmasked = and i`'eval(8*$1) %val64, %mask64
   %final = or i`'eval(8*$1) %oldmasked, %newmasked
-  store i`'eval(8*$1) %final, i`'eval(8*$1) * %ptr64
+  store i`'eval(8*$1) %final, i`'eval(8*$1) * %ptr64, align 1
   ret void
 }
@@ -498,11 +498,11 @@ define internal void @__store_uint16([0 x i32] *, i32 %offset, <$1 x i32> %val32
   %ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) *
 
   ;; as above, use mask to do blending with logical ops...
-  %old = load i`'eval(16*$1) * %ptr64, align 1
+  %old = load i`'eval(16*$1) * %ptr64, align 2
   %oldmasked = and i`'eval(16*$1) %old, %notmask
   %newmasked = and i`'eval(16*$1) %val64, %mask64
   %final = or i`'eval(16*$1) %oldmasked, %newmasked
-  store i`'eval(16*$1) %final, i`'eval(16*$1) * %ptr64
+  store i`'eval(16*$1) %final, i`'eval(16*$1) * %ptr64, align 2
   ret void
 }
@@ -544,7 +544,7 @@ all_on:
   ;; vector load
   %vecptr = bitcast i32 *%startptr to <$1 x i32> *
   %vec_load = load <$1 x i32> *%vecptr, align 4
-  store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr
+  store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr, align 4
   ret i32 $1
 
 not_all_on: