Fixed a number of issues related to memory alignment: several places
were expecting vector-width-aligned pointers when, in general, there is no guarantee that the pointers would be aligned. Removed the aligned memory allocation routines from some of the examples; they are no longer needed. No performance difference on Core 2/Core i5 CPUs; older CPUs may see some regressions. Still need to update the documentation for this change and to finish reviewing alignment issues in Load/Store instructions generated by .cpp files.
This commit is contained in:
@@ -103,24 +103,6 @@ savePPM(const char *fname, int w, int h)
|
||||
}
|
||||
|
||||
|
||||
// Allocate memory with 64-byte alignment.
|
||||
float *
|
||||
AllocAligned(int size) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
return (float *)_aligned_malloc(size, 64);
|
||||
#elif defined (__APPLE__)
|
||||
// Allocate excess memory to ensure an aligned pointer can be returned
|
||||
void *mem = malloc(size + (64-1) + sizeof(void*));
|
||||
char *amem = ((char*)mem) + sizeof(void*);
|
||||
amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
|
||||
((void**)amem)[-1] = mem;
|
||||
return (float *)amem;
|
||||
#else
|
||||
return (float *)memalign(64, size);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
if (argc != 4) {
|
||||
@@ -136,8 +118,8 @@ int main(int argc, char **argv)
|
||||
}
|
||||
|
||||
// Allocate space for output images
|
||||
img = (unsigned char *)AllocAligned(width * height * 3);
|
||||
fimg = (float *)AllocAligned(sizeof(float) * width * height * 3);
|
||||
img = new unsigned char[width * height * 3];
|
||||
fimg = new float[width * height * 3];
|
||||
|
||||
//
|
||||
// Run the ispc path, test_iterations times, and report the minimum
|
||||
|
||||
@@ -102,24 +102,6 @@ savePPM(const char *fname, int w, int h)
|
||||
}
|
||||
|
||||
|
||||
// Allocate memory with 64-byte alignment.
|
||||
float *
|
||||
AllocAligned(int size) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
return (float *)_aligned_malloc(size, 64);
|
||||
#elif defined (__APPLE__)
|
||||
// Allocate excess memory to ensure an aligned pointer can be returned
|
||||
void *mem = malloc(size + (64-1) + sizeof(void*));
|
||||
char *amem = ((char*)mem) + sizeof(void*);
|
||||
amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
|
||||
((void**)amem)[-1] = mem;
|
||||
return (float *)amem;
|
||||
#else
|
||||
return (float *)memalign(64, size);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
if (argc != 4) {
|
||||
@@ -135,8 +117,8 @@ int main(int argc, char **argv)
|
||||
}
|
||||
|
||||
// Allocate space for output images
|
||||
img = (unsigned char *)AllocAligned(width * height * 3);
|
||||
fimg = (float *)AllocAligned(sizeof(float) * width * height * 3);
|
||||
img = new unsigned char[width * height * 3];
|
||||
fimg = new float[width * height * 3];
|
||||
|
||||
ao_ispc(width, height, NSUBSAMPLES, fimg);
|
||||
|
||||
|
||||
@@ -37,9 +37,6 @@
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <algorithm>
|
||||
#ifndef __APPLE__
|
||||
#include <malloc.h>
|
||||
#endif // !__APPLE__
|
||||
using std::max;
|
||||
|
||||
#include "options_defs.h"
|
||||
@@ -48,23 +45,6 @@ using std::max;
|
||||
#include "options_ispc.h"
|
||||
using namespace ispc;
|
||||
|
||||
// Allocate memory with 64-byte alignment.
|
||||
float *AllocFloats(int count) {
|
||||
int size = count * sizeof(float);
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
return (float *)_aligned_malloc(size, 64);
|
||||
#elif defined (__APPLE__)
|
||||
// Allocate excess memory to ensure an aligned pointer can be returned
|
||||
void *mem = malloc(size + (64-1) + sizeof(void*));
|
||||
char *amem = ((char*)mem) + sizeof(void*);
|
||||
amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
|
||||
((void**)amem)[-1] = mem;
|
||||
return (float *)amem;
|
||||
#else
|
||||
return (float *)memalign(64, size);
|
||||
#endif
|
||||
}
|
||||
|
||||
extern void black_scholes_serial(float Sa[], float Xa[], float Ta[],
|
||||
float ra[], float va[],
|
||||
float result[], int count);
|
||||
@@ -76,12 +56,12 @@ extern void binomial_put_serial(float Sa[], float Xa[], float Ta[],
|
||||
int main() {
|
||||
// Pointers passed to ispc code must have alignment of the target's
|
||||
// vector width at minimum.
|
||||
float *S = AllocFloats(N_OPTIONS);
|
||||
float *X = AllocFloats(N_OPTIONS);
|
||||
float *T = AllocFloats(N_OPTIONS);
|
||||
float *r = AllocFloats(N_OPTIONS);
|
||||
float *v = AllocFloats(N_OPTIONS);
|
||||
float *result = AllocFloats(N_OPTIONS);
|
||||
float *S = new float[N_OPTIONS];
|
||||
float *X = new float[N_OPTIONS];
|
||||
float *T = new float[N_OPTIONS];
|
||||
float *r = new float[N_OPTIONS];
|
||||
float *v = new float[N_OPTIONS];
|
||||
float *result = new float[N_OPTIONS];
|
||||
|
||||
for (int i = 0; i < N_OPTIONS; ++i) {
|
||||
S[i] = 100; // stock price
|
||||
|
||||
@@ -43,9 +43,6 @@
|
||||
#include <algorithm>
|
||||
#include <assert.h>
|
||||
#include <sys/types.h>
|
||||
#ifndef __APPLE__
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
#include "../timing.h"
|
||||
#include "rt_ispc.h"
|
||||
|
||||
@@ -53,23 +50,6 @@ using namespace ispc;
|
||||
|
||||
typedef unsigned int uint;
|
||||
|
||||
template <typename T>
|
||||
T *AllocAligned(int count) {
|
||||
int size = count * sizeof(T);
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
return (T *)_aligned_malloc(size, 64);
|
||||
#elif defined (__APPLE__)
|
||||
// Allocate excess memory to ensure an aligned pointer can be returned
|
||||
void *mem = malloc(size + (64-1) + sizeof(void*));
|
||||
char *amem = ((char*)mem) + sizeof(void*);
|
||||
amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
|
||||
((void**)amem)[-1] = mem;
|
||||
return (T *)amem;
|
||||
#else
|
||||
return (T *)memalign(64, size);
|
||||
#endif
|
||||
}
|
||||
|
||||
extern void raytrace_serial(int width, int height, const float raster2camera[4][4],
|
||||
const float camera2world[4][4], float image[],
|
||||
int id[], const LinearBVHNode nodes[],
|
||||
@@ -161,7 +141,7 @@ int main(int argc, char *argv[]) {
|
||||
uint nNodes;
|
||||
READ(nNodes, 1);
|
||||
|
||||
LinearBVHNode *nodes = AllocAligned<LinearBVHNode>(nNodes);
|
||||
LinearBVHNode *nodes = new LinearBVHNode[nNodes];
|
||||
for (unsigned int i = 0; i < nNodes; ++i) {
|
||||
// Each node is 6x floats for a boox, then an integer for an offset
|
||||
// to the second child node, then an integer that encodes the type
|
||||
@@ -181,7 +161,7 @@ int main(int argc, char *argv[]) {
|
||||
// And then read the triangles
|
||||
uint nTris;
|
||||
READ(nTris, 1);
|
||||
Triangle *triangles = AllocAligned<Triangle>(nTris);
|
||||
Triangle *triangles = new Triangle[nTris];
|
||||
for (uint i = 0; i < nTris; ++i) {
|
||||
// 9x floats for the 3 vertices
|
||||
float v[9];
|
||||
|
||||
Reference in New Issue
Block a user