Fixed a number of issues related to memory alignment: a number of places
were expecting vector-width-aligned pointers when, in point of fact,
there's no guarantee in general that they would have been aligned.

Removed the aligned memory allocation routines from some of the examples;
they're no longer needed.

No performance difference on Core2/Core i5 CPUs; older CPUs may see some
performance regressions.

Still need to update the documentation for this change and finish reviewing
alignment issues in Load/Store instructions generated by .cpp files.
This commit is contained in:
Matt Pharr
2011-06-23 18:18:33 -07:00
parent d340dcbfcc
commit b84167dddd
11 changed files with 45 additions and 112 deletions

View File

@@ -103,24 +103,6 @@ savePPM(const char *fname, int w, int h)
}
// Allocate `size` bytes of memory with 64-byte alignment.
//
// Returns NULL if the underlying allocator fails.  The way the result has
// to be released depends on the branch that produced it: memory from
// _aligned_malloc() is released with _aligned_free(); on __APPLE__ the
// pointer originally returned by malloc() is stored in the slot directly
// below the returned address (((void **)ptr)[-1]) and it is that pointer,
// not the returned one, that must be handed to free().
float *
AllocAligned(int size) {
#if defined(_WIN32) || defined(_WIN64)
    return (float *)_aligned_malloc(size, 64);
#elif defined (__APPLE__)
    // Allocate excess memory to ensure an aligned pointer can be returned:
    // up to 63 bytes of padding plus one pointer-sized bookkeeping slot.
    void *mem = malloc(size + (64-1) + sizeof(void*));
    if (mem == NULL)
        return NULL;
    char *amem = ((char*)mem) + sizeof(void*);
    // Round up to the next 64-byte boundary, advancing by zero when amem is
    // already aligned.  The padding above only covers 63 bytes, so moving
    // forward a full 64 would leave the caller one byte short of `size`.
    amem += (64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1))) & (64 - 1);
    // Remember the real allocation so that it can be freed later.
    ((void**)amem)[-1] = mem;
    return (float *)amem;
#else
    return (float *)memalign(64, size);
#endif
}
int main(int argc, char **argv)
{
if (argc != 4) {
@@ -136,8 +118,8 @@ int main(int argc, char **argv)
}
// Allocate space for output images
img = (unsigned char *)AllocAligned(width * height * 3);
fimg = (float *)AllocAligned(sizeof(float) * width * height * 3);
img = new unsigned char[width * height * 3];
fimg = new float[width * height * 3];
//
// Run the ispc path, test_iterations times, and report the minimum

View File

@@ -102,24 +102,6 @@ savePPM(const char *fname, int w, int h)
}
// Allocate `size` bytes of memory with 64-byte alignment.
//
// Returns NULL if the underlying allocator fails.  The way the result has
// to be released depends on the branch that produced it: memory from
// _aligned_malloc() is released with _aligned_free(); on __APPLE__ the
// pointer originally returned by malloc() is stored in the slot directly
// below the returned address (((void **)ptr)[-1]) and it is that pointer,
// not the returned one, that must be handed to free().
float *
AllocAligned(int size) {
#if defined(_WIN32) || defined(_WIN64)
    return (float *)_aligned_malloc(size, 64);
#elif defined (__APPLE__)
    // Over-allocate: up to 63 bytes of alignment padding plus one
    // pointer-sized slot that records the original allocation.
    void *mem = malloc(size + (64-1) + sizeof(void*));
    if (mem == NULL)
        return NULL;
    char *amem = ((char*)mem) + sizeof(void*);
    // Round up to the next 64-byte boundary, advancing by zero when amem is
    // already aligned.  The padding above only covers 63 bytes, so moving
    // forward a full 64 would leave the caller one byte short of `size`.
    amem += (64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1))) & (64 - 1);
    // Remember the real allocation so that it can be freed later.
    ((void**)amem)[-1] = mem;
    return (float *)amem;
#else
    return (float *)memalign(64, size);
#endif
}
int main(int argc, char **argv)
{
if (argc != 4) {
@@ -135,8 +117,8 @@ int main(int argc, char **argv)
}
// Allocate space for output images
img = (unsigned char *)AllocAligned(width * height * 3);
fimg = (float *)AllocAligned(sizeof(float) * width * height * 3);
img = new unsigned char[width * height * 3];
fimg = new float[width * height * 3];
ao_ispc(width, height, NSUBSAMPLES, fimg);

View File

@@ -37,9 +37,6 @@
#include <assert.h>
#include <math.h>
#include <algorithm>
#ifndef __APPLE__
#include <malloc.h>
#endif // !__APPLE__
using std::max;
#include "options_defs.h"
@@ -48,23 +45,6 @@ using std::max;
#include "options_ispc.h"
using namespace ispc;
// Allocate storage for `count` floats with 64-byte alignment.
//
// Returns NULL if the underlying allocator fails.  The way the result has
// to be released depends on the branch that produced it: memory from
// _aligned_malloc() is released with _aligned_free(); on __APPLE__ the
// pointer originally returned by malloc() is stored in the slot directly
// below the returned address (((void **)ptr)[-1]) and it is that pointer,
// not the returned one, that must be handed to free().
float *AllocFloats(int count) {
    // Keep the byte count in size_t: narrowing the product back to int
    // truncates large requests before they ever reach the allocator.
    size_t size = count * sizeof(float);
#if defined(_WIN32) || defined(_WIN64)
    return (float *)_aligned_malloc(size, 64);
#elif defined (__APPLE__)
    // Allocate excess memory to ensure an aligned pointer can be returned:
    // up to 63 bytes of padding plus one pointer-sized bookkeeping slot.
    void *mem = malloc(size + (64-1) + sizeof(void*));
    if (mem == NULL)
        return NULL;
    char *amem = ((char*)mem) + sizeof(void*);
    // Round up to the next 64-byte boundary, advancing by zero when amem is
    // already aligned.  The padding above only covers 63 bytes, so moving
    // forward a full 64 would leave the caller one byte short of `size`.
    amem += (64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1))) & (64 - 1);
    // Remember the real allocation so that it can be freed later.
    ((void**)amem)[-1] = mem;
    return (float *)amem;
#else
    return (float *)memalign(64, size);
#endif
}
extern void black_scholes_serial(float Sa[], float Xa[], float Ta[],
float ra[], float va[],
float result[], int count);
@@ -76,12 +56,12 @@ extern void binomial_put_serial(float Sa[], float Xa[], float Ta[],
int main() {
// Pointers passed to ispc code must have alignment of the target's
// vector width at minimum.
float *S = AllocFloats(N_OPTIONS);
float *X = AllocFloats(N_OPTIONS);
float *T = AllocFloats(N_OPTIONS);
float *r = AllocFloats(N_OPTIONS);
float *v = AllocFloats(N_OPTIONS);
float *result = AllocFloats(N_OPTIONS);
float *S = new float[N_OPTIONS];
float *X = new float[N_OPTIONS];
float *T = new float[N_OPTIONS];
float *r = new float[N_OPTIONS];
float *v = new float[N_OPTIONS];
float *result = new float[N_OPTIONS];
for (int i = 0; i < N_OPTIONS; ++i) {
S[i] = 100; // stock price

View File

@@ -43,9 +43,6 @@
#include <algorithm>
#include <assert.h>
#include <sys/types.h>
#ifndef __APPLE__
#include <malloc.h>
#endif
#include "../timing.h"
#include "rt_ispc.h"
@@ -53,23 +50,6 @@ using namespace ispc;
typedef unsigned int uint;
// Allocate raw storage for `count` objects of type T with 64-byte alignment.
// No constructors are run on the returned memory.
//
// Returns NULL if the underlying allocator fails.  The way the result has
// to be released depends on the branch that produced it: memory from
// _aligned_malloc() is released with _aligned_free(); on __APPLE__ the
// pointer originally returned by malloc() is stored in the slot directly
// below the returned address (((void **)ptr)[-1]) and it is that pointer,
// not the returned one, that must be handed to free().
template <typename T>
T *AllocAligned(int count) {
    // Keep the byte count in size_t: narrowing the product back to int
    // truncates large requests before they ever reach the allocator.
    size_t size = count * sizeof(T);
#if defined(_WIN32) || defined(_WIN64)
    return (T *)_aligned_malloc(size, 64);
#elif defined (__APPLE__)
    // Allocate excess memory to ensure an aligned pointer can be returned:
    // up to 63 bytes of padding plus one pointer-sized bookkeeping slot.
    void *mem = malloc(size + (64-1) + sizeof(void*));
    if (mem == NULL)
        return NULL;
    char *amem = ((char*)mem) + sizeof(void*);
    // Round up to the next 64-byte boundary, advancing by zero when amem is
    // already aligned.  The padding above only covers 63 bytes, so moving
    // forward a full 64 would leave the caller one byte short of `size`.
    amem += (64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1))) & (64 - 1);
    // Remember the real allocation so that it can be freed later.
    ((void**)amem)[-1] = mem;
    return (T *)amem;
#else
    return (T *)memalign(64, size);
#endif
}
extern void raytrace_serial(int width, int height, const float raster2camera[4][4],
const float camera2world[4][4], float image[],
int id[], const LinearBVHNode nodes[],
@@ -161,7 +141,7 @@ int main(int argc, char *argv[]) {
uint nNodes;
READ(nNodes, 1);
LinearBVHNode *nodes = AllocAligned<LinearBVHNode>(nNodes);
LinearBVHNode *nodes = new LinearBVHNode[nNodes];
for (unsigned int i = 0; i < nNodes; ++i) {
// Each node is 6x floats for a box, then an integer for an offset
// to the second child node, then an integer that encodes the type
@@ -181,7 +161,7 @@ int main(int argc, char *argv[]) {
// And then read the triangles
uint nTris;
READ(nTris, 1);
Triangle *triangles = AllocAligned<Triangle>(nTris);
Triangle *triangles = new Triangle[nTris];
for (uint i = 0; i < nTris; ++i) {
// 9x floats for the 3 vertices
float v[9];