Fixed a number of memory alignment issues: several places assumed
vector-width-aligned pointers, even though in general there's no
guarantee that the pointers passed in will be aligned that way.

Removed the aligned memory allocation routines from some of the examples;
they're no longer needed.

No performance difference on Core 2 / Core i5 CPUs; older CPUs may see some
regressions.

Still need to update the documentation for this change and finish reviewing
alignment issues in Load/Store instructions generated by .cpp files.
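
To illustrate what this means for application code (a sketch only: the helper
name is made up, and the ao_ispc() prototype is spelled out here for
self-containment; in the aobench example below it comes from the generated
ispc header), plain heap allocation is now sufficient for buffers handed to
ispc-generated functions:

    // Sketch: element alignment from operator new[] is enough now; no
    // 64-byte-aligned allocation is required for the output image buffer.
    void ao_ispc(int w, int h, int nsubsamples, float image[]);

    void renderFrame(int width, int height) {
        float *fimg = new float[width * height * 3];
        ao_ispc(width, height, 2, fimg);
        delete[] fimg;
    }
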
Matt Pharr
2011-06-23 18:18:33 -07:00
parent d340dcbfcc
commit b84167dddd
11 changed files with 45 additions and 112 deletions


@@ -1644,7 +1644,8 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
         return;
     }
-    llvm::Instruction *inst = new llvm::StoreInst(rvalue, lvalue, name, bblock);
+    llvm::Instruction *inst = new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
+                                                  4, bblock);
     AddDebugPos(inst);
 }
@@ -1662,7 +1663,8 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
     // Figure out what kind of store we're doing here
     if (rvalueType->IsUniformType()) {
         // The easy case; a regular store
-        llvm::Instruction *si = new llvm::StoreInst(rvalue, lvalue, name, bblock);
+        llvm::Instruction *si = new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
+                                                    4, bblock);
         AddDebugPos(si);
     }
     else if (llvm::isa<const llvm::ArrayType>(lvalue->getType()))
@@ -1673,7 +1675,7 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue,
         // Otherwise it is a masked store unless we can determine that the
         // mask is all on...
         llvm::Instruction *si =
-            new llvm::StoreInst(rvalue, lvalue, name, bblock);
+            new llvm::StoreInst(rvalue, lvalue, false /*not volatile*/, 4, bblock);
         AddDebugPos(si);
     }
     else

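For reference, a minimal sketch of the emission pattern used in the hunks
above, assuming the LLVM C++ API of this era (a StoreInst constructor taking
value, pointer, isVolatile, alignment, and insertion block; the header paths
and the helper name are assumptions for illustration). The explicit alignment
of 4 tells LLVM that only element alignment can be assumed for the
destination, so it won't emit vector-width-aligned store instructions:

    #include "llvm/Instructions.h"    // assumed header location for this LLVM version
    #include "llvm/BasicBlock.h"

    // Emit a store that assumes only 4-byte (element) alignment of the
    // destination pointer, rather than full vector-width alignment.
    static llvm::Instruction *
    emitElementAlignedStore(llvm::Value *rvalue, llvm::Value *lvalue,
                            llvm::BasicBlock *bblock) {
        return new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
                                   4 /* alignment in bytes */, bblock);
    }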

@@ -103,24 +103,6 @@ savePPM(const char *fname, int w, int h)
 }
-// Allocate memory with 64-byte alignment.
-float *
-AllocAligned(int size) {
-#if defined(_WIN32) || defined(_WIN64)
-    return (float *)_aligned_malloc(size, 64);
-#elif defined (__APPLE__)
-    // Allocate excess memory to ensure an aligned pointer can be returned
-    void *mem = malloc(size + (64-1) + sizeof(void*));
-    char *amem = ((char*)mem) + sizeof(void*);
-    amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
-    ((void**)amem)[-1] = mem;
-    return (float *)amem;
-#else
-    return (float *)memalign(64, size);
-#endif
-}
 int main(int argc, char **argv)
 {
     if (argc != 4) {
@@ -136,8 +118,8 @@ int main(int argc, char **argv)
     }
     // Allocate space for output images
-    img = (unsigned char *)AllocAligned(width * height * 3);
-    fimg = (float *)AllocAligned(sizeof(float) * width * height * 3);
+    img = new unsigned char[width * height * 3];
+    fimg = new float[width * height * 3];
     //
     // Run the ispc path, test_iterations times, and report the minimum


@@ -102,24 +102,6 @@ savePPM(const char *fname, int w, int h)
 }
-// Allocate memory with 64-byte alignment.
-float *
-AllocAligned(int size) {
-#if defined(_WIN32) || defined(_WIN64)
-    return (float *)_aligned_malloc(size, 64);
-#elif defined (__APPLE__)
-    // Allocate excess memory to ensure an aligned pointer can be returned
-    void *mem = malloc(size + (64-1) + sizeof(void*));
-    char *amem = ((char*)mem) + sizeof(void*);
-    amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
-    ((void**)amem)[-1] = mem;
-    return (float *)amem;
-#else
-    return (float *)memalign(64, size);
-#endif
-}
 int main(int argc, char **argv)
 {
     if (argc != 4) {
@@ -135,8 +117,8 @@ int main(int argc, char **argv)
     }
     // Allocate space for output images
-    img = (unsigned char *)AllocAligned(width * height * 3);
-    fimg = (float *)AllocAligned(sizeof(float) * width * height * 3);
+    img = new unsigned char[width * height * 3];
+    fimg = new float[width * height * 3];
     ao_ispc(width, height, NSUBSAMPLES, fimg);


@@ -37,9 +37,6 @@
 #include <assert.h>
 #include <math.h>
 #include <algorithm>
-#ifndef __APPLE__
-#include <malloc.h>
-#endif // !__APPLE__
 using std::max;
 #include "options_defs.h"
@@ -48,23 +45,6 @@ using std::max;
 #include "options_ispc.h"
 using namespace ispc;
-// Allocate memory with 64-byte alignment.
-float *AllocFloats(int count) {
-    int size = count * sizeof(float);
-#if defined(_WIN32) || defined(_WIN64)
-    return (float *)_aligned_malloc(size, 64);
-#elif defined (__APPLE__)
-    // Allocate excess memory to ensure an aligned pointer can be returned
-    void *mem = malloc(size + (64-1) + sizeof(void*));
-    char *amem = ((char*)mem) + sizeof(void*);
-    amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
-    ((void**)amem)[-1] = mem;
-    return (float *)amem;
-#else
-    return (float *)memalign(64, size);
-#endif
-}
 extern void black_scholes_serial(float Sa[], float Xa[], float Ta[],
                                  float ra[], float va[],
                                  float result[], int count);
@@ -76,12 +56,12 @@ extern void binomial_put_serial(float Sa[], float Xa[], float Ta[],
 int main() {
     // Pointers passed to ispc code must have alignment of the target's
     // vector width at minimum.
-    float *S = AllocFloats(N_OPTIONS);
-    float *X = AllocFloats(N_OPTIONS);
-    float *T = AllocFloats(N_OPTIONS);
-    float *r = AllocFloats(N_OPTIONS);
-    float *v = AllocFloats(N_OPTIONS);
-    float *result = AllocFloats(N_OPTIONS);
+    float *S = new float[N_OPTIONS];
+    float *X = new float[N_OPTIONS];
+    float *T = new float[N_OPTIONS];
+    float *r = new float[N_OPTIONS];
+    float *v = new float[N_OPTIONS];
+    float *result = new float[N_OPTIONS];
     for (int i = 0; i < N_OPTIONS; ++i) {
         S[i] = 100; // stock price


@@ -43,9 +43,6 @@
 #include <algorithm>
 #include <assert.h>
 #include <sys/types.h>
-#ifndef __APPLE__
-#include <malloc.h>
-#endif
 #include "../timing.h"
 #include "rt_ispc.h"
@@ -53,23 +50,6 @@ using namespace ispc;
 typedef unsigned int uint;
-template <typename T>
-T *AllocAligned(int count) {
-    int size = count * sizeof(T);
-#if defined(_WIN32) || defined(_WIN64)
-    return (T *)_aligned_malloc(size, 64);
-#elif defined (__APPLE__)
-    // Allocate excess memory to ensure an aligned pointer can be returned
-    void *mem = malloc(size + (64-1) + sizeof(void*));
-    char *amem = ((char*)mem) + sizeof(void*);
-    amem += 64 - (reinterpret_cast<uint64_t>(amem) & (64 - 1));
-    ((void**)amem)[-1] = mem;
-    return (T *)amem;
-#else
-    return (T *)memalign(64, size);
-#endif
-}
 extern void raytrace_serial(int width, int height, const float raster2camera[4][4],
                             const float camera2world[4][4], float image[],
                             int id[], const LinearBVHNode nodes[],
@@ -161,7 +141,7 @@ int main(int argc, char *argv[]) {
     uint nNodes;
     READ(nNodes, 1);
-    LinearBVHNode *nodes = AllocAligned<LinearBVHNode>(nNodes);
+    LinearBVHNode *nodes = new LinearBVHNode[nNodes];
     for (unsigned int i = 0; i < nNodes; ++i) {
         // Each node is 6x floats for a boox, then an integer for an offset
         // to the second child node, then an integer that encodes the type
@@ -181,7 +161,7 @@ int main(int argc, char *argv[]) {
     // And then read the triangles
     uint nTris;
     READ(nTris, 1);
-    Triangle *triangles = AllocAligned<Triangle>(nTris);
+    Triangle *triangles = new Triangle[nTris];
     for (uint i = 0; i < nTris; ++i) {
         // 9x floats for the 3 vertices
         float v[9];

opt.cpp

@@ -1131,10 +1131,17 @@ MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) {
         }
         else if (maskAsInt == allOnMask) {
             // The mask is all on, so turn this into a regular store
-            const llvm::Type *ptrType = llvm::PointerType::get(rvalue->getType(), 0);
+            const llvm::Type *rvalueType = rvalue->getType();
+            const llvm::Type *ptrType = llvm::PointerType::get(rvalueType, 0);
+            // Need to update this when int8/int16 are added
+            int align = (called == pms32Func || called == pms64Func ||
+                         called == msb32Func) ? 4 : 8;
             lvalue = new llvm::BitCastInst(lvalue, ptrType, "lvalue_to_ptr_type", callInst);
             lCopyMetadata(lvalue, callInst);
-            llvm::Instruction *store = new llvm::StoreInst(rvalue, lvalue);
+            llvm::Instruction *store =
+                new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
+                                    align);
             lCopyMetadata(store, callInst);
             llvm::ReplaceInstWithInst(callInst, store);

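The opt.cpp change keys the store alignment off which masked-store built-in
was matched: 4 bytes for the 32-bit variants, 8 bytes for the 64-bit one.
A standalone sketch of the idea behind the all-on fast path, written in plain
scalar C++ rather than as the actual compiler pass (function and variable
names are made up):

    #include <cstdint>
    #include <cstring>

    // If every lane of the mask is on, a masked store is just an ordinary
    // store; the destination is only guaranteed the element type's natural
    // alignment (4 bytes here), not the vector width's.
    void maskedStoreInt32(int32_t *dst, const int32_t *src,
                          const uint32_t *mask, int lanes) {
        bool allOn = true;
        for (int i = 0; i < lanes; ++i)
            allOn = allOn && (mask[i] == 0xffffffffu);
        if (allOn) {
            std::memcpy(dst, src, lanes * sizeof(int32_t));  // plain unmasked store
            return;
        }
        for (int i = 0; i < lanes; ++i)    // otherwise blend lane by lane
            if (mask[i])
                dst[i] = src[i];
    }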

@@ -513,14 +513,14 @@ declare <8 x float> @llvm.x86.avx.blendvps(<8 x float>, <8 x float>,
 define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
                                      <8 x i32>) nounwind alwaysinline {
   %mask_as_float = bitcast <8 x i32> %2 to <8 x float>
-  %oldValue = load <8 x i32>* %0
+  %oldValue = load <8 x i32>* %0, align 4
   %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
   %newAsFloat = bitcast <8 x i32> %1 to <8 x float>
   %blend = call <8 x float> @llvm.x86.avx.blendvps(<8 x float> %oldAsFloat,
                                                    <8 x float> %newAsFloat,
                                                    <8 x float> %mask_as_float)
   %blendAsInt = bitcast <8 x float> %blend to <8 x i32>
-  store <8 x i32> %blendAsInt, <8 x i32>* %0
+  store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4
   ret void
 }


@@ -278,15 +278,15 @@ define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwa
 define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
                                      <4 x i32> %mask) nounwind alwaysinline {
-  %val = load <4 x i32> * %0
+  %val = load <4 x i32> * %0, align 4
   %newval = call <4 x i32> @__vselect_i32(<4 x i32> %val, <4 x i32> %1, <4 x i32> %mask)
-  store <4 x i32> %newval, <4 x i32> * %0
+  store <4 x i32> %newval, <4 x i32> * %0, align 4
   ret void
 }
 define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
                                      <4 x i32> %mask) nounwind alwaysinline {
-  %oldValue = load <4 x i64>* %ptr
+  %oldValue = load <4 x i64>* %ptr, align 8
   ; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
   ; are actually bitcast <2 x i64> values
@@ -322,7 +322,7 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
   ; reconstruct the final <4 x i64> vector
   %final = shufflevector <2 x i64> %result01, <2 x i64> %result23,
                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  store <4 x i64> %final, <4 x i64> * %ptr
+  store <4 x i64> %final, <4 x i64> * %ptr, align 8
   ret void
 }


@@ -188,21 +188,21 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
 define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>,
                                      <4 x i32> %mask) nounwind alwaysinline {
   %mask_as_float = bitcast <4 x i32> %mask to <4 x float>
-  %oldValue = load <4 x i32>* %0
+  %oldValue = load <4 x i32>* %0, align 4
   %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
   %newAsFloat = bitcast <4 x i32> %1 to <4 x float>
   %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
                                                      <4 x float> %newAsFloat,
                                                      <4 x float> %mask_as_float)
   %blendAsInt = bitcast <4 x float> %blend to <4 x i32>
-  store <4 x i32> %blendAsInt, <4 x i32>* %0
+  store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4
   ret void
 }
 define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
                                      <4 x i32> %i32mask) nounwind alwaysinline {
-  %oldValue = load <4 x i64>* %ptr
+  %oldValue = load <4 x i64>* %ptr, align 8
   %mask = bitcast <4 x i32> %i32mask to <4 x float>
   ; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values
@@ -243,6 +243,6 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
   ; reconstruct the final <4 x i64> vector
   %final = shufflevector <2 x i64> %result01, <2 x i64> %result23,
                          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  store <4 x i64> %final, <4 x i64> * %ptr
+  store <4 x i64> %final, <4 x i64> * %ptr, align 8
   ret void
 }


@@ -566,7 +566,7 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
                           <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %mask_b = shufflevector <8 x float> %mask_as_float, <8 x float> undef,
                           <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %oldValue = load <8 x i32>* %0
+  %oldValue = load <8 x i32>* %0, align 4
   %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float>
   %newAsFloat = bitcast <8 x i32> %1 to <8 x float>
   %old_a = shufflevector <8 x float> %oldAsFloat, <8 x float> undef,
@@ -584,7 +584,7 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>,
   %blend = shufflevector <4 x float> %blend_a, <4 x float> %blend_b,
                          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %blendAsInt = bitcast <8 x float> %blend to <8 x i32>
-  store <8 x i32> %blendAsInt, <8 x i32>* %0
+  store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4
   ret void
 }
@@ -595,7 +595,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
   %mask_as_float = bitcast <8 x i32> %mask to <8 x float>
-  %old = load <8 x i64>* %ptr
+  %old = load <8 x i64>* %ptr, align 8
   ; set up the first two 64-bit values
   %old01 = shufflevector <8 x i64> %old, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
@@ -651,7 +651,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new,
                            <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %final = shufflevector <4 x i64> %final0123, <4 x i64> %final4567,
                          <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  store <8 x i64> %final, <8 x i64> * %ptr
+  store <8 x i64> %final, <8 x i64> * %ptr, align 8
   ret void
 }


@@ -452,7 +452,7 @@ define internal <$1 x i32> @__load_uint16([0 x i32] *, i32 %offset) nounwind alw
   %ptr16 = bitcast [0 x i32] *%0 to i16 *
   %ptr = getelementptr i16 * %ptr16, i32 %offset
   %ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) *
-  %val = load i`'eval(16*$1) * %ptr64, align 1
+  %val = load i`'eval(16*$1) * %ptr64, align 2
   %vval = bitcast i`'eval(16*$1) %val to <$1 x i16>
   ; unsigned, so use zero-extent...
@@ -479,7 +479,7 @@ define internal void @__store_uint8([0 x i32] *, i32 %offset, <$1 x i32> %val32,
   %oldmasked = and i`'eval(8*$1) %old, %notmask
   %newmasked = and i`'eval(8*$1) %val64, %mask64
   %final = or i`'eval(8*$1) %oldmasked, %newmasked
-  store i`'eval(8*$1) %final, i`'eval(8*$1) * %ptr64
+  store i`'eval(8*$1) %final, i`'eval(8*$1) * %ptr64, align 1
   ret void
 }
@@ -498,11 +498,11 @@ define internal void @__store_uint16([0 x i32] *, i32 %offset, <$1 x i32> %val32
   %ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) *
   ;; as above, use mask to do blending with logical ops...
-  %old = load i`'eval(16*$1) * %ptr64, align 1
+  %old = load i`'eval(16*$1) * %ptr64, align 2
   %oldmasked = and i`'eval(16*$1) %old, %notmask
   %newmasked = and i`'eval(16*$1) %val64, %mask64
   %final = or i`'eval(16*$1) %oldmasked, %newmasked
-  store i`'eval(16*$1) %final, i`'eval(16*$1) * %ptr64
+  store i`'eval(16*$1) %final, i`'eval(16*$1) * %ptr64, align 2
   ret void
 }
@@ -544,7 +544,7 @@ all_on:
   ;; vector load
   %vecptr = bitcast i32 *%startptr to <$1 x i32> *
   %vec_load = load <$1 x i32> *%vecptr, align 4
-  store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr
+  store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr, align 4
   ret i32 $1
 not_all_on: