From b84167ddddb5870257db217e7d921101f5022ea4 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 23 Jun 2011 18:18:33 -0700 Subject: [PATCH 1/2] Fixed a number of issues related to memory alignment; a number of places were expecting vector-width-aligned pointers where in point of fact, there's no guarantee that they would have been in general. Removed the aligned memory allocation routines from some of the examples; they're no longer needed. No perf. difference on Core2/Core i5 CPUs; older CPUs may see some regressions. Still need to update the documentation for this change and finish reviewing alignment issues in Load/Store instructions generated by .cpp files. --- ctx.cpp | 8 ++++--- examples/aobench/ao.cpp | 22 ++----------------- examples/aobench_instrumented/ao.cpp | 22 ++----------------- examples/options/options.cpp | 32 ++++++---------------------- examples/rt/rt.cpp | 24 ++------------------- opt.cpp | 11 ++++++++-- stdlib-avx.ll | 4 ++-- stdlib-sse2.ll | 8 +++---- stdlib-sse4.ll | 8 +++---- stdlib-sse4x2.ll | 8 +++---- stdlib.m4 | 10 ++++----- 11 files changed, 45 insertions(+), 112 deletions(-) diff --git a/ctx.cpp b/ctx.cpp index 27e2f0f4..9c72fd3d 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -1644,7 +1644,8 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue, return; } - llvm::Instruction *inst = new llvm::StoreInst(rvalue, lvalue, name, bblock); + llvm::Instruction *inst = new llvm::StoreInst(rvalue, lvalue, false /* not volatile */, + 4, bblock); AddDebugPos(inst); } @@ -1662,7 +1663,8 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue, // Figure out what kind of store we're doing here if (rvalueType->IsUniformType()) { // The easy case; a regular store - llvm::Instruction *si = new llvm::StoreInst(rvalue, lvalue, name, bblock); + llvm::Instruction *si = new llvm::StoreInst(rvalue, lvalue, false /* not volatile */, + 4, bblock); AddDebugPos(si); } else if (llvm::isa(lvalue->getType())) @@ -1673,7 +1675,7 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue, // Otherwise it is a masked store unless we can determine that the // mask is all on... llvm::Instruction *si = - new llvm::StoreInst(rvalue, lvalue, name, bblock); + new llvm::StoreInst(rvalue, lvalue, false /*not volatile*/, 4, bblock); AddDebugPos(si); } else diff --git a/examples/aobench/ao.cpp b/examples/aobench/ao.cpp index 1a2eefe5..48cfca42 100644 --- a/examples/aobench/ao.cpp +++ b/examples/aobench/ao.cpp @@ -103,24 +103,6 @@ savePPM(const char *fname, int w, int h) } -// Allocate memory with 64-byte alignment. 
-float * -AllocAligned(int size) { -#if defined(_WIN32) || defined(_WIN64) - return (float *)_aligned_malloc(size, 64); -#elif defined (__APPLE__) - // Allocate excess memory to ensure an aligned pointer can be returned - void *mem = malloc(size + (64-1) + sizeof(void*)); - char *amem = ((char*)mem) + sizeof(void*); - amem += 64 - (reinterpret_cast(amem) & (64 - 1)); - ((void**)amem)[-1] = mem; - return (float *)amem; -#else - return (float *)memalign(64, size); -#endif -} - - int main(int argc, char **argv) { if (argc != 4) { @@ -136,8 +118,8 @@ int main(int argc, char **argv) } // Allocate space for output images - img = (unsigned char *)AllocAligned(width * height * 3); - fimg = (float *)AllocAligned(sizeof(float) * width * height * 3); + img = new unsigned char[width * height * 3]; + fimg = new float[width * height * 3]; // // Run the ispc path, test_iterations times, and report the minimum diff --git a/examples/aobench_instrumented/ao.cpp b/examples/aobench_instrumented/ao.cpp index 742a0862..5037bdc4 100644 --- a/examples/aobench_instrumented/ao.cpp +++ b/examples/aobench_instrumented/ao.cpp @@ -102,24 +102,6 @@ savePPM(const char *fname, int w, int h) } -// Allocate memory with 64-byte alignment. -float * -AllocAligned(int size) { -#if defined(_WIN32) || defined(_WIN64) - return (float *)_aligned_malloc(size, 64); -#elif defined (__APPLE__) - // Allocate excess memory to ensure an aligned pointer can be returned - void *mem = malloc(size + (64-1) + sizeof(void*)); - char *amem = ((char*)mem) + sizeof(void*); - amem += 64 - (reinterpret_cast(amem) & (64 - 1)); - ((void**)amem)[-1] = mem; - return (float *)amem; -#else - return (float *)memalign(64, size); -#endif -} - - int main(int argc, char **argv) { if (argc != 4) { @@ -135,8 +117,8 @@ int main(int argc, char **argv) } // Allocate space for output images - img = (unsigned char *)AllocAligned(width * height * 3); - fimg = (float *)AllocAligned(sizeof(float) * width * height * 3); + img = new unsigned char[width * height * 3]; + fimg = new float[width * height * 3]; ao_ispc(width, height, NSUBSAMPLES, fimg); diff --git a/examples/options/options.cpp b/examples/options/options.cpp index 241b32be..86c55dae 100644 --- a/examples/options/options.cpp +++ b/examples/options/options.cpp @@ -37,9 +37,6 @@ #include #include #include -#ifndef __APPLE__ -#include -#endif // !__APPLE__ using std::max; #include "options_defs.h" @@ -48,23 +45,6 @@ using std::max; #include "options_ispc.h" using namespace ispc; -// Allocate memory with 64-byte alignment. -float *AllocFloats(int count) { - int size = count * sizeof(float); -#if defined(_WIN32) || defined(_WIN64) - return (float *)_aligned_malloc(size, 64); -#elif defined (__APPLE__) - // Allocate excess memory to ensure an aligned pointer can be returned - void *mem = malloc(size + (64-1) + sizeof(void*)); - char *amem = ((char*)mem) + sizeof(void*); - amem += 64 - (reinterpret_cast(amem) & (64 - 1)); - ((void**)amem)[-1] = mem; - return (float *)amem; -#else - return (float *)memalign(64, size); -#endif -} - extern void black_scholes_serial(float Sa[], float Xa[], float Ta[], float ra[], float va[], float result[], int count); @@ -76,12 +56,12 @@ extern void binomial_put_serial(float Sa[], float Xa[], float Ta[], int main() { // Pointers passed to ispc code must have alignment of the target's // vector width at minimum. 
- float *S = AllocFloats(N_OPTIONS); - float *X = AllocFloats(N_OPTIONS); - float *T = AllocFloats(N_OPTIONS); - float *r = AllocFloats(N_OPTIONS); - float *v = AllocFloats(N_OPTIONS); - float *result = AllocFloats(N_OPTIONS); + float *S = new float[N_OPTIONS]; + float *X = new float[N_OPTIONS]; + float *T = new float[N_OPTIONS]; + float *r = new float[N_OPTIONS]; + float *v = new float[N_OPTIONS]; + float *result = new float[N_OPTIONS]; for (int i = 0; i < N_OPTIONS; ++i) { S[i] = 100; // stock price diff --git a/examples/rt/rt.cpp b/examples/rt/rt.cpp index e589bd94..d7a19285 100644 --- a/examples/rt/rt.cpp +++ b/examples/rt/rt.cpp @@ -43,9 +43,6 @@ #include #include #include -#ifndef __APPLE__ -#include -#endif #include "../timing.h" #include "rt_ispc.h" @@ -53,23 +50,6 @@ using namespace ispc; typedef unsigned int uint; -template -T *AllocAligned(int count) { - int size = count * sizeof(T); -#if defined(_WIN32) || defined(_WIN64) - return (T *)_aligned_malloc(size, 64); -#elif defined (__APPLE__) - // Allocate excess memory to ensure an aligned pointer can be returned - void *mem = malloc(size + (64-1) + sizeof(void*)); - char *amem = ((char*)mem) + sizeof(void*); - amem += 64 - (reinterpret_cast(amem) & (64 - 1)); - ((void**)amem)[-1] = mem; - return (T *)amem; -#else - return (T *)memalign(64, size); -#endif -} - extern void raytrace_serial(int width, int height, const float raster2camera[4][4], const float camera2world[4][4], float image[], int id[], const LinearBVHNode nodes[], @@ -161,7 +141,7 @@ int main(int argc, char *argv[]) { uint nNodes; READ(nNodes, 1); - LinearBVHNode *nodes = AllocAligned(nNodes); + LinearBVHNode *nodes = new LinearBVHNode[nNodes]; for (unsigned int i = 0; i < nNodes; ++i) { // Each node is 6x floats for a boox, then an integer for an offset // to the second child node, then an integer that encodes the type @@ -181,7 +161,7 @@ int main(int argc, char *argv[]) { // And then read the triangles uint nTris; READ(nTris, 1); - Triangle *triangles = AllocAligned(nTris); + Triangle *triangles = new Triangle[nTris]; for (uint i = 0; i < nTris; ++i) { // 9x floats for the 3 vertices float v[9]; diff --git a/opt.cpp b/opt.cpp index 583e8324..69e75247 100644 --- a/opt.cpp +++ b/opt.cpp @@ -1131,10 +1131,17 @@ MaskedStoreOptPass::runOnBasicBlock(llvm::BasicBlock &bb) { } else if (maskAsInt == allOnMask) { // The mask is all on, so turn this into a regular store - const llvm::Type *ptrType = llvm::PointerType::get(rvalue->getType(), 0); + const llvm::Type *rvalueType = rvalue->getType(); + const llvm::Type *ptrType = llvm::PointerType::get(rvalueType, 0); + // Need to update this when int8/int16 are added + int align = (called == pms32Func || called == pms64Func || + called == msb32Func) ? 
4 : 8; + lvalue = new llvm::BitCastInst(lvalue, ptrType, "lvalue_to_ptr_type", callInst); lCopyMetadata(lvalue, callInst); - llvm::Instruction *store = new llvm::StoreInst(rvalue, lvalue); + llvm::Instruction *store = + new llvm::StoreInst(rvalue, lvalue, false /* not volatile */, + align); lCopyMetadata(store, callInst); llvm::ReplaceInstWithInst(callInst, store); diff --git a/stdlib-avx.ll b/stdlib-avx.ll index 3d125a7e..fff3719f 100644 --- a/stdlib-avx.ll +++ b/stdlib-avx.ll @@ -513,14 +513,14 @@ declare <8 x float> @llvm.x86.avx.blendvps(<8 x float>, <8 x float>, define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>, <8 x i32>) nounwind alwaysinline { %mask_as_float = bitcast <8 x i32> %2 to <8 x float> - %oldValue = load <8 x i32>* %0 + %oldValue = load <8 x i32>* %0, align 4 %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float> %newAsFloat = bitcast <8 x i32> %1 to <8 x float> %blend = call <8 x float> @llvm.x86.avx.blendvps(<8 x float> %oldAsFloat, <8 x float> %newAsFloat, <8 x float> %mask_as_float) %blendAsInt = bitcast <8 x float> %blend to <8 x i32> - store <8 x i32> %blendAsInt, <8 x i32>* %0 + store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4 ret void } diff --git a/stdlib-sse2.ll b/stdlib-sse2.ll index 654e81f1..c37fdfb5 100644 --- a/stdlib-sse2.ll +++ b/stdlib-sse2.ll @@ -278,15 +278,15 @@ define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwa define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>, <4 x i32> %mask) nounwind alwaysinline { - %val = load <4 x i32> * %0 + %val = load <4 x i32> * %0, align 4 %newval = call <4 x i32> @__vselect_i32(<4 x i32> %val, <4 x i32> %1, <4 x i32> %mask) - store <4 x i32> %newval, <4 x i32> * %0 + store <4 x i32> %newval, <4 x i32> * %0, align 4 ret void } define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new, <4 x i32> %mask) nounwind alwaysinline { - %oldValue = load <4 x i64>* %ptr + %oldValue = load <4 x i64>* %ptr, align 8 ; Do 4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values ; are actually bitcast <2 x i64> values @@ -322,7 +322,7 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new, ; reconstruct the final <4 x i64> vector %final = shufflevector <2 x i64> %result01, <2 x i64> %result23, <4 x i32> - store <4 x i64> %final, <4 x i64> * %ptr + store <4 x i64> %final, <4 x i64> * %ptr, align 8 ret void } diff --git a/stdlib-sse4.ll b/stdlib-sse4.ll index f28dc35d..30b6f43b 100644 --- a/stdlib-sse4.ll +++ b/stdlib-sse4.ll @@ -188,21 +188,21 @@ declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, define void @__masked_store_blend_32(<4 x i32>* nocapture, <4 x i32>, <4 x i32> %mask) nounwind alwaysinline { %mask_as_float = bitcast <4 x i32> %mask to <4 x float> - %oldValue = load <4 x i32>* %0 + %oldValue = load <4 x i32>* %0, align 4 %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float> %newAsFloat = bitcast <4 x i32> %1 to <4 x float> %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat, <4 x float> %newAsFloat, <4 x float> %mask_as_float) %blendAsInt = bitcast <4 x float> %blend to <4 x i32> - store <4 x i32> %blendAsInt, <4 x i32>* %0 + store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4 ret void } define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new, <4 x i32> %i32mask) nounwind alwaysinline { - %oldValue = load <4 x i64>* %ptr + %oldValue = load <4 x i64>* %ptr, align 8 %mask = bitcast <4 x i32> %i32mask to <4 x float> ; Do 
4x64-bit blends by doing two <4 x i32> blends, where the <4 x i32> values @@ -243,6 +243,6 @@ define void @__masked_store_blend_64(<4 x i64>* nocapture %ptr, <4 x i64> %new, ; reconstruct the final <4 x i64> vector %final = shufflevector <2 x i64> %result01, <2 x i64> %result23, <4 x i32> - store <4 x i64> %final, <4 x i64> * %ptr + store <4 x i64> %final, <4 x i64> * %ptr, align 8 ret void } diff --git a/stdlib-sse4x2.ll b/stdlib-sse4x2.ll index c97fd8ce..009c1c5b 100644 --- a/stdlib-sse4x2.ll +++ b/stdlib-sse4x2.ll @@ -566,7 +566,7 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>, <4 x i32> %mask_b = shufflevector <8 x float> %mask_as_float, <8 x float> undef, <4 x i32> - %oldValue = load <8 x i32>* %0 + %oldValue = load <8 x i32>* %0, align 4 %oldAsFloat = bitcast <8 x i32> %oldValue to <8 x float> %newAsFloat = bitcast <8 x i32> %1 to <8 x float> %old_a = shufflevector <8 x float> %oldAsFloat, <8 x float> undef, @@ -584,7 +584,7 @@ define void @__masked_store_blend_32(<8 x i32>* nocapture, <8 x i32>, %blend = shufflevector <4 x float> %blend_a, <4 x float> %blend_b, <8 x i32> %blendAsInt = bitcast <8 x float> %blend to <8 x i32> - store <8 x i32> %blendAsInt, <8 x i32>* %0 + store <8 x i32> %blendAsInt, <8 x i32>* %0, align 4 ret void } @@ -595,7 +595,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new, %mask_as_float = bitcast <8 x i32> %mask to <8 x float> - %old = load <8 x i64>* %ptr + %old = load <8 x i64>* %ptr, align 8 ; set up the first two 64-bit values %old01 = shufflevector <8 x i64> %old, <8 x i64> undef, <2 x i32> @@ -651,7 +651,7 @@ define void @__masked_store_blend_64(<8 x i64>* nocapture %ptr, <8 x i64> %new, <4 x i32> %final = shufflevector <4 x i64> %final0123, <4 x i64> %final4567, <8 x i32> - store <8 x i64> %final, <8 x i64> * %ptr + store <8 x i64> %final, <8 x i64> * %ptr, align 8 ret void } diff --git a/stdlib.m4 b/stdlib.m4 index b098e131..500d183c 100644 --- a/stdlib.m4 +++ b/stdlib.m4 @@ -452,7 +452,7 @@ define internal <$1 x i32> @__load_uint16([0 x i32] *, i32 %offset) nounwind alw %ptr16 = bitcast [0 x i32] *%0 to i16 * %ptr = getelementptr i16 * %ptr16, i32 %offset %ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) * - %val = load i`'eval(16*$1) * %ptr64, align 1 + %val = load i`'eval(16*$1) * %ptr64, align 2 %vval = bitcast i`'eval(16*$1) %val to <$1 x i16> ; unsigned, so use zero-extent... @@ -479,7 +479,7 @@ define internal void @__store_uint8([0 x i32] *, i32 %offset, <$1 x i32> %val32, %oldmasked = and i`'eval(8*$1) %old, %notmask %newmasked = and i`'eval(8*$1) %val64, %mask64 %final = or i`'eval(8*$1) %oldmasked, %newmasked - store i`'eval(8*$1) %final, i`'eval(8*$1) * %ptr64 + store i`'eval(8*$1) %final, i`'eval(8*$1) * %ptr64, align 1 ret void } @@ -498,11 +498,11 @@ define internal void @__store_uint16([0 x i32] *, i32 %offset, <$1 x i32> %val32 %ptr64 = bitcast i16 * %ptr to i`'eval(16*$1) * ;; as above, use mask to do blending with logical ops... 
- %old = load i`'eval(16*$1) * %ptr64, align 1 + %old = load i`'eval(16*$1) * %ptr64, align 2 %oldmasked = and i`'eval(16*$1) %old, %notmask %newmasked = and i`'eval(16*$1) %val64, %mask64 %final = or i`'eval(16*$1) %oldmasked, %newmasked - store i`'eval(16*$1) %final, i`'eval(16*$1) * %ptr64 + store i`'eval(16*$1) %final, i`'eval(16*$1) * %ptr64, align 2 ret void } @@ -544,7 +544,7 @@ all_on: ;; vector load %vecptr = bitcast i32 *%startptr to <$1 x i32> * %vec_load = load <$1 x i32> *%vecptr, align 4 - store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr + store <$1 x i32> %vec_load, <$1 x i32> * %val_ptr, align 4 ret i32 $1 not_all_on: From 865e430b56e0e926d516fbff83261d4eb95343b4 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Thu, 23 Jun 2011 18:51:15 -0700 Subject: [PATCH 2/2] Finished updating alignment issues for vector types; don't assume pointers are aligned to the natural vector width. --- ctx.cpp | 38 +++++++++++++++------ docs/ispc.txt | 70 ++++++++------------------------------ examples/simple/simple.cpp | 10 +----- 3 files changed, 44 insertions(+), 74 deletions(-) diff --git a/ctx.cpp b/ctx.cpp index 9c72fd3d..12199dcb 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -1315,8 +1315,21 @@ FunctionEmitContext::LoadInst(llvm::Value *lvalue, const Type *type, if (llvm::isa(lvalue->getType())) { // If the lvalue is a straight up regular pointer, then just issue - // a regular load - llvm::Instruction *inst = new llvm::LoadInst(lvalue, name ? name : "load", bblock); + // a regular load. First figure out the alignment; in general we + // can just assume the natural alignment (0 here), but for varying + // atomic types, we need to make sure that the compiler emits + // unaligned vector loads, so we specify a reduced alignment here. + int align = 0; + const AtomicType *atomicType = dynamic_cast(type); + if (atomicType != NULL && atomicType->IsVaryingType()) + // We actually just want to align to the vector element + // alignment, but can't easily get that here, so just tell LLVM + // it's totally unaligned. (This shouldn't make any difference + // vs the proper alignment in practice.) + align = 1; + llvm::Instruction *inst = new llvm::LoadInst(lvalue, name ? name : "load", + false /* not volatile */, + align, bblock); AddDebugPos(inst); return inst; } @@ -1644,8 +1657,16 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue, return; } - llvm::Instruction *inst = new llvm::StoreInst(rvalue, lvalue, false /* not volatile */, - 4, bblock); + llvm::Instruction *inst; + if (llvm::isa(rvalue->getType())) + // Specify an unaligned store, since we don't know that the lvalue + // will in fact be aligned to a vector width here. (Actually + // should be aligned to the alignment of the vector element type...) 
+ inst = new llvm::StoreInst(rvalue, lvalue, false /* not volatile */, + 1, bblock); + else + inst = new llvm::StoreInst(rvalue, lvalue, bblock); + AddDebugPos(inst); } @@ -1662,9 +1683,8 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue, // Figure out what kind of store we're doing here if (rvalueType->IsUniformType()) { - // The easy case; a regular store - llvm::Instruction *si = new llvm::StoreInst(rvalue, lvalue, false /* not volatile */, - 4, bblock); + // The easy case; a regular store, natural alignment is fine + llvm::Instruction *si = new llvm::StoreInst(rvalue, lvalue, bblock); AddDebugPos(si); } else if (llvm::isa(lvalue->getType())) @@ -1674,9 +1694,7 @@ FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue, else if (storeMask == LLVMMaskAllOn) { // Otherwise it is a masked store unless we can determine that the // mask is all on... - llvm::Instruction *si = - new llvm::StoreInst(rvalue, lvalue, false /*not volatile*/, 4, bblock); - AddDebugPos(si); + StoreInst(rvalue, lvalue, name); } else maskedStore(rvalue, lvalue, rvalueType, storeMask); diff --git a/docs/ispc.txt b/docs/ispc.txt index 76f13595..8292f478 100644 --- a/docs/ispc.txt +++ b/docs/ispc.txt @@ -1970,7 +1970,7 @@ Data Layout In general, ``ispc`` tries to ensure that ``struct`` s and other complex datatypes are laid out in the same way in memory as they are in C/C++. -Matching alignment is important for easy interoperability between C/C++ +Matching structure layout is important for easy interoperability between C/C++ code and ``ispc`` code. The main complexity in sharing data between ``ispc`` and C/C++ often comes @@ -2023,11 +2023,6 @@ It can pass ``array`` to a ``ispc`` function defined as: export void foo(uniform float array[], uniform int count) -(Though the pointer must be aligned to the compilation target's natural -vector width; see the discussion of alignment restrictions in `Data -Alignment and Aliasing`_ and the aligned allocation routines in -``examples/options/options.cpp`` for example.) - Similarly, ``struct`` s from the application can have embedded pointers. This is handled with similar ``[]`` syntax: @@ -2062,55 +2057,20 @@ vector types from C/C++ application code if possible. Data Alignment and Aliasing --------------------------- -There are two important constraints that must be adhered to when passing -pointers from the application to ``ispc`` programs. +There are two important constraints that must be adhered to when +passing pointers from the application to ``ispc`` programs. -The first constraint is alignment: any pointers from the host program that -are passed to ``ispc`` must be aligned to natural vector alignment of -system--for example, 16 byte alignment on a target that supports Intel® -SSE, 32-byte on an Intel® AVX target. If this constraint isn't met, the -program may abort at runtime with an unaligned memory access error. +The first is that it is required that it be valid to read memory at the +first element of any array that is passed to ``ispc``. In practice, this +should just happen naturally, but it does mean that it is illegal to pass a ``NULL`` pointer as a parameter to a ``ispc`` function called from the +application. -For example, in a ``ispc`` function with the following declaration: - -:: - - export void foo(uniform float in[], uniform float out[], - int count); - -If the application is passing stack-allocated arrays for ``in`` and -``out``, these C/C++ compiler must be told to align these arrays. 
- -:: - - // MSVC, SSE target - __declspec(align(16)) float in[16], out[16]; - foo(in, out, 16); - -With the gcc/clang compilers, the syntax for providing alignment is -slightly different: - -:: - - float x[16] __attribute__ ((__align__(16))); - foo(in, out, 16); - -If the data being passed is dynamically allocated, the appropriate system -aligned memory allocation routine should be used to allocate it (for -example, ``_aligned_malloc()`` with Windows\*, ``memalign()`` with -Linux\*; see the ``AllocAligned()`` function in ``examples/rt/rt.cpp`` for -an example.) - -It is also required that it be valid to read memory at the first element of -any array that is passed to ``ispc``. In practice, this should just -happen naturally, but it does mean that it is illegal to pass a ``NULL`` -pointer as a parameter to a ``ispc`` function called from the application. - -The second key constraint is that pointers and references in ``ispc`` -programs must not alias. The ``ispc`` compiler assumes that different -pointers can't end up pointing to the same memory location, either due to -having the same initial value, or through array indexing in the program as -it executed. +The second constraint is that pointers and references in ``ispc`` programs +must not alias. The ``ispc`` compiler assumes that different pointers +can't end up pointing to the same memory location, either due to having the +same initial value, or through array indexing in the program as it +executed. This aliasing constraint also applies to ``reference`` parameters to functions. Given a function like: @@ -2127,8 +2087,8 @@ another case of aliasing, and if the caller calls the function as ``func(x, x)``, it's not guaranteed that the ``if`` test will evaluate to true, due to the compiler's requirement of no aliasing. -(In the future, ``ispc`` will have the ability to work with unaligned -memory as well as have a mechanism to indicate that pointers may alias.) +(In the future, ``ispc`` will have a mechanism to indicate that pointers +may alias.) Using ISPC Effectively ====================== diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 3b5bf028..9542532f 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -38,15 +38,7 @@ using namespace ispc; int main() { - // Pointers passed to ispc-compiled code are currently required to have - // alignment equal to the target's native vector size. Here we align - // to 32 bytes to be safe for both SSE and AVX targets. -#ifdef _MSC_VER - __declspec(align(32)) float vin[16], vout[16]; -#else - float vin[16] __attribute__((aligned(32))); - float vout[16] __attribute__((aligned(32))); -#endif + float vin[16], vout[16]; // Initialize input buffer for (int i = 0; i < 16; ++i)
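For reference, the AllocAligned()/AllocFloats() helpers that these patches delete all relied on the same over-allocate-and-round-up idiom for the OS X path, where memalign() is unavailable. The sketch below restates that idiom with a slightly more conservative over-allocation (a full extra 64 bytes) and adds a FreeAligned64() counterpart that is not part of the original examples, to show how the stashed malloc() pointer is recovered. AllocAligned64() and FreeAligned64() are illustrative names, not functions from the tree; with these patches applied, plain new[]/malloc() allocations are sufficient and none of this machinery is needed.

    // Sketch only: the manual 64-byte-alignment idiom used by the removed
    // AllocAligned()/AllocFloats() helpers on the OS X path.
    #include <stdlib.h>
    #include <stdint.h>

    void *AllocAligned64(size_t size) {
        // Over-allocate: up to 64 bytes of padding to reach a 64-byte boundary,
        // plus room to stash the pointer that malloc() actually returned.
        void *mem = malloc(size + 64 + sizeof(void *));
        if (mem == NULL)
            return NULL;
        char *amem = (char *)mem + sizeof(void *);
        amem += 64 - (reinterpret_cast<uintptr_t>(amem) & (64 - 1));
        ((void **)amem)[-1] = mem;    // remember the original pointer
        return amem;
    }

    void FreeAligned64(void *ptr) {
        if (ptr != NULL)
            free(((void **)ptr)[-1]); // recover the pointer malloc() returned
    }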