From 2405dae8e683bafe33f281f4a2715cbe5e7cf04f Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Sat, 17 Sep 2011 13:36:38 -0700 Subject: [PATCH] Use malloc() to get space for task arguments when compiling to AVX. This is to work around the LLVM bug/limitation discussed in LLVM bug 10841 (http://llvm.org/bugs/show_bug.cgi?id=10841). --- ctx.cpp | 21 +++++++++++++------ examples/tasks_gcd.cpp | 27 ++++++++++++++++++++++++ examples/tasks_pthreads.cpp | 42 +++++++++++++++++++++++++++++++++++++ ispc_test.cpp | 35 +++++++++++++++++++++++++++---- module.cpp | 5 +++++ opt.cpp | 4 ++++ test_static.cpp | 42 +++++++++++++++++++++++++++++++++++++ 7 files changed, 166 insertions(+), 10 deletions(-) diff --git a/ctx.cpp b/ctx.cpp index e6bbde3d..20dc702a 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -1961,17 +1961,26 @@ FunctionEmitContext::LaunchInst(llvm::Function *callee, assert(argStructType->getNumElements() == argVals.size() + 1); int align = 4 * RoundUpPow2(g->target.nativeVectorWidth); + llvm::Value *argmem; #ifdef ISPC_IS_WINDOWS // Use malloc() to allocate storage on Windows, since the stack is // generally not big enough there to do enough allocations for lots of // tasks and then things crash horribly... - llvm::Value *argmem = EmitMalloc(argStructType, align); + argmem = EmitMalloc(argStructType, align); #else - // Use alloca for space for the task args on OSX And Linux. KEY - // DETAIL: pass false to the call of FunctionEmitContext::AllocaInst so - // that the alloca doesn't happen just once at the top of the function, - // but happens each time the enclosing basic block executes. - llvm::Value *argmem = AllocaInst(argStructType, "argmem", align, false); + // Otherwise, use alloca for space for the task args, ** unless we're + // compiling to AVX, in which case we use malloc after all **. (See + // http://llvm.org/bugs/show_bug.cgi?id=10841 for details. 
There are + // limitations in LLVM with respect to dynamic allocas of this sort + // when the stack also has to be 32-byte aligned...). + if (g->target.isa == Target::AVX) + argmem = EmitMalloc(argStructType, align); + else + // KEY DETAIL: pass false to the call of + // FunctionEmitContext::AllocaInst so that the alloca doesn't + // happen just once at the top of the function, but happens each + // time the enclosing basic block executes. + argmem = AllocaInst(argStructType, "argmem", align, false); #endif // ISPC_IS_WINDOWS llvm::Value *voidmem = BitCastInst(argmem, LLVMTypes::VoidPointerType); diff --git a/examples/tasks_gcd.cpp b/examples/tasks_gcd.cpp index 99e616a0..16c871f0 100644 --- a/examples/tasks_gcd.cpp +++ b/examples/tasks_gcd.cpp @@ -33,10 +33,20 @@ #include "taskinfo.h" +#if defined(_WIN32) || defined(_WIN64) +#define ISPC_IS_WINDOWS +#elif defined(__linux__) +#define ISPC_IS_LINUX +#elif defined(__APPLE__) +#define ISPC_IS_APPLE +#endif + /* A simple task system for ispc programs based on Apple's Grand Central Dispatch. 
*/ #include #include +#include +#include static int initialized = 0; static volatile int32_t lock = 0; @@ -47,6 +57,8 @@ static dispatch_group_t gcdGroup; extern "C" { void ISPCLaunch(void *f, void *data); void ISPCSync(); + void *ISPCMalloc(int64_t size, int32_t alignment); + void ISPCFree(void *ptr); } @@ -97,3 +109,18 @@ void ISPCSync() { lResetTaskInfo(); } + +void *ISPCMalloc(int64_t size, int32_t alignment) { + void *mem = malloc(size + (alignment-1) + sizeof(void*)); + char *amem = ((char*)mem) + sizeof(void*); + amem = amem + uint32_t(alignment - (reinterpret_cast(amem) & + (alignment - 1))); + ((void**)amem)[-1] = mem; + return amem; +} + + +void ISPCFree(void *ptr) { + free(((void**)ptr)[-1]); +} + diff --git a/examples/tasks_pthreads.cpp b/examples/tasks_pthreads.cpp index 310db4d9..6e4e6fcb 100644 --- a/examples/tasks_pthreads.cpp +++ b/examples/tasks_pthreads.cpp @@ -31,6 +31,14 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#if defined(_WIN32) || defined(_WIN64) +#define ISPC_IS_WINDOWS +#elif defined(__linux__) +#define ISPC_IS_LINUX +#elif defined(__APPLE__) +#define ISPC_IS_APPLE +#endif + #include "taskinfo.h" #include #include @@ -63,6 +71,8 @@ static pthread_cond_t tasksRunningCondition; extern "C" { void ISPCLaunch(void *f, void *data); void ISPCSync(); + void *ISPCMalloc(int64_t size, int32_t alignment); + void ISPCFree(void *ptr); } static void *lTaskEntry(void *arg); @@ -292,3 +302,35 @@ void ISPCSync() { exit(1); } } + + +void *ISPCMalloc(int64_t size, int32_t alignment) { +#ifdef ISPC_IS_WINDOWS + return _aligned_malloc(size, alignment); +#endif +#ifdef ISPC_IS_LINUX + return memalign(alignment, size); +#endif +#ifdef ISPC_IS_APPLE + void *mem = malloc(size + (alignment-1) + sizeof(void*)); + char *amem = ((char*)mem) + sizeof(void*); + amem = amem + uint32_t(alignment - (reinterpret_cast(amem) & + (alignment - 1))); + ((void**)amem)[-1] = mem; + return amem; +#endif +} + + +void ISPCFree(void *ptr) { +#ifdef 
ISPC_IS_WINDOWS + _aligned_free(ptr); +#endif +#ifdef ISPC_IS_LINUX + free(ptr); +#endif +#ifdef ISPC_IS_APPLE + free(((void**)ptr)[-1]); +#endif +} + diff --git a/ispc_test.cpp b/ispc_test.cpp index 790f79f1..d6a45ef3 100644 --- a/ispc_test.cpp +++ b/ispc_test.cpp @@ -33,12 +33,22 @@ #define _CRT_SECURE_NO_WARNINGS +#if defined(_WIN32) || defined(_WIN64) +#define ISPC_IS_WINDOWS +#elif defined(__linux__) +#define ISPC_IS_LINUX +#elif defined(__APPLE__) +#define ISPC_IS_APPLE +#endif + #ifdef ISPC_IS_WINDOWS #define NOMINMAX #include #endif #include #include +#include +#include #ifdef ISPC_HAVE_SVML #include @@ -103,16 +113,35 @@ void ISPCSync() { } -#ifdef ISPC_IS_WINDOWS void *ISPCMalloc(int64_t size, int32_t alignment) { +#ifdef ISPC_IS_WINDOWS return _aligned_malloc(size, alignment); +#endif +#ifdef ISPC_IS_LINUX + return memalign(alignment, size); +#endif +#ifdef ISPC_IS_APPLE + void *mem = malloc(size + (alignment-1) + sizeof(void*)); + char *amem = ((char*)mem) + sizeof(void*); + amem = amem + uint32_t(alignment - (reinterpret_cast(amem) & + (alignment - 1))); + ((void**)amem)[-1] = mem; + return amem; +#endif } void ISPCFree(void *ptr) { +#ifdef ISPC_IS_WINDOWS _aligned_free(ptr); -} #endif +#ifdef ISPC_IS_LINUX + free(ptr); +#endif +#ifdef ISPC_IS_APPLE + free(((void**)ptr)[-1]); +#endif +} static void usage(int ret) { fprintf(stderr, "usage: ispc_test\n"); @@ -186,10 +215,8 @@ static bool lRunTest(const char *fn) { ee->addGlobalMapping(func, (void *)FUNC) DO_FUNC(ISPCLaunch, "ISPCLaunch"); DO_FUNC(ISPCSync, "ISPCSync"); -#ifdef ISPC_IS_WINDOWS DO_FUNC(ISPCMalloc, "ISPCMalloc"); DO_FUNC(ISPCFree, "ISPCFree"); -#endif // ISPC_IS_WINDOWS DO_FUNC(putchar, "putchar"); DO_FUNC(printf, "printf"); DO_FUNC(fflush, "fflush"); diff --git a/module.cpp b/module.cpp index b01f25ac..c2a4703b 100644 --- a/module.cpp +++ b/module.cpp @@ -659,6 +659,11 @@ lEmitFunctionCode(FunctionEmitContext *ctx, llvm::Function *function, // the code to free that memory, now that we've 
copied the // parameter values out of the structure. ctx->EmitFree(structParamPtr); +#else + // We also do this for AVX... (See discussion in + // FunctionEmitContext::LaunchInst().) + if (g->target.isa == Target::AVX) + ctx->EmitFree(structParamPtr); #endif // ISPC_IS_WINDOWS } else { diff --git a/opt.cpp b/opt.cpp index 1b5c4cb9..7a471e89 100644 --- a/opt.cpp +++ b/opt.cpp @@ -187,6 +187,10 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(targetLibraryInfo); optPM.add(new llvm::TargetData(module)); +#if defined(LLVM_3_0) || defined(LLVM_3_0svn) + optPM.add(llvm::createIndVarSimplifyPass()); +#endif + if (optLevel == 0) { // This is more or less the minimum set of optimizations that we // need to do to generate code that will actually run. (We can't diff --git a/test_static.cpp b/test_static.cpp index 0ee4810a..8945b816 100644 --- a/test_static.cpp +++ b/test_static.cpp @@ -31,9 +31,18 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#if defined(_WIN32) || defined(_WIN64) +#define ISPC_IS_WINDOWS +#elif defined(__linux__) +#define ISPC_IS_LINUX +#elif defined(__APPLE__) +#define ISPC_IS_APPLE +#endif + #include #include #include +#include extern "C" { extern int width(); @@ -48,6 +57,8 @@ extern "C" { void ISPCLaunch(void *f, void *d); void ISPCSync(); + void *ISPCMalloc(int64_t size, int32_t alignment); + void ISPCFree(void *ptr); } void ISPCLaunch(void *f, void *d) { @@ -60,6 +71,37 @@ void ISPCSync() { } +void *ISPCMalloc(int64_t size, int32_t alignment) { +#ifdef ISPC_IS_WINDOWS + return _aligned_malloc(size, alignment); +#endif +#ifdef ISPC_IS_LINUX + return memalign(alignment, size); +#endif +#ifdef ISPC_IS_APPLE + void *mem = malloc(size + (alignment-1) + sizeof(void*)); + char *amem = ((char*)mem) + sizeof(void*); + amem = amem + uint32_t(alignment - (reinterpret_cast(amem) & + (alignment - 1))); + ((void**)amem)[-1] = mem; + return amem; +#endif +} + + +void ISPCFree(void *ptr) { +#ifdef ISPC_IS_WINDOWS + _aligned_free(ptr); 
+#endif +#ifdef ISPC_IS_LINUX + free(ptr); +#endif +#ifdef ISPC_IS_APPLE + free(((void**)ptr)[-1]); +#endif +} + + int main(int argc, char *argv[]) { int w = width(); assert(w <= 16);