diff --git a/examples/portable/common_cpu.mk b/examples/portable/common_cpu.mk index 31048390..76927848 100644 --- a/examples/portable/common_cpu.mk +++ b/examples/portable/common_cpu.mk @@ -1,7 +1,7 @@ -TASK_CXX=../../tasksys.cpp ../../util/ispc_malloc.cpp +TASK_CXX=../omp_tasksys.cpp ../../util/ispc_malloc.cpp TASK_LIB=-lpthread -TASK_OBJ=objs/tasksys.o objs/ispc_malloc.o +TASK_OBJ=objs/omp_tasksys.o objs/ispc_malloc.o CXX=clang++ CXX=icc -openmp @@ -85,6 +85,8 @@ objs/%.o: %.cpp dirs $(ISPC_HEADER) objs/%.o: %.c dirs $(ISPC_HEADER) $(CC) $< $(CCFLAGS) -c -o $@ +objs/%.o: ../%.cpp dirs + $(CXX) $< $(CXXFLAGS) -c -o $@ objs/%.o: ../../%.cpp dirs $(CXX) $< $(CXXFLAGS) -c -o $@ objs/%.o: ../../util/%.cpp dirs diff --git a/examples/portable/common_knc.mk b/examples/portable/common_knc.mk index e5cf89ae..5335dd7b 100644 --- a/examples/portable/common_knc.mk +++ b/examples/portable/common_knc.mk @@ -1,5 +1,5 @@ -TASK_CXX=../../tasksys.cpp ../../util/ispc_malloc.cpp -TASK_OBJ=objs_knc/tasksys.o objs_knc/ispc_malloc.o +TASK_CXX=../omp_tasksys.cpp ../../util/ispc_malloc.cpp +TASK_OBJ=objs_knc/omp_tasksys.o objs_knc/ispc_malloc.o TASK_LIB=-openmp CXX=icc -openmp -mmic @@ -39,9 +39,10 @@ $(PROG): $(ISPC_OBJ) $(CXX_OBJ) objs_knc/%.o: %.cpp $(CXX) $(CXXFLAGS) -o $@ -c $< +objs_knc/%.o: ../%.cpp + $(CXX) $(CXXFLAGS) -o $@ -c $< objs_knc/%.o: ../../%.cpp $(CXX) $(CXXFLAGS) -o $@ -c $< - objs_knc/%.o: ../../util/%.cpp $(CXX) $(CXXFLAGS) -o $@ -c $< diff --git a/examples/portable/omp_tasksys.cpp b/examples/portable/omp_tasksys.cpp new file mode 100644 index 00000000..8407a496 --- /dev/null +++ b/examples/portable/omp_tasksys.cpp @@ -0,0 +1,409 @@ +/* + Copyright (c) 2011-2012, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +#define DBG(x) +#include +#include + +#include +#include +#include +#include +#include +#include + +// Signature of ispc-generated 'task' functions +typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount, + int taskIndex, int taskCount, + int taskIndex0, int taskIndex1, int taskIndex2, + int taskCount0, int taskCount1, int taskCount2); + +// Small structure used to hold the data for each task +#ifdef _MSC_VER +__declspec(align(16)) +#endif +struct TaskInfo { + TaskFuncType func; + void *data; + int taskIndex; + int taskCount3d[3]; +#if defined(ISPC_IS_WINDOWS) + event taskEvent; +#endif + int taskCount() const { return taskCount3d[0]*taskCount3d[1]*taskCount3d[2]; } + int taskIndex0() const + { + return taskIndex % taskCount3d[0]; + } + int taskIndex1() const + { + return ( taskIndex / taskCount3d[0] ) % taskCount3d[1]; + } + int taskIndex2() const + { + return taskIndex / ( taskCount3d[0]*taskCount3d[1] ); + } + int taskCount0() const { return taskCount3d[0]; } + int taskCount1() const { return taskCount3d[1]; } + int taskCount2() const { return taskCount3d[2]; } + TaskInfo() { assert(sizeof(TaskInfo) % 32 == 0); } +} +#ifndef _MSC_VER +__attribute__((aligned(32))); +#endif +; + +// ispc expects these functions to have C linkage / not be mangled +extern "C" { + void ISPCLaunch(void **handlePtr, void *f, void *data, int countx, int county, int countz); + void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment); + void ISPCSync(void *handle); +} + +/////////////////////////////////////////////////////////////////////////// +// TaskGroupBase + +#define LOG_TASK_QUEUE_CHUNK_SIZE 14 +#define MAX_TASK_QUEUE_CHUNKS 8 +#define TASK_QUEUE_CHUNK_SIZE (1<> LOG_TASK_QUEUE_CHUNK_SIZE); + int offset = index & (TASK_QUEUE_CHUNK_SIZE-1); + + if (chunk == MAX_TASK_QUEUE_CHUNKS) { + fprintf(stderr, "A total of %d tasks have been launched from the " + "current function--the simple built-in task system can handle " + "no more. You can increase the values of TASK_QUEUE_CHUNK_SIZE " + "and LOG_TASK_QUEUE_CHUNK_SIZE to work around this limitation. " + "Sorry! Exiting.\n", index); + exit(1); + } + + if (taskInfo[chunk] == NULL) + taskInfo[chunk] = new TaskInfo[TASK_QUEUE_CHUNK_SIZE]; + return &taskInfo[chunk][offset]; +} + + +inline void * +TaskGroupBase::AllocMemory(int64_t size, int32_t alignment) { + char *basePtr = memBuffers[curMemBuffer]; + intptr_t iptr = (intptr_t)(basePtr + curMemBufferOffset); + iptr = (iptr + (alignment-1)) & ~(alignment-1); + + int newOffset = int(iptr - (intptr_t)basePtr + size); + if (newOffset < memBufferSize[curMemBuffer]) { + curMemBufferOffset = newOffset; + return (char *)iptr; + } + + ++curMemBuffer; + curMemBufferOffset = 0; + assert(curMemBuffer < NUM_MEM_BUFFERS); + + int allocSize = 1 << (12 + curMemBuffer); + allocSize = std::max(int(size+alignment), allocSize); + char *newBuf = new char[allocSize]; + memBufferSize[curMemBuffer] = allocSize; + memBuffers[curMemBuffer] = newBuf; + return AllocMemory(size, alignment); +} + + +/////////////////////////////////////////////////////////////////////////// +// Atomics and the like + +static inline void +lMemFence() { + // Windows atomic functions already contain the fence + // KNC doesn't need the memory barrier +#if !defined ISPC_IS_KNC && !defined ISPC_IS_WINDOWS + __sync_synchronize(); +#endif +} + +static void * +lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) { +#ifdef ISPC_IS_WINDOWS + return InterlockedCompareExchangePointer(v, newValue, oldValue); +#else + void *result = __sync_val_compare_and_swap(v, oldValue, newValue); + lMemFence(); + return result; +#endif // ISPC_IS_WINDOWS +} + +static int32_t +lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) { +#ifdef ISPC_IS_WINDOWS + return InterlockedCompareExchange((volatile LONG *)v, newValue, oldValue); +#else + int32_t result = __sync_val_compare_and_swap(v, oldValue, newValue); + lMemFence(); + return result; +#endif // ISPC_IS_WINDOWS +} + +static inline int32_t +lAtomicAdd(volatile int32_t *v, int32_t delta) { +#ifdef ISPC_IS_WINDOWS + return InterlockedExchangeAdd((volatile LONG *)v, delta)+delta; +#else + return __sync_fetch_and_add(v, delta); +#endif +} + +/////////////////////////////////////////////////////////////////////////// + +class TaskGroup : public TaskGroupBase { +public: + void Launch(int baseIndex, int count); + void Sync(); + +}; + + +/////////////////////////////////////////////////////////////////////////// +// OpenMP + +static void +InitTaskSystem() { + // No initialization needed +} + +inline void +TaskGroup::Launch(int baseIndex, int count) { +#pragma omp parallel + { + const int threadIndex = omp_get_thread_num(); + const int threadCount = omp_get_num_threads(); + + TaskInfo ti = *GetTaskInfo(baseIndex); +#pragma omp for schedule(runtime) + for(int i = 0; i < count; i++) + { + ti.taskIndex = i; + + // Actually run the task. + ti.func(ti.data, threadIndex, threadCount, ti.taskIndex, ti.taskCount(), + ti.taskIndex0(), ti.taskIndex1(), ti.taskIndex2(), + ti.taskCount0(), ti.taskCount1(), ti.taskCount2()); + } + } +} + +inline void +TaskGroup::Sync() { +} + +/////////////////////////////////////////////////////////////////////////// + +#define MAX_FREE_TASK_GROUPS 64 +static TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS]; + + static inline TaskGroup * +AllocTaskGroup() +{ + for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) { + TaskGroup *tg = freeTaskGroups[i]; + if (tg != NULL) { + void *ptr = lAtomicCompareAndSwapPointer((void **)(&freeTaskGroups[i]), NULL, tg); + if (ptr != NULL) { + return (TaskGroup *)ptr; + } + } + } + + return new TaskGroup; +} + + + static inline void +FreeTaskGroup(TaskGroup *tg) +{ + tg->Reset(); + + for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) { + if (freeTaskGroups[i] == NULL) { + void *ptr = lAtomicCompareAndSwapPointer((void **)&freeTaskGroups[i], tg, NULL); + if (ptr == NULL) + return; + } + } + + delete tg; +} + + void +ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1, int count2) +{ + const int count = count0*count1*count2; + TaskGroup *taskGroup; + if (*taskGroupPtr == NULL) { + InitTaskSystem(); + taskGroup = AllocTaskGroup(); + *taskGroupPtr = taskGroup; + } + else + taskGroup = (TaskGroup *)(*taskGroupPtr); + + int baseIndex = taskGroup->AllocTaskInfo(count); + for (int i = 0; i < 1; ++i) { + TaskInfo *ti = taskGroup->GetTaskInfo(baseIndex+i); + ti->func = (TaskFuncType)func; + ti->data = data; + ti->taskIndex = i; + ti->taskCount3d[0] = count0; + ti->taskCount3d[1] = count1; + ti->taskCount3d[2] = count2; + } + taskGroup->Launch(baseIndex, count); +} + + + void +ISPCSync(void *h) +{ + TaskGroup *taskGroup = (TaskGroup *)h; + if (taskGroup != NULL) { + taskGroup->Sync(); + FreeTaskGroup(taskGroup); + } +} + + + void * +ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment) +{ + TaskGroup *taskGroup; + if (*taskGroupPtr == NULL) { + InitTaskSystem(); + taskGroup = AllocTaskGroup(); + *taskGroupPtr = taskGroup; + } + else + taskGroup = (TaskGroup *)(*taskGroupPtr); + + return taskGroup->AllocMemory(size, alignment); +} +