diff --git a/builtins.m4 b/builtins.m4 index 2a7cc838..cd704a0b 100644 --- a/builtins.m4 +++ b/builtins.m4 @@ -834,10 +834,9 @@ define(`stdlib_core', ` declare i32 @__fast_masked_vload() -declare i8* @ISPCMalloc(i64, i32) nounwind -declare i8* @ISPCFree(i8*) nounwind -declare void @ISPCLaunch(i8*, i8*) nounwind -declare void @ISPCSync() nounwind +declare i8* @ISPCAlloc(i8**, i64, i32) nounwind +declare void @ISPCLaunch(i8**, i8*, i8*, i32) nounwind +declare void @ISPCSync(i8*) nounwind declare void @ISPCInstrument(i8*, i8*, i32, i32) nounwind declare i1 @__is_compile_time_constant_mask(<$1 x i32> %mask) diff --git a/ctx.cpp b/ctx.cpp index 20dc702a..e8234f9f 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -144,6 +144,11 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio returnedLanesPtr = AllocaInst(LLVMTypes::MaskType, "returned_lanes_memory"); StoreInst(LLVMMaskAllOff, returnedLanesPtr); + launchedTasks = false; + launchGroupHandlePtr = AllocaInst(LLVMTypes::VoidPointerType, "launch_group_handle"); + StoreInst(llvm::Constant::getNullValue(LLVMTypes::VoidPointerType), + launchGroupHandlePtr); + if (!returnType || returnType == AtomicType::Void) returnValuePtr = NULL; else { @@ -174,8 +179,6 @@ FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *functio StartScope(); } - launchedTasks = false; - // connect the funciton's mask memory to the __mask symbol Symbol *maskSymbol = m->symbolTable->LookupVariable("__mask"); assert(maskSymbol != NULL); @@ -759,7 +762,7 @@ FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) { llvm::Value * -FunctionEmitContext::EmitMalloc(LLVM_TYPE_CONST llvm::Type *ty, int align) { +FunctionEmitContext::SizeOf(LLVM_TYPE_CONST llvm::Type *ty) { // Emit code to compute the size of the given type using a GEP with a // NULL base pointer, indexing one element of the given type, and // casting the resulting 'pointer' to an int giving its size. 
@@ -776,24 +779,7 @@ FunctionEmitContext::EmitMalloc(LLVM_TYPE_CONST llvm::Type *ty, int align) { #endif AddDebugPos(poffset); llvm::Value *sizeOf = PtrToIntInst(poffset, LLVMTypes::Int64Type, "offset_int"); - - // And given the size, call the malloc function - llvm::Function *fmalloc = m->module->getFunction("ISPCMalloc"); - assert(fmalloc != NULL); - llvm::Value *mem = CallInst(fmalloc, sizeOf, LLVMInt32(align), - "raw_argmem"); - // Cast the void * back to the result pointer type - return BitCastInst(mem, ptrType, "mem_bitcast"); -} - - -void -FunctionEmitContext::EmitFree(llvm::Value *ptr) { - llvm::Value *freeArg = BitCastInst(ptr, LLVMTypes::VoidPointerType, - "argmemfree"); - llvm::Function *ffree = m->module->getFunction("ISPCFree"); - assert(ffree != NULL); - CallInst(ffree, freeArg); + return sizeOf; } @@ -1912,15 +1898,9 @@ FunctionEmitContext::CallInst(llvm::Function *func, llvm::Value *arg0, llvm::Instruction * FunctionEmitContext::ReturnInst() { - if (launchedTasks) { - // Automatically add a sync call at the end of any function that - // launched tasks - SourcePos noPos; - noPos.name = "__auto_sync"; - ExprStmt *es = new ExprStmt(new SyncExpr(noPos), noPos); - es->EmitCode(this); - delete es; - } + if (launchedTasks) + // Add a sync call at the end of any function that launched tasks + SyncInst(); llvm::Instruction *rinst = NULL; if (returnValuePtr != NULL) { @@ -1943,7 +1923,8 @@ FunctionEmitContext::ReturnInst() { llvm::Instruction * FunctionEmitContext::LaunchInst(llvm::Function *callee, - std::vector &argVals) { + std::vector &argVals, + llvm::Value *launchCount) { if (callee == NULL) { assert(m->errorCount > 0); return NULL; @@ -1960,29 +1941,15 @@ FunctionEmitContext::LaunchInst(llvm::Function *callee, static_cast(pt->getElementType()); assert(argStructType->getNumElements() == argVals.size() + 1); + llvm::Function *falloc = m->module->getFunction("ISPCAlloc"); + assert(falloc != NULL); int align = 4 * RoundUpPow2(g->target.nativeVectorWidth); 
- llvm::Value *argmem; -#ifdef ISPC_IS_WINDOWS - // Use malloc() to allocate storage on Windows, since the stack is - // generally not big enough there to do enough allocations for lots of - // tasks and then things crash horribly... - argmem = EmitMalloc(argStructType, align); -#else - // Otherwise, use alloca for space for the task args, ** unless we're - // compiling to AVX, in which case we use malloc after all **. (See - // http://llvm.org/bugs/show_bug.cgi?id=10841 for details. There are - // limitations in LLVM with respect to dynamic allocas of this sort - // when the stack also has to be 32-byte aligned...). - if (g->target.isa == Target::AVX) - argmem = EmitMalloc(argStructType, align); - else - // KEY DETAIL: pass false to the call of - // FunctionEmitContext::AllocaInst so that the alloca doesn't - // happen just once at the top of the function, but happens each - // time the enclosing basic block executes. - argmem = AllocaInst(argStructType, "argmem", align, false); -#endif // ISPC_IS_WINDOWS - llvm::Value *voidmem = BitCastInst(argmem, LLVMTypes::VoidPointerType); + std::vector allocArgs; + allocArgs.push_back(launchGroupHandlePtr); + allocArgs.push_back(SizeOf(argStructType)); + allocArgs.push_back(LLVMInt32(align)); + llvm::Value *voidmem = CallInst(falloc, allocArgs, "args_ptr"); + llvm::Value *argmem = BitCastInst(voidmem, pt); // Copy the values of the parameters into the appropriate place in // the argument block @@ -2004,5 +1971,32 @@ FunctionEmitContext::LaunchInst(llvm::Function *callee, llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType); llvm::Function *flaunch = m->module->getFunction("ISPCLaunch"); assert(flaunch != NULL); - return CallInst(flaunch, fptr, voidmem, ""); + std::vector args; + args.push_back(launchGroupHandlePtr); + args.push_back(fptr); + args.push_back(voidmem); + args.push_back(launchCount); + return CallInst(flaunch, args, ""); +} + + +void +FunctionEmitContext::SyncInst() { + llvm::Value 
*launchGroupHandle = LoadInst(launchGroupHandlePtr, NULL); + llvm::Value *nullPtrValue = llvm::Constant::getNullValue(LLVMTypes::VoidPointerType); + llvm::Value *nonNull = CmpInst(llvm::Instruction::ICmp, + llvm::CmpInst::ICMP_NE, + launchGroupHandle, nullPtrValue); + llvm::BasicBlock *bSync = CreateBasicBlock("call_sync"); + llvm::BasicBlock *bPostSync = CreateBasicBlock("post_sync"); + BranchInst(bSync, bPostSync, nonNull); + + SetCurrentBasicBlock(bSync); + llvm::Function *fsync = m->module->getFunction("ISPCSync"); + if (fsync == NULL) + FATAL("Couldn't find ISPCSync declaration?!"); + CallInst(fsync, launchGroupHandle, ""); + BranchInst(bPostSync); + + SetCurrentBasicBlock(bPostSync); } diff --git a/ctx.h b/ctx.h index 493e27e9..d211a7d6 100644 --- a/ctx.h +++ b/ctx.h @@ -210,15 +210,8 @@ public: i32. */ llvm::Value *I1VecToBoolVec(llvm::Value *b); - /** Emit code to call the user-supplied ISPCMalloc function to - allocate space for an object of thee given type. Returns the - pointer value returned by the ISPCMalloc call. */ - llvm::Value *EmitMalloc(LLVM_TYPE_CONST llvm::Type *ty, int align = 0); - - /** Emit code to call the user-supplied ISPCFree function, passing it - the given pointer to storage previously allocated by an - EmitMalloc() call. */ - void EmitFree(llvm::Value *ptr); + /** Returns the size of the given type. */ + llvm::Value *SizeOf(LLVM_TYPE_CONST llvm::Type *ty); /** If the user has asked to compile the program with instrumentation, this inserts a callback to the user-supplied instrumentation @@ -399,7 +392,10 @@ public: /** Launch an asynchronous task to run the given function, passing it he given argument values. */ llvm::Instruction *LaunchInst(llvm::Function *callee, - std::vector &argVals); + std::vector &argVals, + llvm::Value *launchCount); + + void SyncInst(); llvm::Instruction *ReturnInst(); /** @} */ @@ -489,6 +485,11 @@ private: /** True if a 'launch' statement has been encountered in the function. 
*/ bool launchedTasks; + /** This is a pointer to a void * that is passed to the ISPCLaunch(), + ISPCAlloc(), and ISPCSync() routines as a handle to the group of + tasks launched from the current function. */ + llvm::Value *launchGroupHandlePtr; + llvm::Value *pointerVectorToVoidPointers(llvm::Value *value); static void addGSMetadata(llvm::Instruction *inst, SourcePos pos); bool ifsInLoopAllUniform() const; diff --git a/docs/ispc.txt b/docs/ispc.txt index 89249914..ab335179 100644 --- a/docs/ispc.txt +++ b/docs/ispc.txt @@ -80,7 +80,8 @@ Contents: + `Program Instance Convergence`_ + `Data Races`_ + `Uniform Variables and Varying Control Flow`_ - + `Task Parallelism in ISPC`_ + + `Task Parallelism: Language Syntax`_ + + `Task Parallelism: Runtime Requirements`_ * `The ISPC Standard Library`_ @@ -838,8 +839,8 @@ by default. If a function is declared with a ``static`` qualifier, then it is only visible in the file in which it was declared. Any function that can be launched with the ``launch`` construct in ``ispc`` -must have a ``task`` qualifier; see `Task Parallelism in ISPC`_ for more -discussion of launching tasks in ``ispc``. +must have a ``task`` qualifier; see `Task Parallelism: Language Syntax`_ +for more discussion of launching tasks in ``ispc``. Functions that are intended to be called from C/C++ application code must have the ``export`` qualifier. This causes them to have regular C linkage @@ -940,8 +941,9 @@ execution model is critical for writing efficient and correct programs in ``ispc`` supports both task parallelism to parallelize across multiple cores and SPMD parallelism to parallelize across the SIMD vector lanes on a -single core. This section focuses on SPMD parallelism. See the section -`Task Parallelism in ISPC`_ for discussion of task parallelism in ``ispc``. +single core. This section focuses on SPMD parallelism. 
See the sections +`Task Parallelism: Language Syntax`_ and `Task Parallelism: Runtime +Requirements`_ for discussion of task parallelism in ``ispc``. The SPMD-on-SIMD Execution Model -------------------------------- @@ -1384,112 +1386,190 @@ be modified in the above code even if *none* of the program instances evaluated a true value for the test, given the ``ispc`` execution model. -Task Parallelism in ISPC ------------------------- +Task Parallelism: Language Syntax +--------------------------------- One option for combining task-parallelism with ``ispc`` is to just use regular task parallelism in the C/C++ application code (be it through -Intel® Cilk(tm), Intel® Thread Building Blocks or another task system, -etc.), and for tasks to use ``ispc`` for SPMD parallelism across the vector -lanes as appropriate. Alternatively, ``ispc`` also has some support for -launching tasks from ``ispc`` code. The approach is similar to Intel® -Cilk's task launch feature. (See the ``examples/mandelbrot_tasks`` example -to see it used in a non-trivial example.) +Intel® Cilk(tm), Intel® Thread Building Blocks or another task system), and +for tasks to use ``ispc`` for SPMD parallelism across the vector lanes as +appropriate. Alternatively, ``ispc`` also has support for launching tasks +from ``ispc`` code. The approach is similar to Intel® Cilk's task launch +feature. (See the ``examples/mandelbrot_tasks`` example to see it used in +a small example.) -Any function that is launched as a task must be declared with the ``task`` -qualifier: +First, any function that is launched as a task must be declared with the +``task`` qualifier: :: - task void func(uniform float a[], uniform int start) { - .... + task void func(uniform float a[], uniform int index) { + ... + a[index] = .... } Tasks must return ``void``; a compile time error is issued if a non-``void`` task is defined. 
-Given a task, one can then write code that launches tasks as follows: +Given a task definition, there are two ways to write code that launches +tasks, using the ``launch`` construct. First, one task can be launched at +a time, with parameters passed to the task to help it determine what part +of the overall computation it's responsible for: :: for (uniform int i = 0; i < 100; ++i) - launch < func(a, i); > + launch < func(a, i) >; Note the ``launch`` keyword and the brackets around the function call. This code launches 100 tasks, each of which presumably does some -computation keyed off of given the value ``i``. In general, one should -launch many more tasks than there are processors in the system to +computation that is keyed off of the given value ``i``. In general, one +should launch many more tasks than there are processors in the system to ensure good load-balancing, but not so many that the overhead of scheduling and running tasks dominates the computation. -Program execution continues asynchronously after task launch; thus, the -function shouldn't access values being generated by the tasks without -synchronization. A function uses a ``sync`` statement to wait for all -launched tasks to finish: +Alternatively, a number of tasks may be launched from a single ``launch`` +statement. We might instead write the above example with a single +``launch`` like this: :: - for (uniform int i = 0; i < 100; ++i) - launch < func(a, i); > + launch[100] < func2(a) >; + +Here, an integer value (not necessarily a compile-time constant) is +provided to the ``launch`` keyword in square brackets; this number of tasks +will be enqueued to be run asynchronously. Within each of the tasks, two +special built-in variables are available--``taskIndex``, and ``taskCount``. +The first, ``taskIndex``, ranges from zero to one less than the number of tasks +provided to ``launch``, and ``taskCount`` equals the number of launched +tasks. 
Thus, we might use ``taskIndex`` in the implementation of ``func2`` +to determine which array element to process. + +:: + + task void func2(uniform float a[]) { + ... + a[taskIndex] = ... + } + +Program execution continues asynchronously after a ``launch`` statement; +thus, a function shouldn't access values being generated by the tasks it +has launched within the function without synchronization. If results are +needed before function return, a function can use a ``sync`` statement to +wait for all launched tasks to finish: + +:: + + launch[100] < func2(a) >; sync; // now safe to use computed values in a[]... -Alternatively, any function that launches tasks has an implicit ``sync`` -before it returns, so that functions that call a function that launches -tasks don't have to worry about outstanding asynchronous computation. +Alternatively, any function that launches tasks has an automatically-added +``sync`` statement before it returns, so that functions that call a +function that launches tasks don't have to worry about outstanding +asynchronous computation from that function. Inside functions with the ``task`` qualifier, two additional built-in -variables are provided: ``threadIndex`` and ``threadCount``. -``threadCount`` gives the total number of hardware threads that have been -launched by the task system. ``threadIndex`` provides an index between -zero and ``threadCount-1`` that gives a unique index that corresponds to -the hardware thread that is executing the current task. The -``threadIndex`` can be used for accessing data that is private to the -current thread and thus doesn't require synchronization to access under -parallel execution. +variables are provided in addition to ``taskIndex`` and ``taskCount``: +``threadIndex`` and ``threadCount``. ``threadCount`` gives the total +number of hardware threads that have been launched by the task system. 
+``threadIndex`` provides an index between zero and ``threadCount-1`` that +gives a unique index that corresponds to the hardware thread that is +executing the current task. The ``threadIndex`` can be used for accessing +data that is private to the current thread and thus doesn't require +synchronization to access under parallel execution. + +Task Parallelism: Runtime Requirements +-------------------------------------- If you use the task launch feature in ``ispc``, you must provide C/C++ -implementations of two functions and link them into your final executable -file. Although these functions may be implemented in either language, they -must have "C" linkage (i.e. their prototypes must be declared inside an -``extern "C"`` block if they are defined in C++.) +implementations of three specific functions that manage launching and +synchronizing parallel tasks; these functions must be linked into your +executable. Although these functions may be implemented in any +language, they must have "C" linkage (i.e. their prototypes must be +declared inside an ``extern "C"`` block if they are defined in C++.) + +By using user-supplied versions of these functions, ``ispc`` programs can +easily interoperate with software systems that have existing task systems +for managing parallelism. If you're using ``ispc`` with a system that +isn't otherwise multi-threaded and don't want to write custom +implementations of them, you can use the implementations of these functions +provided in the ``examples/tasksys.cpp`` file in the ``ispc`` +distribution. + +If you are implementing your own task system, the remainder of this section +discusses the requirements for these calls. You will also likely want to +review the example task systems in ``examples/tasksys.cpp`` for reference. +If you are not implementing your own task system, you can skip reading the +remainder of this section. 
+ +Here are the declarations of the three functions that must be provided to +manage tasks in ``ispc``: :: - void ISPCLaunch(void *funcptr, void *data); - void ISPCSync(); + void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment); + void ISPCLaunch(void **handlePtr, void *f, void *data, int count); + void ISPCSync(void *handle); -On Windows, two additional functions must be provided to dynamically -allocate and free memory to store the arguments passed to tasks. (On OSX -and Linux, the stack provides memory for task arguments; on Windows, the -stack is generally not large enough to do this for large numbers of tasks.) +All three of these functions take an opaque handle (or a pointer to an +opaque handle) as their first parameter. This handle allows the task +system runtime to distinguish between calls to these functions from +different functions in ``ispc`` code. In this way, the task system +implementation can efficiently wait for completion on just the tasks +launched from a single function. + +The first time one of ``ISPCLaunch()`` or ``ISPCAlloc()`` is called in an +``ispc`` function, the ``void *`` pointed to by the ``handlePtr`` parameter +will be ``NULL``. The implementations of these functions should then +initialize ``*handlePtr`` to a unique handle value of some sort. (For +example, it might allocate a small structure to record which tasks were +launched by the current function.) In subsequent calls to these functions +in the emitted ``ispc`` code, the same value for ``handlePtr`` will be +passed in, such that loading from ``*handlePtr`` will retrieve the value +stored in the first call. + +At function exit (or at an explicit ``sync`` statement), a call to +``ISPCSync()`` will be generated if ``*handlePtr`` is non-``NULL``. +Therefore, the handle value is passed directly to ``ISPCSync()``, rather +than a pointer to it, as in the other functions. 
+ +The ``ISPCAlloc()`` function is used to allocate small blocks of memory to +store parameters passed to tasks. It should return a pointer to memory +with the given size and alignment. Note that there is no explicit +``ISPCFree()`` call; instead, all memory allocated within an ``ispc`` +function should be freed when ``ISPCSync()`` is called. + +``ISPCLaunch()`` is called to launch one or more asynchronous +tasks. Each ``launch`` statement in ``ispc`` code causes a call to +``ISPCLaunch()`` to be emitted in the generated code. The three parameters +after the handle pointer to this function are relatively straightforward; +the ``void *f`` parameter holds a pointer to a function to call to run the +work for this task, ``data`` holds a pointer to data to pass to this +function, and ``count`` is the number of instances of this function to +enqueue for asynchronous execution. (In other words, ``count`` corresponds +to the value ``n`` in a multiple-task launch statement like ``launch[n]``.) + +The signature of the provided function pointer ``f`` is :: - void *ISPCMalloc(int64_t size, int32_t alignment); - void ISPCFree(void *ptr); + void (*TaskFuncPtr)(void *data, int threadIndex, int threadCount, + int taskIndex, int taskCount) -These are called by the task launch code generated by the ``ispc`` -compiler; the first is called to launch to launch a task and the second is -called to wait for, respectively. (Factoring them out in this way -allows ``ispc`` to inter-operate with the application's task system, if -any, rather than having a separate one of its own.) 
To run a particular -task, the task system should cast the function pointer to a ``void (*)(void -*, int, int)`` function pointer and then call it with the provided ``void -*`` data and then an index for the current hardware thread and the total -number of hardware threads the task system has launched--in other words: - -:: - - typedef void (*TaskFuncType)(void *, int, int); - TaskFuncType tft = (TaskFuncType)(funcptr); - tft(data, threadIndex, threadCount); - -A number of sample task system implementations are provided with ``ispc``; -see the files ``tasks_concrt.cpp``, ``tasks_gcd.cpp`` and -``tasks_pthreads.cpp`` in the ``examples/mandelbrot_tasks`` directory of -the ``ispc`` distribution. +When this function pointer is called by one of the hardware threads managed +by the task system, the ``data`` pointer passed to ``ISPCLaunch()`` should +be passed to it for its first parameter; ``threadCount`` gives the total +number of hardware threads that have been spawned to run tasks and +``threadIndex`` should be an integer index between zero and ``threadCount-1`` +uniquely identifying the hardware thread that is running the task. (These +values can be used to index into thread-local storage.) +The value of ``taskCount`` should be the number of tasks launched in the +``launch`` statement that caused the call to ``ISPCLaunch()`` and each of +the calls to this function should be given a unique value of ``taskIndex`` +between zero and ``taskCount-1``, to distinguish which of the instances +of the set of launched tasks is running. 
The ISPC Standard Library ========================= diff --git a/examples/aobench/Makefile b/examples/aobench/Makefile index 207badbc..e8fce406 100644 --- a/examples/aobench/Makefile +++ b/examples/aobench/Makefile @@ -1,14 +1,8 @@ ARCH = $(shell uname) -TASK_CXX=../tasks_pthreads.cpp +TASK_CXX=../tasksys.cpp TASK_LIB=-lpthread - -ifeq ($(ARCH), Darwin) - TASK_CXX=../tasks_gcd.cpp - TASK_LIB= -endif - TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o))) CXX=g++ diff --git a/examples/aobench/ao.ispc b/examples/aobench/ao.ispc index 2aca5dd4..e48a544e 100644 --- a/examples/aobench/ao.ispc +++ b/examples/aobench/ao.ispc @@ -323,16 +323,13 @@ export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples, } -static void task ao_task(uniform int y0, uniform int y1, uniform int width, - uniform int height, uniform int nsubsamples, - uniform float image[]) { - ao_scanlines(y0, y1, width, height, nsubsamples, image); +static void task ao_task(uniform int width, uniform int height, + uniform int nsubsamples, uniform float image[]) { + ao_scanlines(taskIndex, taskIndex+1, width, height, nsubsamples, image); } export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, uniform float image[]) { - uniform int dy = 1; - for (uniform int y = 0; y < h; y += dy) - launch < ao_task(y, y+dy, w, h, nsubsamples, image) >; + launch[h] < ao_task(w, h, nsubsamples, image) >; } diff --git a/examples/aobench/aobench.vcxproj b/examples/aobench/aobench.vcxproj old mode 100755 new mode 100644 index 085c2f3e..af754e26 --- a/examples/aobench/aobench.vcxproj +++ b/examples/aobench/aobench.vcxproj @@ -21,7 +21,7 @@ - + diff --git a/examples/aobench_instrumented/aobench_instrumented.vcxproj b/examples/aobench_instrumented/aobench_instrumented.vcxproj old mode 100755 new mode 100644 diff --git a/examples/mandelbrot/mandelbrot.vcxproj b/examples/mandelbrot/mandelbrot.vcxproj old mode 100755 new mode 100644 diff --git a/examples/mandelbrot_tasks/Makefile 
b/examples/mandelbrot_tasks/Makefile index 718009c6..55b8b03f 100644 --- a/examples/mandelbrot_tasks/Makefile +++ b/examples/mandelbrot_tasks/Makefile @@ -1,14 +1,8 @@ ARCH = $(shell uname) -TASK_CXX=../tasks_pthreads.cpp +TASK_CXX=../tasksys.cpp TASK_LIB=-lpthread - -ifeq ($(ARCH), Darwin) - TASK_CXX=../tasks_gcd.cpp - TASK_LIB= -endif - TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o))) CXX=g++ diff --git a/examples/mandelbrot_tasks/mandelbrot.cpp b/examples/mandelbrot_tasks/mandelbrot.cpp index ce052f6a..509ce293 100644 --- a/examples/mandelbrot_tasks/mandelbrot.cpp +++ b/examples/mandelbrot_tasks/mandelbrot.cpp @@ -101,7 +101,7 @@ ensureTargetISAIsSupported() { } static void usage() { - fprintf(stderr, "usage: mandelbrot [--scale=]\n"); exit(1); } @@ -143,6 +143,9 @@ int main(int argc, char *argv[]) { // double minISPC = 1e30; for (int i = 0; i < 3; ++i) { + // Clear out the buffer + for (unsigned int i = 0; i < width * height; ++i) + buf[i] = 0; reset_and_start_timer(); mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf); double dt = get_elapsed_mcycles(); @@ -152,9 +155,6 @@ int main(int argc, char *argv[]) { printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC); writePPM(buf, width, height, "mandelbrot-ispc.ppm"); - // Clear out the buffer - for (unsigned int i = 0; i < width * height; ++i) - buf[i] = 0; // // And run the serial implementation 3 times, again reporting the @@ -162,6 +162,9 @@ int main(int argc, char *argv[]) { // double minSerial = 1e30; for (int i = 0; i < 3; ++i) { + // Clear out the buffer + for (unsigned int i = 0; i < width * height; ++i) + buf[i] = 0; reset_and_start_timer(); mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf); double dt = get_elapsed_mcycles(); diff --git a/examples/mandelbrot_tasks/mandelbrot.ispc b/examples/mandelbrot_tasks/mandelbrot.ispc index df763e0a..e52725df 100644 --- a/examples/mandelbrot_tasks/mandelbrot.ispc +++ 
b/examples/mandelbrot_tasks/mandelbrot.ispc @@ -53,11 +53,14 @@ mandel(float c_re, float c_im, int count) { [ystart,yend). */ task void -mandelbrot_scanlines(uniform int ystart, uniform int yend, +mandelbrot_scanlines(uniform int ybase, uniform int span, uniform float x0, uniform float dx, uniform float y0, uniform float dy, uniform int width, uniform int maxIterations, reference uniform int output[]) { + uniform int ystart = ybase + taskIndex * span; + uniform int yend = ystart + span; + for (uniform int j = ystart; j < yend; ++j) { for (uniform int i = 0; i < width; i += programCount) { float x = x0 + (programIndex + i) * dx; @@ -70,6 +73,20 @@ mandelbrot_scanlines(uniform int ystart, uniform int yend, } +task void +mandelbrot_chunk(uniform float x0, uniform float dx, + uniform float y0, uniform float dy, + uniform int width, uniform int height, + uniform int maxIterations, reference uniform int output[]) { + uniform int ystart = taskIndex * (height/taskCount); + uniform int yend = (taskIndex+1) * (height/taskCount); + uniform int span = 1; + + launch[(yend-ystart)/span] < mandelbrot_scanlines(ystart, span, x0, dx, y0, dy, + width, maxIterations, output) >; +} + + export void mandelbrot_ispc(uniform float x0, uniform float y0, uniform float x1, uniform float y1, @@ -78,9 +95,6 @@ mandelbrot_ispc(uniform float x0, uniform float y0, uniform float dx = (x1 - x0) / width; uniform float dy = (y1 - y0) / height; - /* Launch task to compute results for spans of 'span' scanlines. 
*/ - uniform int span = 2; - for (uniform int j = 0; j < height; j += span) - launch < mandelbrot_scanlines(j, j+span, x0, dx, y0, dy, width, - maxIterations, output) >; + launch[32] < mandelbrot_chunk(x0, dx, y0, dy, width, height, + maxIterations, output) >; } diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj old mode 100755 new mode 100644 index 07e5f54c..bc247f4e --- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj @@ -143,7 +143,7 @@ - + diff --git a/examples/noise/noise.vcxproj b/examples/noise/noise.vcxproj old mode 100755 new mode 100644 index d5bf3109..03f47da6 --- a/examples/noise/noise.vcxproj +++ b/examples/noise/noise.vcxproj @@ -164,4 +164,4 @@ - \ No newline at end of file + diff --git a/examples/options/options.vcxproj b/examples/options/options.vcxproj old mode 100755 new mode 100644 diff --git a/examples/rt/Makefile b/examples/rt/Makefile index d3ca759d..6d5b1e1c 100644 --- a/examples/rt/Makefile +++ b/examples/rt/Makefile @@ -1,14 +1,8 @@ ARCH = $(shell uname) -TASK_CXX=../tasks_pthreads.cpp +TASK_CXX=../tasksys.cpp TASK_LIB=-lpthread - -ifeq ($(ARCH), Darwin) - TASK_CXX=../tasks_gcd.cpp - TASK_LIB= -endif - TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o))) CXX=g++ diff --git a/examples/rt/rt.ispc b/examples/rt/rt.ispc index 92ca2421..0cc1caef 100644 --- a/examples/rt/rt.ispc +++ b/examples/rt/rt.ispc @@ -283,8 +283,7 @@ export void raytrace_ispc(uniform int width, uniform int height, } -task void raytrace_tile_task(uniform int x0, uniform int x1, - uniform int y0, uniform int y1, +task void raytrace_tile_task(uniform int y0, uniform int y1, uniform int width, uniform int height, uniform int baseWidth, uniform int baseHeight, const uniform float raster2camera[4][4], @@ -292,6 +291,12 @@ task void raytrace_tile_task(uniform int x0, uniform int x1, uniform float image[], uniform int id[], const LinearBVHNode 
nodes[], const Triangle triangles[]) { + uniform int dx = 16; // must match dx below + uniform int xTasks = (width + (dx-1)) / dx; + uniform int x0 = (taskIndex % xTasks) * dx; + uniform int x1 = x0 + dx; + x1 = min(x1, width); + raytrace_tile(x0, x1, y0, y1, width, height, baseWidth, baseHeight, raster2camera, camera2world, image, id, nodes, triangles); @@ -306,13 +311,11 @@ export void raytrace_ispc_tasks(uniform int width, uniform int height, const LinearBVHNode nodes[], const Triangle triangles[]) { uniform int dx = 16, dy = 16; + uniform int nTasks = (width + (dx-1)) / dx; for (uniform int y = 0; y < height; y += dy) { uniform int y1 = min(y + dy, height); - for (uniform int x = 0; x < width; x += dx) { - uniform int x1 = min(x + dx, width); - launch < raytrace_tile_task(x, x1, y, y1, width, height, baseWidth, - baseHeight, raster2camera, camera2world, - image, id, nodes, triangles) >; - } + launch[nTasks] < raytrace_tile_task(y, y1, width, height, baseWidth, + baseHeight, raster2camera, camera2world, + image, id, nodes, triangles) >; } } diff --git a/examples/rt/rt.vcxproj b/examples/rt/rt.vcxproj old mode 100755 new mode 100644 index 426144b4..ebd3ae28 --- a/examples/rt/rt.vcxproj +++ b/examples/rt/rt.vcxproj @@ -164,7 +164,7 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h - + diff --git a/examples/simple/simple.vcxproj b/examples/simple/simple.vcxproj old mode 100755 new mode 100644 index 5ec98bc0..2c9caf88 --- a/examples/simple/simple.vcxproj +++ b/examples/simple/simple.vcxproj @@ -1,4 +1,4 @@ - + @@ -161,4 +161,4 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h - + diff --git a/examples/stencil/Makefile b/examples/stencil/Makefile index 7b40a489..7caf4e62 100644 --- a/examples/stencil/Makefile +++ b/examples/stencil/Makefile @@ -1,14 +1,8 @@ ARCH = $(shell uname) -TASK_CXX=../tasks_pthreads.cpp +TASK_CXX=../tasksys.cpp TASK_LIB=-lpthread - -ifeq ($(ARCH), Darwin) - TASK_CXX=../tasks_gcd.cpp - TASK_LIB= -endif - 
TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o))) CXX=g++ diff --git a/examples/stencil/stencil.vcxproj b/examples/stencil/stencil.vcxproj old mode 100755 new mode 100644 index f045d08a..d6d130b9 --- a/examples/stencil/stencil.vcxproj +++ b/examples/stencil/stencil.vcxproj @@ -164,7 +164,7 @@ ispc -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h - + diff --git a/examples/taskinfo.h b/examples/taskinfo.h deleted file mode 100644 index 2a5daaf4..00000000 --- a/examples/taskinfo.h +++ /dev/null @@ -1,180 +0,0 @@ -/* - Copyright (c) 2011, Intel Corporation - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#ifndef TASKINFO_H -#define TASKINFO_H 1 - -#ifdef _MSC_VER -#define ISPC_IS_WINDOWS -#elif defined(__linux__) -#define ISPC_IS_LINUX -#elif defined(__APPLE__) -#define ISPC_IS_APPLE -#endif - -#ifdef ISPC_IS_WINDOWS -#define NOMINMAX -#include -#include -using namespace Concurrency; -#endif // ISPC_IS_WINDOWS - -#if (__SIZEOF_POINTER__ == 4) || defined(__i386__) || defined(_WIN32) -#define ISPC_POINTER_BYTES 4 -#elif (__SIZEOF_POINTER__ == 8) || defined(__x86_64__) || defined(__amd64__) || defined(_WIN64) -#define ISPC_POINTER_BYTES 8 -#else -#error "Pointer size unknown!" 
-#endif // __SIZEOF_POINTER__ - -#include -#include -#include -#include - -typedef struct TaskInfo { - void *func; - void *data; -#if defined(ISPC_IS_WINDOWS) - event taskEvent; -#endif -} TaskInfo; - - -#ifndef ISPC_IS_WINDOWS -static int32_t -lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) { - int32_t result; - __asm__ __volatile__("lock\ncmpxchgl %2,%1" - : "=a"(result), "=m"(*v) - : "q"(newValue), "0"(oldValue) - : "memory"); - __asm__ __volatile__("mfence":::"memory"); - return result; -} -#endif // !ISPC_IS_WINDOWS - - -static void * -lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) { -#ifdef ISPC_IS_WINDOWS - return InterlockedCompareExchangePointer(v, newValue, oldValue); -#else - void *result; -#if (ISPC_POINTER_BYTES == 4) - __asm__ __volatile__("lock\ncmpxchgd %2,%1" - : "=a"(result), "=m"(*v) - : "q"(newValue), "0"(oldValue) - : "memory"); -#else - __asm__ __volatile__("lock\ncmpxchgq %2,%1" - : "=a"(result), "=m"(*v) - : "q"(newValue), "0"(oldValue) - : "memory"); -#endif // ISPC_POINTER_BYTES - __asm__ __volatile__("mfence":::"memory"); - return result; -#endif // ISPC_IS_WINDOWS -} - - -#ifndef ISPC_IS_WINDOWS -static int32_t -lAtomicAdd32(volatile int32_t *v, int32_t delta) { - // Do atomic add with gcc x86 inline assembly - int32_t origValue; - __asm__ __volatile__("lock\n" - "xaddl %0,%1" - : "=r"(origValue), "=m"(*v) : "0"(delta) - : "memory"); - return origValue; -} -#endif - -#define LOG_TASK_QUEUE_CHUNK_SIZE 13 -#define MAX_TASK_QUEUE_CHUNKS 1024 -#define TASK_QUEUE_CHUNK_SIZE (1<> LOG_TASK_QUEUE_CHUNK_SIZE); - int offset = myCoord & (TASK_QUEUE_CHUNK_SIZE-1); - if (index == MAX_TASK_QUEUE_CHUNKS) { - fprintf(stderr, "A total of %d tasks have been launched--the simple " - "built-in task system can handle no more. 
Exiting.", myCoord); - exit(1); - } - - if (taskInfo[index] == NULL) { - TaskInfo *newChunk = new TaskInfo[TASK_QUEUE_CHUNK_SIZE]; - if (lAtomicCompareAndSwapPointer((void **)&taskInfo[index], newChunk, - NULL) != NULL) { - // failure--someone else got it, but that's cool - assert(taskInfo[index] != NULL); - free(newChunk); - } - } - - return &taskInfo[index][offset]; -} - - -static inline void -lResetTaskInfo() { - nextTaskInfoCoordinate = 0; -} - -#endif // TASKINFO_H diff --git a/examples/tasks_concrt.cpp b/examples/tasks_concrt.cpp deleted file mode 100644 index dac9cdae..00000000 --- a/examples/tasks_concrt.cpp +++ /dev/null @@ -1,104 +0,0 @@ -/* - Copyright (c) 2011, Intel Corporation - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#include "taskinfo.h" - -/* Simple task system implementation for ispc based on Microsoft's - Concurrency Runtime. */ - -#include -#include -using namespace Concurrency; -#include -#include -#include -#include -#include - -// ispc expects these functions to have C linkage / not be mangled -extern "C" { - void ISPCLaunch(void *f, void *data); - void ISPCSync(); - void *ISPCMalloc(int64_t size, int32_t alignment); - void ISPCFree(void *ptr); -} - - -void __cdecl -lRunTask(LPVOID param) { - TaskInfo *ti = (TaskInfo *)param; - - // Actually run the task. - // FIXME: like the GCD implementation for OS X, this is passing bogus - // values for the threadIndex and threadCount builtins, which in turn - // will cause bugs in code that uses those. 
- int threadIndex = 0; - int threadCount = 1; - TaskFuncType func = (TaskFuncType)ti->func; - func(ti->data, threadIndex, threadCount); - - // Signal the event that this task is done - ti->taskEvent.set(); -} - - -void -ISPCLaunch(void *func, void *data) { - TaskInfo *ti = lGetTaskInfo(); - ti->func = (TaskFuncType)func; - ti->data = data; - ti->taskEvent.reset(); - CurrentScheduler::ScheduleTask(lRunTask, ti); -} - - -void ISPCSync() { - for (int i = 0; i < nextTaskInfoCoordinate; ++i) { - int index = (i >> LOG_TASK_QUEUE_CHUNK_SIZE); - int offset = i & (TASK_QUEUE_CHUNK_SIZE-1); - taskInfo[index][offset].taskEvent.wait(); - taskInfo[index][offset].taskEvent.reset(); - } - - lResetTaskInfo(); -} - - -void *ISPCMalloc(int64_t size, int32_t alignment) { - return _aligned_malloc(size, alignment); -} - - -void ISPCFree(void *ptr) { - _aligned_free(ptr); -} diff --git a/examples/tasks_gcd.cpp b/examples/tasks_gcd.cpp deleted file mode 100644 index 16c871f0..00000000 --- a/examples/tasks_gcd.cpp +++ /dev/null @@ -1,126 +0,0 @@ -/* - Copyright (c) 2011, Intel Corporation - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. 
- - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#include "taskinfo.h" - -#if defined(_WIN32) || defined(_WIN64) -#define ISPC_IS_WINDOWS -#elif defined(__linux__) -#define ISPC_IS_LINUX -#elif defined(__APPLE__) -#define ISPC_IS_APPLE -#endif - -/* A simple task system for ispc programs based on Apple's Grand Central - Dispatch. */ -#include -#include -#include -#include - -static int initialized = 0; -static volatile int32_t lock = 0; -static dispatch_queue_t gcdQueue; -static dispatch_group_t gcdGroup; - -// ispc expects these functions to have C linkage / not be mangled -extern "C" { - void ISPCLaunch(void *f, void *data); - void ISPCSync(); - void *ISPCMalloc(int64_t size, int32_t alignment); - void ISPCFree(void *ptr); -} - - -static void -lRunTask(void *ti) { - TaskInfo *taskInfo = (TaskInfo *)ti; - // FIXME: these are bogus values; may cause bugs in code that depends - // on them having unique values in different threads. 
- int threadIndex = 0; - int threadCount = 1; - TaskFuncType func = (TaskFuncType)(taskInfo->func); - - // Actually run the task - func(taskInfo->data, threadIndex, threadCount); -} - - -void ISPCLaunch(void *func, void *data) { - if (!initialized) { - while (1) { - if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) { - if (!initialized) { - gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0); - gcdGroup = dispatch_group_create(); - lInitTaskInfo(); - __asm__ __volatile__("mfence":::"memory"); - initialized = 1; - } - lock = 0; - break; - } - } - } - - TaskInfo *ti = lGetTaskInfo(); - ti->func = func; - ti->data = data; - dispatch_group_async_f(gcdGroup, gcdQueue, ti, lRunTask); -} - - -void ISPCSync() { - if (!initialized) - return; - - // Wait for all of the tasks in the group to complete before returning - dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER); - - lResetTaskInfo(); -} - -void *ISPCMalloc(int64_t size, int32_t alignment) { - void *mem = malloc(size + (alignment-1) + sizeof(void*)); - char *amem = ((char*)mem) + sizeof(void*); - amem = amem + uint32_t(alignment - (reinterpret_cast(amem) & - (alignment - 1))); - ((void**)amem)[-1] = mem; - return amem; -} - - -void ISPCFree(void *ptr) { - free(((void**)ptr)[-1]); -} - diff --git a/examples/tasks_pthreads.cpp b/examples/tasks_pthreads.cpp deleted file mode 100644 index 90cdccd0..00000000 --- a/examples/tasks_pthreads.cpp +++ /dev/null @@ -1,339 +0,0 @@ -/* - Copyright (c) 2011, Intel Corporation - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*/ - -#if defined(_WIN32) || defined(_WIN64) -#define ISPC_IS_WINDOWS -#elif defined(__linux__) -#define ISPC_IS_LINUX -#elif defined(__APPLE__) -#define ISPC_IS_APPLE -#endif - -#include "taskinfo.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef ISPC_IS_LINUX -#include -#endif - -static int initialized = 0; -static volatile int32_t lock = 0; - -static int nThreads; -static pthread_t *threads; -static pthread_mutex_t taskQueueMutex; -static int nextTaskToRun; -static sem_t *workerSemaphore; -static uint32_t numUnfinishedTasks; -static pthread_mutex_t tasksRunningConditionMutex; -static pthread_cond_t tasksRunningCondition; - -// ispc expects these functions to have C linkage / not be mangled -extern "C" { - void ISPCLaunch(void *f, void *data); - void ISPCSync(); - void *ISPCMalloc(int64_t size, int32_t alignment); - void ISPCFree(void *ptr); -} - -static void *lTaskEntry(void *arg); - -/** Figure out how many CPU cores there are in the system - */ -static int -lNumCPUCores() { - return sysconf(_SC_NPROCESSORS_ONLN); -} - - -static void -lTasksInit() { - nThreads = lNumCPUCores(); - - threads = (pthread_t *)malloc(nThreads * sizeof(pthread_t)); - - int err; - if ((err = pthread_mutex_init(&taskQueueMutex, NULL)) != 0) { - fprintf(stderr, "Error creating mutex: %s\n", strerror(err)); - exit(1); - } - - char name[32]; - sprintf(name, "ispc_task.%d", (int)getpid()); - workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0); - if (!workerSemaphore) { - fprintf(stderr, "Error creating semaphore: %s\n", strerror(err)); - exit(1); - } - - if ((err = pthread_cond_init(&tasksRunningCondition, NULL)) != 0) { - fprintf(stderr, "Error creating condition variable: %s\n", strerror(err)); - exit(1); - } - - if ((err = pthread_mutex_init(&tasksRunningConditionMutex, NULL)) != 0) { - fprintf(stderr, "Error creating mutex: %s\n", strerror(err)); - exit(1); - } - - for (int i 
= 0; i < nThreads; ++i) { - err = pthread_create(&threads[i], NULL, &lTaskEntry, (void *)(i)); - if (err != 0) { - fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err)); - exit(1); - } - } -} - - -void -ISPCLaunch(void *f, void *d) { - int err; - - if (!initialized) { - while (1) { - if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) { - if (!initialized) { - lTasksInit(); - __asm__ __volatile__("mfence":::"memory"); - initialized = 1; - } - lock = 0; - break; - } - } - } - - // - // Acquire mutex, add task - // - if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) { - fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); - exit(1); - } - - // Need a mutex here to ensure we get this filled in before a worker - // grabs it and starts running... - TaskInfo *ti = lGetTaskInfo(); - ti->func = f; - ti->data = d; - - if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) { - fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); - exit(1); - } - - // - // Update count of number of tasks left to run - // - if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) { - fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); - exit(1); - } - - // FIXME: is this redundant with nextTaskInfoCoordinate? 
- ++numUnfinishedTasks; - - if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) { - fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); - exit(1); - } - - // - // Post to the worker semaphore to wake up worker threads that are - // sleeping waiting for tasks to show up - // - if ((err = sem_post(workerSemaphore)) != 0) { - fprintf(stderr, "Error from sem_post: %s\n", strerror(err)); - exit(1); - } -} - - -static void * -lTaskEntry(void *arg) { - int threadIndex = (int)((int64_t)arg); - int threadCount = nThreads; - TaskFuncType func; - - while (1) { - int err; - if ((err = sem_wait(workerSemaphore)) != 0) { - fprintf(stderr, "Error from sem_wait: %s\n", strerror(err)); - exit(1); - } - - // - // Acquire mutex, get task - // - if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) { - fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); - exit(1); - } - - if (nextTaskToRun == nextTaskInfoCoordinate) { - // - // Task queue is empty, go back and wait on the semaphore - // - if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) { - fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); - exit(1); - } - continue; - } - - int runCoord = nextTaskToRun++; - int index = (runCoord >> LOG_TASK_QUEUE_CHUNK_SIZE); - int offset = runCoord & (TASK_QUEUE_CHUNK_SIZE-1); - TaskInfo *myTask = &taskInfo[index][offset]; - - if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) { - fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); - exit(1); - } - - // - // Do work for _myTask_ - // - func = (TaskFuncType)myTask->func; - func(myTask->data, threadIndex, threadCount); - - // - // Decrement the number of unfinished tasks counter - // - if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) { - fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); - exit(1); - } - - // FIXME: can this be a comparison of (nextTaskToRun == nextTaskInfoCoordinate)? 
- // (I don't think so--think there is a race...) - int unfinished = --numUnfinishedTasks; - if (unfinished == 0) { - // - // Signal the "no more tasks are running" condition if all of - // them are done. - // - int err; - if ((err = pthread_cond_signal(&tasksRunningCondition)) != 0) { - fprintf(stderr, "Error from pthread_cond_signal: %s\n", strerror(err)); - exit(1); - } - } - - if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) { - fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); - exit(1); - } - } - - pthread_exit(NULL); - return 0; -} - - -void ISPCSync() { - int err; - if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) { - fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); - exit(1); - } - - // As long as there are tasks running, wait on the condition variable; - // doing so causes this thread to go to sleep until someone signals on - // the tasksRunningCondition condition variable. - while (numUnfinishedTasks > 0) { - if ((err = pthread_cond_wait(&tasksRunningCondition, - &tasksRunningConditionMutex)) != 0) { - fprintf(stderr, "Error from pthread_cond_wait: %s\n", strerror(err)); - exit(1); - } - } - - lResetTaskInfo(); - nextTaskToRun = 0; - - // We acquire ownership of the condition variable mutex when the above - // pthread_cond_wait returns. - // FIXME: is there a lurking issue here if numUnfinishedTasks gets back - // to zero by the time we get to ISPCSync() and thence we're trying to - // unlock a mutex we don't have a lock on? 
- if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) { - fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); - exit(1); - } -} - - -void *ISPCMalloc(int64_t size, int32_t alignment) { -#ifdef ISPC_IS_WINDOWS - return _aligned_malloc(size, alignment); -#endif -#ifdef ISPC_IS_LINUX - return memalign(alignment, size); -#endif -#ifdef ISPC_IS_APPLE - void *mem = malloc(size + (alignment-1) + sizeof(void*)); - char *amem = ((char*)mem) + sizeof(void*); - amem = amem + uint32_t(alignment - (reinterpret_cast(amem) & - (alignment - 1))); - ((void**)amem)[-1] = mem; - return amem; -#endif -} - - -void ISPCFree(void *ptr) { -#ifdef ISPC_IS_WINDOWS - _aligned_free(ptr); -#endif -#ifdef ISPC_IS_LINUX - free(ptr); -#endif -#ifdef ISPC_IS_APPLE - free(((void**)ptr)[-1]); -#endif -} - diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp new file mode 100644 index 00000000..fb970b59 --- /dev/null +++ b/examples/tasksys.cpp @@ -0,0 +1,868 @@ +/* + Copyright (c) 2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + This file implements simple task systems that provide the three + entrypoints used by ispc-generated code to handle 'launch' and 'sync' + statements in ispc programs. See the section "Task Parallelism: Language + Syntax" in the ispc documentation for information about using task + parallelism in ispc programs, and see the section "Task Parallelism: + Runtime Requirements" for information about the task-related entrypoints + that are implemented here. + + There are three task systems in this file: one built using Microsoft's + Concurrency Runtime, one built with Apple's Grand Central Dispatch, and + one built on top of bare pthreads. 
+*/ + +#if defined(_WIN32) || defined(_WIN64) + #define ISPC_IS_WINDOWS + #define ISPC_USE_CONCRT +#elif defined(__linux__) + #define ISPC_IS_LINUX + #define ISPC_USE_PTHREADS +#elif defined(__APPLE__) + #define ISPC_IS_APPLE + // pthreads is noticably more efficient than GCD on OSX + #define ISPC_USE_PTHREADS + //#define ISPC_USE_GCD +#endif + +#define DBG(x) + +#ifdef ISPC_IS_WINDOWS + #define NOMINMAX + #include +#endif // ISPC_IS_WINDOWS +#ifdef ISPC_USE_CONCRT + #include + using namespace Concurrency; +#endif // ISPC_USE_CONCRT +#ifdef ISPC_USE_GCD + #include + #include +#endif // ISPC_USE_GCD +#ifdef ISPC_USE_PTHREADS + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include +#endif // ISPC_USE_PTHREADS +#ifdef ISPC_IS_LINUX + #include +#endif // ISPC_IS_LINUX +#include +#include +#include +#include +#include +#include + +// Signature of ispc-generated 'task' functions +typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount, + int taskIndex, int taskCount); + +// Small structure used to hold the data for each task +struct TaskInfo { + TaskFuncType func; + void *data; + int taskIndex, taskCount; +#if defined(ISPC_IS_WINDOWS) + event taskEvent; +#endif +}; + +/////////////////////////////////////////////////////////////////////////// +// TaskGroupBase + +#define LOG_TASK_QUEUE_CHUNK_SIZE 12 +#define MAX_TASK_QUEUE_CHUNKS 8 +#define TASK_QUEUE_CHUNK_SIZE (1<> LOG_TASK_QUEUE_CHUNK_SIZE); + int offset = index & (TASK_QUEUE_CHUNK_SIZE-1); + + if (chunk == MAX_TASK_QUEUE_CHUNKS) { + fprintf(stderr, "A total of %d tasks have been launched from the " + "current function--the simple built-in task system can handle " + "no more. You can increase the values of TASK_QUEUE_CHUNK_SIZE " + "and LOG_TASK_QUEUE_CHUNK_SIZE to work around this limitation. " + "Sorry! 
Exiting.\n", index); + exit(1); + } + + if (taskInfo[chunk] == NULL) + taskInfo[chunk] = new TaskInfo[TASK_QUEUE_CHUNK_SIZE]; + return &taskInfo[chunk][offset]; +} + + +inline void * +TaskGroupBase::AllocMemory(int64_t size, int32_t alignment) { + char *basePtr = memBuffers[curMemBuffer]; + int64_t iptr = (int64_t)(basePtr + curMemBufferOffset); + iptr = (iptr + (alignment-1)) & ~(alignment-1); + + int newOffset = int(iptr + size - (int64_t)basePtr); + if (newOffset < memBufferSize[curMemBuffer]) { + curMemBufferOffset = newOffset; + return (char *)iptr; + } + + ++curMemBuffer; + curMemBufferOffset = 0; + assert(curMemBuffer < NUM_MEM_BUFFERS); + + int allocSize = 1 << (12 + curMemBuffer); + allocSize = std::max(int(size+alignment), allocSize); + char *newBuf = new char[allocSize]; + memBufferSize[curMemBuffer] = allocSize; + memBuffers[curMemBuffer] = newBuf; + return AllocMemory(size, alignment); +} + + +/////////////////////////////////////////////////////////////////////////// +// Atomics and the like + +#ifndef ISPC_IS_WINDOWS +static inline void +lMemFence() { + __asm__ __volatile__("mfence":::"memory"); +} +#endif // !ISPC_IS_WINDOWS + + +#if (__SIZEOF_POINTER__ == 4) || defined(__i386__) || defined(_WIN32) +#define ISPC_POINTER_BYTES 4 +#elif (__SIZEOF_POINTER__ == 8) || defined(__x86_64__) || defined(__amd64__) || defined(_WIN64) +#define ISPC_POINTER_BYTES 8 +#else +#error "Pointer size unknown!" 
+#endif // __SIZEOF_POINTER__ + + +static void * +lAtomicCompareAndSwapPointer(void **v, void *newValue, void *oldValue) { +#ifdef ISPC_IS_WINDOWS + return InterlockedCompareExchangePointer(v, newValue, oldValue); +#else + void *result; +#if (ISPC_POINTER_BYTES == 4) + __asm__ __volatile__("lock\ncmpxchgd %2,%1" + : "=a"(result), "=m"(*v) + : "q"(newValue), "0"(oldValue) + : "memory"); +#else + __asm__ __volatile__("lock\ncmpxchgq %2,%1" + : "=a"(result), "=m"(*v) + : "q"(newValue), "0"(oldValue) + : "memory"); +#endif // ISPC_POINTER_BYTES + lMemFence(); + return result; +#endif // ISPC_IS_WINDOWS +} + + + +#ifndef ISPC_IS_WINDOWS +static int32_t +lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue) { + int32_t result; + __asm__ __volatile__("lock\ncmpxchgl %2,%1" + : "=a"(result), "=m"(*v) + : "q"(newValue), "0"(oldValue) + : "memory"); + lMemFence(); + return result; +} +#endif // !ISPC_IS_WINDOWS + + +/////////////////////////////////////////////////////////////////////////// + +#ifdef ISPC_USE_CONCRT +// With ConcRT, we don't need to extend TaskGroupBase at all. +class TaskGroup : public TaskGroupBase { +public: + void Launch(int baseIndex, int count); + void Sync(); +}; +#endif // ISPC_USE_CONCRT + +#ifdef ISPC_USE_GCD +/* With Grand Central Dispatch, we associate a GCD dispatch group with each + task group. (We'll later wait on this dispatch group when we need to + wait on all of the tasks in the group to finish.) 
+ */ +class TaskGroup : public TaskGroupBase { +public: + TaskGroup() { + gcdGroup = dispatch_group_create(); + } + + void Launch(int baseIndex, int count); + void Sync(); + +private: + dispatch_group_t gcdGroup; +}; +#endif // ISPC_USE_GCD + +#ifdef ISPC_USE_PTHREADS +static void *lTaskEntry(void *arg); + +class TaskGroup : public TaskGroupBase { +public: + TaskGroup() { + numUnfinishedTasks = 0; + waitingTasks.reserve(128); + inActiveList = false; + } + + void Reset() { + TaskGroupBase::Reset(); + numUnfinishedTasks = 0; + assert(inActiveList == false); + lMemFence(); + } + + void Launch(int baseIndex, int count); + void Sync(); + +private: + friend void *lTaskEntry(void *arg); + + int32_t numUnfinishedTasks; + int32_t pad[3]; + std::vector waitingTasks; + bool inActiveList; +}; + +#endif // ISPC_USE_PTHREADS + + +/////////////////////////////////////////////////////////////////////////// +// Grand Central Dispatch + +#ifdef ISPC_USE_GCD + +/* A simple task system for ispc programs based on Apple's Grand Central + Dispatch. */ + +static dispatch_queue_t gcdQueue; +static volatile int32_t lock = 0; + +static void +InitTaskSystem() { + if (gcdQueue != NULL) + return; + + while (1) { + if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) { + if (gcdQueue == NULL) { + gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0); + assert(gcdQueue != NULL); + lMemFence(); + } + lock = 0; + break; + } + } +} + + +static void +lRunTask(void *ti) { + TaskInfo *taskInfo = (TaskInfo *)ti; + // FIXME: these are bogus values; may cause bugs in code that depends + // on them having unique values in different threads. 
+ int threadIndex = 0; + int threadCount = 1; + + // Actually run the task + taskInfo->func(taskInfo->data, threadIndex, threadCount, + taskInfo->taskIndex, taskInfo->taskCount); +} + + +inline void +TaskGroup::Launch(int baseIndex, int count) { + for (int i = 0; i < count; ++i) { + TaskInfo *ti = GetTaskInfo(baseIndex + i); + dispatch_group_async_f(gcdGroup, gcdQueue, ti, lRunTask); + } +} + + +inline void +TaskGroup::Sync() { + dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER); +} + +#endif // ISPC_USE_GCD + +/////////////////////////////////////////////////////////////////////////// +// Concurrency Runtime + +#ifdef ISPC_USE_CONCRT + +static void +InitTaskSystem() { + // No initialization needed +} + + +static void __cdecl +lRunTask(LPVOID param) { + TaskInfo *ti = (TaskInfo *)param; + + // Actually run the task. + // FIXME: like the GCD implementation for OS X, this is passing bogus + // values for the threadIndex and threadCount builtins, which in turn + // will cause bugs in code that uses those. 
+ int threadIndex = 0; + int threadCount = 1; + ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount); + + // Signal the event that this task is done + ti->taskEvent.set(); +} + + +inline void +TaskGroup::Launch(int baseIndex, int count) { + for (int i = 0; i < count; ++i) + CurrentScheduler::ScheduleTask(lRunTask, GetTaskInfo(baseIndex + i)); +} + + +inline void +TaskGroup::Sync() { + for (int i = 0; i < nextTaskInfoIndex; ++i) { + TaskInfo *ti = GetTaskInfo(i); + ti->taskEvent.wait(); + ti->taskEvent.reset(); + } +} + +#endif // ISPC_USE_CONCRT + +/////////////////////////////////////////////////////////////////////////// +// pthreads + +#ifdef ISPC_USE_PTHREADS + +static volatile int32_t lock = 0; + +static int nThreads; +static pthread_t *threads = NULL; + +static pthread_mutex_t taskSysMutex; +static std::vector activeTaskGroups; +static sem_t *workerSemaphore; + + +static inline int32_t +lAtomicAdd(int32_t *v, int32_t delta) { + int32_t origValue; + __asm__ __volatile__("lock\n" + "xaddl %0,%1" + : "=r"(origValue), "=m"(*v) : "0"(delta) + : "memory"); + return origValue; +} + + +static void * +lTaskEntry(void *arg) { + int threadIndex = (int)((int64_t)arg); + int threadCount = nThreads; + + while (1) { + int err; + // + // Wait on the semaphore until we're woken up due to the arrival of + // more work. 
+ // + if ((err = sem_wait(workerSemaphore)) != 0) { + fprintf(stderr, "Error from sem_wait: %s\n", strerror(err)); + exit(1); + } + + // + // Acquire the mutex + // + if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); + exit(1); + } + + if (activeTaskGroups.size() == 0) { + // + // Task queue is empty, go back and wait on the semaphore + // + if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); + exit(1); + } + continue; + } + + // + // Get the last task group on the active list and the last task + // from its waiting tasks list. + // + TaskGroup *tg = activeTaskGroups.back(); + assert(tg->waitingTasks.size() > 0); + int taskNumber = tg->waitingTasks.back(); + tg->waitingTasks.pop_back(); + + if (tg->waitingTasks.size() == 0) { + // We just took the last task from this task group, so remove + // it from the active list. + activeTaskGroups.pop_back(); + tg->inActiveList = false; + } + + if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); + exit(1); + } + + // + // And now actually run the task + // + DBG(fprintf(stderr, "running task %d from group %p\n", taskNumber, tg)); + TaskInfo *myTask = tg->GetTaskInfo(taskNumber); + myTask->func(myTask->data, threadIndex, threadCount, myTask->taskIndex, + myTask->taskCount); + + // + // Decrement the "number of unfinished tasks" counter in the task + // group. + // + lMemFence(); + lAtomicAdd(&tg->numUnfinishedTasks, -1); + } + + pthread_exit(NULL); + return 0; +} + + +static void +InitTaskSystem() { + if (threads == NULL) { + while (1) { + if (lAtomicCompareAndSwap32(&lock, 1, 0) == 0) { + if (threads == NULL) { + // We launch one fewer thread than there are cores, + // since the main thread here will also grab jobs from + // the task queue itself. 
+ nThreads = sysconf(_SC_NPROCESSORS_ONLN) - 1; + + int err; + if ((err = pthread_mutex_init(&taskSysMutex, NULL)) != 0) { + fprintf(stderr, "Error creating mutex: %s\n", strerror(err)); + exit(1); + } + + char name[32]; + sprintf(name, "ispc_task.%d", (int)getpid()); + workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0); + if (!workerSemaphore) { + fprintf(stderr, "Error creating semaphore: %s\n", strerror(err)); + exit(1); + } + + threads = (pthread_t *)malloc(nThreads * sizeof(pthread_t)); + for (int i = 0; i < nThreads; ++i) { + err = pthread_create(&threads[i], NULL, &lTaskEntry, (void *)(i)); + if (err != 0) { + fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err)); + exit(1); + } + } + + activeTaskGroups.reserve(64); + } + + // Make sure all of the above goes to memory before we + // clear the lock. + lMemFence(); + lock = 0; + break; + } + } + } +} + + +inline void +TaskGroup::Launch(int baseCoord, int count) { + // + // Acquire mutex, add task + // + int err; + if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); + exit(1); + } + + // Add the corresponding set of tasks to the waiting-to-be-run list for + // this task group. + // + // FIXME: it's a little ugly to hold a global mutex for this when we + // only need to make sure no one else is accessing this task group's + // waitingTasks list. (But a small experiment in switching to a + // per-TaskGroup mutex showed worse performance!) + for (int i = 0; i < count; ++i) + waitingTasks.push_back(baseCoord + i); + + // Add the task group to the global active list if it isn't there + // already. 
+ if (inActiveList == false) { + activeTaskGroups.push_back(this); + inActiveList = true; + } + + if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); + exit(1); + } + + // + // Update the count of the number of tasks left to run in this task + // group. + // + lMemFence(); + lAtomicAdd(&numUnfinishedTasks, count); + + // + // Post to the worker semaphore to wake up worker threads that are + // sleeping waiting for tasks to show up + // + for (int i = 0; i < count; ++i) + if ((err = sem_post(workerSemaphore)) != 0) { + fprintf(stderr, "Error from sem_post: %s\n", strerror(err)); + exit(1); + } +} + + +inline void +TaskGroup::Sync() { + DBG(fprintf(stderr, "syncing %p - %d unfinished\n", tg, numUnfinishedTasks)); + + while (numUnfinishedTasks > 0) { + // Not all of the tasks in this group are finished yet. We'll try + // to help out here since we don't have anything else to do... + + DBG(fprintf(stderr, "while syncing %p - %d unfinished\n", tg, + numUnfinishedTasks)); + + // + // Acquire the global task system mutex to grab a task to work on + // + int err; + if ((err = pthread_mutex_lock(&taskSysMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); + exit(1); + } + + TaskInfo *myTask = NULL; + TaskGroup *runtg = this; + if (waitingTasks.size() > 0) { + int taskNumber = waitingTasks.back(); + waitingTasks.pop_back(); + + if (waitingTasks.size() == 0) { + // There's nothing left to start running from this group, + // so remove it from the active task list. + activeTaskGroups.erase(std::find(activeTaskGroups.begin(), + activeTaskGroups.end(), this)); + inActiveList = false; + } + myTask = GetTaskInfo(taskNumber); + DBG(fprintf(stderr, "running task %d from group %p in sync\n", taskNumber, tg)); + } + else { + // Other threads are already working on all of the tasks in + // this group, so we can't help out by running one ourselves. 
+ // We'll try to run one from another group to make ourselves + // useful here. + if (activeTaskGroups.size() == 0) { + // No active task groups left--there's nothing for us to do. + if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); + exit(1); + } + // FIXME: We basically end up busy-waiting here, which is + // extra wasteful in a world with hyperthreading. It would + // be much better to put this thread to sleep on a + // condition variable that was signaled when the last task + // in this group was finished. + sleep(0); + continue; + } + + // Get a task to run from another task group. + runtg = activeTaskGroups.back(); + assert(runtg->waitingTasks.size() > 0); + + int taskNumber = runtg->waitingTasks.back(); + runtg->waitingTasks.pop_back(); + if (runtg->waitingTasks.size() == 0) { + // There's nothing left to start running from this group, so remove + // it from the active task list. + activeTaskGroups.pop_back(); + runtg->inActiveList = false; + } + myTask = runtg->GetTaskInfo(taskNumber); + DBG(fprintf(stderr, "running task %d from other group %p in sync\n", + taskNumber, runtg)); + } + + if ((err = pthread_mutex_unlock(&taskSysMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); + exit(1); + } + + // + // Do work for _myTask_ + // + // FIXME: bogus values for thread index/thread count here as well.. 
+ myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount); + + // + // Decrement the number of unfinished tasks counter + // + lMemFence(); + lAtomicAdd(&runtg->numUnfinishedTasks, -1); + } + DBG(fprintf(stderr, "sync for %p done!n", tg)); +} + +#endif // ISPC_USE_PTHREADS + +/////////////////////////////////////////////////////////////////////////// + +#define MAX_FREE_TASK_GROUPS 64 +static TaskGroup *freeTaskGroups[MAX_FREE_TASK_GROUPS]; + +static inline TaskGroup * +AllocTaskGroup() { + for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) { + TaskGroup *tg = freeTaskGroups[i]; + if (tg != NULL) { + void *ptr = lAtomicCompareAndSwapPointer((void **)(&freeTaskGroups[i]), NULL, tg); + if (ptr != NULL) { + assert(ptr == tg); + return (TaskGroup *)ptr; + } + } + } + + return new TaskGroup; +} + + +static inline void +FreeTaskGroup(TaskGroup *tg) { + tg->Reset(); + + for (int i = 0; i < MAX_FREE_TASK_GROUPS; ++i) { + if (freeTaskGroups[i] == NULL) { + void *ptr = lAtomicCompareAndSwapPointer((void **)&freeTaskGroups[i], tg, NULL); + if (ptr == NULL) + return; + } + } + + delete tg; +} + +/////////////////////////////////////////////////////////////////////////// + +// ispc expects these functions to have C linkage / not be mangled +extern "C" { + void ISPCLaunch(void **handlePtr, void *f, void *data, int count); + void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment); + void ISPCSync(void *handle); +} + +void +ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) { + TaskGroup *taskGroup; + if (*taskGroupPtr == NULL) { + InitTaskSystem(); + taskGroup = AllocTaskGroup(); + *taskGroupPtr = taskGroup; + } + else + taskGroup = (TaskGroup *)(*taskGroupPtr); + + int baseIndex = taskGroup->AllocTaskInfo(count); + for (int i = 0; i < count; ++i) { + TaskInfo *ti = taskGroup->GetTaskInfo(baseIndex+i); + ti->func = (TaskFuncType)func; + ti->data = data; + ti->taskIndex = i; + ti->taskCount = count; + } + taskGroup->Launch(baseIndex, count); 
+} + + +void +ISPCSync(void *h) { + TaskGroup *taskGroup = (TaskGroup *)h; + if (taskGroup != NULL) { + taskGroup->Sync(); + FreeTaskGroup(taskGroup); + } +} + + +void * +ISPCAlloc(void **taskGroupPtr, int64_t size, int32_t alignment) { + TaskGroup *taskGroup; + if (*taskGroupPtr == NULL) { + InitTaskSystem(); + taskGroup = AllocTaskGroup(); + *taskGroupPtr = taskGroup; + } + else + taskGroup = (TaskGroup *)(*taskGroupPtr); + + return taskGroup->AllocMemory(size, alignment); +} diff --git a/examples/volume_rendering/Makefile b/examples/volume_rendering/Makefile index 17880557..0458c017 100644 --- a/examples/volume_rendering/Makefile +++ b/examples/volume_rendering/Makefile @@ -1,14 +1,8 @@ ARCH = $(shell uname) -TASK_CXX=../tasks_pthreads.cpp +TASK_CXX=../tasksys.cpp TASK_LIB=-lpthread - -ifeq ($(ARCH), Darwin) - TASK_CXX=../tasks_gcd.cpp - TASK_LIB= -endif - TASK_OBJ=$(addprefix objs/, $(subst ../,, $(TASK_CXX:.cpp=.o))) CXX=g++ diff --git a/examples/volume_rendering/volume.ispc b/examples/volume_rendering/volume.ispc index 4e95ae04..905db46a 100644 --- a/examples/volume_rendering/volume.ispc +++ b/examples/volume_rendering/volume.ispc @@ -343,11 +343,20 @@ volume_tile(uniform int x0, uniform int y0, uniform int x1, task void -volume_task(uniform int x0, uniform int y0, uniform int x1, - uniform int y1, uniform float density[], uniform int nVoxels[3], +volume_task(uniform float density[], uniform int nVoxels[3], const uniform float raster2camera[4][4], const uniform float camera2world[4][4], uniform int width, uniform int height, uniform float image[]) { + uniform int dx = 8, dy = 8; // must match value in volume_ispc_tasks + uniform int xbuckets = (width + (dx-1)) / dx; + uniform int ybuckets = (height + (dy-1)) / dy; + + uniform int x0 = (taskIndex % xbuckets) * dx; + uniform int y0 = (taskIndex / ybuckets) * dy; + uniform int x1 = x0 + dx, y1 = y0 + dy; + x1 = min(x1, width); + y1 = min(y1, height); + volume_tile(x0, y0, x1, y1, density, nVoxels, raster2camera, 
camera2world, width, height, image); } @@ -370,9 +379,7 @@ volume_ispc_tasks(uniform float density[], uniform int nVoxels[3], uniform int width, uniform int height, uniform float image[]) { // Launch tasks to work on (dx,dy)-sized tiles of the image uniform int dx = 8, dy = 8; - for (uniform int y = 0; y < height; y += dy) - for (uniform int x = 0; x < width; x += dx) - launch < volume_task(x, y, x+dx, y+dy, density, nVoxels, - raster2camera, camera2world, width, height, - image) >; + uniform int nTasks = ((width+(dx-1))/dx) * ((height+(dy-1))/dy); + launch[nTasks] < volume_task(density, nVoxels, raster2camera, camera2world, + width, height, image) >; } diff --git a/examples/volume_rendering/volume.vcxproj b/examples/volume_rendering/volume.vcxproj old mode 100755 new mode 100644 index fecf79d6..540c8421 --- a/examples/volume_rendering/volume.vcxproj +++ b/examples/volume_rendering/volume.vcxproj @@ -143,7 +143,7 @@ - + diff --git a/expr.cpp b/expr.cpp index b36a423c..8b665ff0 100644 --- a/expr.cpp +++ b/expr.cpp @@ -1192,7 +1192,7 @@ BinaryExpr::Optimize() { Expr *rcpSymExpr = new FunctionSymbolExpr("rcp", rcpFuns, pos); ExprList *args = new ExprList(arg1, arg1->pos); Expr *rcpCall = new FunctionCallExpr(rcpSymExpr, args, - arg1->pos, false); + arg1->pos); rcpCall = rcpCall->TypeCheck(); if (rcpCall == NULL) return NULL; @@ -2260,11 +2260,12 @@ FunctionCallExpr::resolveFunctionOverloads(bool exactMatchOnly) { } -FunctionCallExpr::FunctionCallExpr(Expr *f, ExprList *a, SourcePos p, bool il) - : Expr(p) { +FunctionCallExpr::FunctionCallExpr(Expr *f, ExprList *a, SourcePos p, + bool il, Expr *lce) + : Expr(p), isLaunch(il) { func = f; args = a; - isLaunch = il; + launchCountExpr = lce; FunctionSymbolExpr *fse = dynamic_cast(func); // Functions with names that start with "__" should only be various @@ -2400,8 +2401,12 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const { llvm::Value *retVal = NULL; ctx->SetDebugPos(pos); - if (ft->isTask) - 
ctx->LaunchInst(callee, argVals); + if (ft->isTask) { + assert(launchCountExpr != NULL); + llvm::Value *launchCount = launchCountExpr->GetValue(ctx); + if (launchCount != NULL) + ctx->LaunchInst(callee, argVals, launchCount); + } else { // Most of the time, the mask is passed as the last argument. this // isn't the case for things like intrinsics, builtins, and extern @@ -2477,10 +2482,21 @@ FunctionCallExpr::TypeCheck() { if (!isLaunch) Error(pos, "\"launch\" expression needed to call function " "with \"task\" qualifier."); + if (!launchCountExpr) + return NULL; + + launchCountExpr = + launchCountExpr->TypeConv(AtomicType::UniformInt32, + "task launch count"); + if (!launchCountExpr) + return NULL; + } + else { + if (isLaunch) + Error(pos, "\"launch\" expression illegal with non-\"task\"-" + "qualified function."); + assert(launchCountExpr == NULL); } - else if (isLaunch) - Error(pos, "\"launch\" expression illegal with non-\"task\"-" - "qualified function."); } else Error(pos, "Valid function name must be used for function call."); @@ -5281,14 +5297,8 @@ SyncExpr::GetType() const { llvm::Value * SyncExpr::GetValue(FunctionEmitContext *ctx) const { ctx->SetDebugPos(pos); - std::vector noArg; - llvm::Function *fsync = m->module->getFunction("ISPCSync"); - if (fsync == NULL) { - FATAL("Couldn't find ISPCSync declaration?!"); - return NULL; - } - - return ctx->CallInst(fsync, noArg, ""); + ctx->SyncInst(); + return NULL; } diff --git a/expr.h b/expr.h index 9af33f8b..4c9fd657 100644 --- a/expr.h +++ b/expr.h @@ -250,7 +250,8 @@ public: */ class FunctionCallExpr : public Expr { public: - FunctionCallExpr(Expr *func, ExprList *args, SourcePos p, bool isLaunch); + FunctionCallExpr(Expr *func, ExprList *args, SourcePos p, + bool isLaunch = false, Expr *launchCountExpr = NULL); llvm::Value *GetValue(FunctionEmitContext *ctx) const; const Type *GetType() const; @@ -263,6 +264,7 @@ public: Expr *func; ExprList *args; bool isLaunch; + Expr *launchCountExpr; private: void 
resolveFunctionOverloads(bool exactMatchOnly); diff --git a/ispc_test.cpp b/ispc_test.cpp index d1507587..a1a6d3d3 100644 --- a/ispc_test.cpp +++ b/ispc_test.cpp @@ -98,24 +98,27 @@ extern "C" { bool shouldFail = false; extern "C" { - void ISPCLaunch(void *, void *); - void ISPCSync(); - void *ISPCMalloc(int64_t size, int32_t alignment); - void ISPCFree(void *ptr); + void ISPCLaunch(void **, void *, void *, int32_t); + void ISPCSync(void *); + void *ISPCAlloc(void **, int64_t size, int32_t alignment); } -void ISPCLaunch(void *func, void *data) { - typedef void (*TaskFuncType)(void *, int, int); +void ISPCLaunch(void **handle, void *func, void *data, int32_t count) { + *handle = (void *)0xdeadbeef; + typedef void (*TaskFuncType)(void *, int, int, int, int); TaskFuncType tft = (TaskFuncType)(func); - tft(data, 0, 1); + for (int i = 0; i < count; ++i) + tft(data, 0, 1, i, count); } -void ISPCSync() { +void ISPCSync(void *) { } -void *ISPCMalloc(int64_t size, int32_t alignment) { +void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) { + *handle = (void *)0xdeadbeef; + // leak time! 
#ifdef ISPC_IS_WINDOWS return _aligned_malloc(size, alignment); #endif @@ -133,18 +136,6 @@ void *ISPCMalloc(int64_t size, int32_t alignment) { } -void ISPCFree(void *ptr) { -#ifdef ISPC_IS_WINDOWS - _aligned_free(ptr); -#endif -#ifdef ISPC_IS_LINUX - free(ptr); -#endif -#ifdef ISPC_IS_APPLE - free(((void**)ptr)[-1]); -#endif -} - static void usage(int ret) { fprintf(stderr, "usage: ispc_test\n"); fprintf(stderr, "\t[-h/--help]\tprint help\n"); @@ -217,8 +208,7 @@ static bool lRunTest(const char *fn) { ee->addGlobalMapping(func, (void *)FUNC) DO_FUNC(ISPCLaunch, "ISPCLaunch"); DO_FUNC(ISPCSync, "ISPCSync"); - DO_FUNC(ISPCMalloc, "ISPCMalloc"); - DO_FUNC(ISPCFree, "ISPCFree"); + DO_FUNC(ISPCAlloc, "ISPCAlloc"); DO_FUNC(putchar, "putchar"); DO_FUNC(printf, "printf"); DO_FUNC(fflush, "fflush"); diff --git a/module.cpp b/module.cpp index 068bbaee..bc8d04b7 100644 --- a/module.cpp +++ b/module.cpp @@ -627,6 +627,8 @@ lEmitFunctionCode(FunctionEmitContext *ctx, llvm::Function *function, llvm::Value *structParamPtr = argIter++; llvm::Value *threadIndex = argIter++; llvm::Value *threadCount = argIter++; + llvm::Value *taskIndex = argIter++; + llvm::Value *taskCount = argIter++; // Copy the function parameter values from the structure into local // storage @@ -654,18 +656,17 @@ lEmitFunctionCode(FunctionEmitContext *ctx, llvm::Function *function, threadCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadCount"); ctx->StoreInst(threadCount, threadCountSym->storagePtr); -#ifdef ISPC_IS_WINDOWS - // On Windows, we dynamically-allocate space for the task arguments - // (see FunctionEmitContext::LaunchInst().) Here is where we emit - // the code to free that memory, now that we've copied the - // parameter values out of the structure. - ctx->EmitFree(structParamPtr); -#else - // We also do this for AVX... (See discussion in - // FunctionEmitContext::LaunchInst().) 
- if (g->target.isa == Target::AVX) - ctx->EmitFree(structParamPtr); -#endif // ISPC_IS_WINDOWS + // Copy taskIndex and taskCount into stack-allocated storage so + // that their symbols point to something reasonable. + Symbol *taskIndexSym = m->symbolTable->LookupVariable("taskIndex"); + assert(taskIndexSym); + taskIndexSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex"); + ctx->StoreInst(taskIndex, taskIndexSym->storagePtr); + + Symbol *taskCountSym = m->symbolTable->LookupVariable("taskCount"); + assert(taskCountSym); + taskCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount"); + ctx->StoreInst(taskCount, taskCountSym->storagePtr); } else { // Regular, non-task function diff --git a/parse.yy b/parse.yy index ac75075b..d51420e5 100644 --- a/parse.yy +++ b/parse.yy @@ -165,7 +165,7 @@ static const char *lParamListTokens[] = { %token TOKEN_CBREAK TOKEN_CCONTINUE TOKEN_CRETURN TOKEN_SYNC TOKEN_PRINT %type primary_expression postfix_expression -%type unary_expression cast_expression +%type unary_expression cast_expression launch_expression %type multiplicative_expression additive_expression shift_expression %type relational_expression equality_expression and_expression %type exclusive_or_expression inclusive_or_expression @@ -257,18 +257,32 @@ primary_expression | '(' expression ')' { $$ = $2; } ; +launch_expression + : TOKEN_LAUNCH '<' postfix_expression '(' argument_expression_list ')' '>' + { + ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @3); + $$ = new FunctionCallExpr($3, $5, @3, true, oneExpr); + } + | TOKEN_LAUNCH '<' postfix_expression '(' ')' '>' + { + ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @3); + $$ = new FunctionCallExpr($3, new ExprList(@3), @3, true, oneExpr); + } + | TOKEN_LAUNCH '[' expression ']' '<' postfix_expression '(' argument_expression_list ')' '>' + { $$ = new FunctionCallExpr($6, $8, @6, true, $3); } + | TOKEN_LAUNCH '[' expression ']' '<' 
postfix_expression '(' ')' '>' + { $$ = new FunctionCallExpr($6, new ExprList(@6), @6, true, $3); } + ; + postfix_expression : primary_expression | postfix_expression '[' expression ']' { $$ = new IndexExpr($1, $3, @1); } | postfix_expression '(' ')' - { $$ = new FunctionCallExpr($1, new ExprList(@1), @1, false); } + { $$ = new FunctionCallExpr($1, new ExprList(@1), @1); } | postfix_expression '(' argument_expression_list ')' - { $$ = new FunctionCallExpr($1, $3, @1, false); } - | TOKEN_LAUNCH '<' postfix_expression '(' argument_expression_list ')' '>' - { $$ = new FunctionCallExpr($3, $5, @3, true); } - | TOKEN_LAUNCH '<' postfix_expression '(' ')' '>' - { $$ = new FunctionCallExpr($3, new ExprList(@3), @3, true); } + { $$ = new FunctionCallExpr($1, $3, @1); } + | launch_expression | postfix_expression '.' TOKEN_IDENTIFIER { $$ = MemberExpr::create($1, yytext, @1, @3); } /* | postfix_expression TOKEN_PTR_OP TOKEN_IDENTIFIER @@ -1283,6 +1297,12 @@ static void lAddThreadIndexCountToSymbolTable(SourcePos pos) { Symbol *threadCountSym = new Symbol("threadCount", pos, AtomicType::UniformConstUInt32); m->symbolTable->AddVariable(threadCountSym); + + Symbol *taskIndexSym = new Symbol("taskIndex", pos, AtomicType::UniformConstUInt32); + m->symbolTable->AddVariable(taskIndexSym); + + Symbol *taskCountSym = new Symbol("taskCount", pos, AtomicType::UniformConstUInt32); + m->symbolTable->AddVariable(taskCountSym); } diff --git a/test_static.cpp b/test_static.cpp index 38be15c8..06960fd2 100644 --- a/test_static.cpp +++ b/test_static.cpp @@ -58,23 +58,26 @@ extern "C" { extern void f_di(float *result, double *a, int *b); extern void result(float *val); - void ISPCLaunch(void *f, void *d); - void ISPCSync(); - void *ISPCMalloc(int64_t size, int32_t alignment); - void ISPCFree(void *ptr); + void ISPCLaunch(void **handlePtr, void *f, void *d, int); + void ISPCSync(void *handle); + void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment); } -void ISPCLaunch(void *f, void 
*d) { - typedef void (*TaskFuncType)(void *, int, int); +void ISPCLaunch(void **handle, void *f, void *d, int count) { + *handle = (void *)0xdeadbeef; + typedef void (*TaskFuncType)(void *, int, int, int, int); TaskFuncType func = (TaskFuncType)f; - func(d, 0, 1); + for (int i = 0; i < count; ++i) + func(d, 0, 1, i, count); } -void ISPCSync() { +void ISPCSync(void *) { } -void *ISPCMalloc(int64_t size, int32_t alignment) { +void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) { + *handle = (void *)0xdeadbeef; + // and now, we leak... #ifdef ISPC_IS_WINDOWS return _aligned_malloc(size, alignment); #endif @@ -92,18 +95,6 @@ void *ISPCMalloc(int64_t size, int32_t alignment) { } -void ISPCFree(void *ptr) { -#ifdef ISPC_IS_WINDOWS - _aligned_free(ptr); -#endif -#ifdef ISPC_IS_LINUX - free(ptr); -#endif -#ifdef ISPC_IS_APPLE - free(((void**)ptr)[-1]); -#endif -} - int main(int argc, char *argv[]) { int w = width(); diff --git a/tests/launch-1.ispc b/tests/launch-1.ispc new file mode 100644 index 00000000..c8ca0207 --- /dev/null +++ b/tests/launch-1.ispc @@ -0,0 +1,26 @@ + +export uniform int width() { return programCount; } + + +static uniform float array[10000]; + +task void x(float f) { + uniform int j; + uniform int i = taskIndex; + array[i] = i / 10000.; + cfor (j = 0; j < 10000; ++j) + array[i] = sin(array[i]); + if (array[i] < .02) + array[i] = i; +} +export void f_f(uniform float RET[], uniform float fFOO[]) { + float f = fFOO[programIndex]; + launch[10000] < x(f) >; + sync; + RET[programIndex] = array[9999]; +} + + +export void result(uniform float RET[]) { + RET[programIndex] = 9999.000000; +} diff --git a/tests/cfor-test-101.ispc b/tests/launch-2.ispc similarity index 100% rename from tests/cfor-test-101.ispc rename to tests/launch-2.ispc diff --git a/tests/test-100.ispc b/tests/launch-3.ispc similarity index 100% rename from tests/test-100.ispc rename to tests/launch-3.ispc diff --git a/tests/launch-4.ispc b/tests/launch-4.ispc new file mode 100644 
index 00000000..65231b5e --- /dev/null +++ b/tests/launch-4.ispc @@ -0,0 +1,18 @@ + +export uniform int width() { return programCount; } + + +static float array[6]; +task void x(uniform int i, float f) { + array[i] = f; +} +export void f_fu(uniform float RET[], uniform float fFOO[], uniform float fu) { + float f = fFOO[programIndex]; + launch[1] < x(fu, f) >; + sync; + RET[programIndex] = array[5]; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1+programIndex; +} diff --git a/tests/test-101.ispc b/tests/launch-5.ispc similarity index 100% rename from tests/test-101.ispc rename to tests/launch-5.ispc diff --git a/tests/test-102.ispc b/tests/launch-6.ispc similarity index 100% rename from tests/test-102.ispc rename to tests/launch-6.ispc diff --git a/tests/launch-7.ispc b/tests/launch-7.ispc new file mode 100644 index 00000000..08a01432 --- /dev/null +++ b/tests/launch-7.ispc @@ -0,0 +1,19 @@ + +export uniform int width() { return programCount; } + + +static uniform float array[10]; +task void foo(uniform float f) { array[0] = f; } +task void foo(uniform float f, uniform int i) { array[i] = f; } + +export void f_v(uniform float RET[]) { + launch[1] < foo(12.) >; + launch[1] < foo(-1., 1) >; + sync; + RET[programIndex] = array[0] + array[1]; +} + + +export void result(uniform float RET[]) { + RET[programIndex] = 11.000000; +} diff --git a/type.cpp b/type.cpp index 69e6646c..2353b8e0 100644 --- a/type.cpp +++ b/type.cpp @@ -1856,6 +1856,8 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool includeMask) const { callTypes.push_back(llvm::PointerType::getUnqual(st)); callTypes.push_back(LLVMTypes::Int32Type); // threadIndex callTypes.push_back(LLVMTypes::Int32Type); // threadCount + callTypes.push_back(LLVMTypes::Int32Type); // taskIndex + callTypes.push_back(LLVMTypes::Int32Type); // taskCount } else // Otherwise we already have the types of the arguments