From 78a05777bc6a6d41bfb553cdc66cdf16fc4fc507 Mon Sep 17 00:00:00 2001 From: egaburov Date: Tue, 22 Oct 2013 16:18:40 +0200 Subject: [PATCH 01/40] added taskIndex_x,y,z and taskCount_x,y,z --- func.cpp | 39 +++++++++++++++++++++++++++++++++++++++ func.h | 5 ++++- parse.yy | 17 ++++++++++++++++- type.cpp | 6 ++++++ 4 files changed, 65 insertions(+), 2 deletions(-) diff --git a/func.cpp b/func.cpp index b975049b..dea45afc 100644 --- a/func.cpp +++ b/func.cpp @@ -132,9 +132,28 @@ Function::Function(Symbol *s, Stmt *c) { Assert(taskIndexSym); taskCountSym = m->symbolTable->LookupVariable("taskCount"); Assert(taskCountSym); + + taskIndexSym_x = m->symbolTable->LookupVariable("taskIndex_x"); + Assert(taskIndexSym_x); + taskIndexSym_y = m->symbolTable->LookupVariable("taskIndex_y"); + Assert(taskIndexSym_y); + taskIndexSym_z = m->symbolTable->LookupVariable("taskIndex_z"); + Assert(taskIndexSym_z); + + + taskCountSym_x = m->symbolTable->LookupVariable("taskCount_x"); + Assert(taskCountSym_x); + taskCountSym_y = m->symbolTable->LookupVariable("taskCount_y"); + Assert(taskCountSym_y); + taskCountSym_z = m->symbolTable->LookupVariable("taskCount_z"); + Assert(taskCountSym_z); } else + { threadIndexSym = threadCountSym = taskIndexSym = taskCountSym = NULL; + taskIndexSym_x = taskIndexSym_y = taskIndexSym_z = NULL; + taskCountSym_x = taskCountSym_y = taskCountSym_z = NULL; + } } @@ -225,6 +244,12 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, llvm::Value *threadCount = argIter++; llvm::Value *taskIndex = argIter++; llvm::Value *taskCount = argIter++; + llvm::Value *taskIndex_x = argIter++; + llvm::Value *taskIndex_y = argIter++; + llvm::Value *taskIndex_z = argIter++; + llvm::Value *taskCount_x = argIter++; + llvm::Value *taskCount_y = argIter++; + llvm::Value *taskCount_z = argIter++; // Copy the function parameter values from the structure into local // storage @@ -256,6 +281,20 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, taskCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount"); ctx->StoreInst(taskCount, taskCountSym->storagePtr); + + taskIndexSym_x->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex_x"); + ctx->StoreInst(taskIndex_x, taskIndexSym_x->storagePtr); + taskIndexSym_y->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex_y"); + ctx->StoreInst(taskIndex_y, taskIndexSym_y->storagePtr); + taskIndexSym_z->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex_z"); + ctx->StoreInst(taskIndex_z, taskIndexSym_z->storagePtr); + + taskCountSym_x->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount_x"); + ctx->StoreInst(taskCount_x, taskCountSym_x->storagePtr); + taskCountSym_y->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount_y"); + ctx->StoreInst(taskCount_y, taskCountSym_y->storagePtr); + taskCountSym_z->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount_z"); + ctx->StoreInst(taskCount_z, taskCountSym_z->storagePtr); } else { // Regular, non-task function diff --git a/func.h b/func.h index ac3e1447..ee44a6c5 100644 --- a/func.h +++ b/func.h @@ -60,7 +60,10 @@ private: Stmt *code; Symbol *maskSymbol; Symbol *threadIndexSym, *threadCountSym; - Symbol *taskIndexSym, *taskCountSym; + Symbol *taskIndexSym, *taskCountSym; + Symbol *taskIndexSym_x, *taskCountSym_x; + Symbol *taskIndexSym_y, *taskCountSym_y; + Symbol *taskIndexSym_z, *taskCountSym_z; }; #endif // ISPC_FUNC_H diff --git a/parse.yy b/parse.yy index 38c5ba77..1de4644f 100644 --- a/parse.yy +++ b/parse.yy @@ -2214,9 +2214,24 @@ static void lAddThreadIndexCountToSymbolTable(SourcePos pos) { Symbol *taskIndexSym = new Symbol("taskIndex", pos, type); m->symbolTable->AddVariable(taskIndexSym); - + Symbol *taskCountSym = new Symbol("taskCount", pos, type); m->symbolTable->AddVariable(taskCountSym); + + Symbol *taskIndexSym_x = new Symbol("taskIndex_x", pos, type); + m->symbolTable->AddVariable(taskIndexSym_x); + Symbol *taskIndexSym_y = new Symbol("taskIndex_y", pos, type); + m->symbolTable->AddVariable(taskIndexSym_y); + Symbol *taskIndexSym_z = new Symbol("taskIndex_z", pos, type); + m->symbolTable->AddVariable(taskIndexSym_z); + + + Symbol *taskCountSym_x = new Symbol("taskCount_x", pos, type); + m->symbolTable->AddVariable(taskCountSym_x); + Symbol *taskCountSym_y = new Symbol("taskCount_y", pos, type); + m->symbolTable->AddVariable(taskCountSym_y); + Symbol *taskCountSym_z = new Symbol("taskCount_z", pos, type); + m->symbolTable->AddVariable(taskCountSym_z); } diff --git a/type.cpp b/type.cpp index 5fa1845b..d36c63c2 100644 --- a/type.cpp +++ b/type.cpp @@ -2961,6 +2961,12 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const { callTypes.push_back(LLVMTypes::Int32Type); // threadCount callTypes.push_back(LLVMTypes::Int32Type); // taskIndex callTypes.push_back(LLVMTypes::Int32Type); // taskCount + callTypes.push_back(LLVMTypes::Int32Type); // taskIndex_x + callTypes.push_back(LLVMTypes::Int32Type); // taskIndex_y + callTypes.push_back(LLVMTypes::Int32Type); // taskIndex_z + callTypes.push_back(LLVMTypes::Int32Type); // taskCount_x + callTypes.push_back(LLVMTypes::Int32Type); // taskCount_y + callTypes.push_back(LLVMTypes::Int32Type); // taskCount_z } else // Otherwise we already have the types of the arguments From ade8751442d1dbd427b13eec3a59016cdd01807d Mon Sep 17 00:00:00 2001 From: egaburov Date: Wed, 23 Oct 2013 08:39:17 +0200 Subject: [PATCH 02/40] taskIndex_x,y,z are passed to the task --- examples/common.mk | 8 ++++---- examples/tasksys.cpp | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/common.mk b/examples/common.mk index db7b8eee..252c1196 100644 --- a/examples/common.mk +++ b/examples/common.mk @@ -1,11 +1,11 @@ -TASK_CXX=../tasksys.cpp +TASK_CXX=../tasksys3d.cpp TASK_LIB=-lpthread -TASK_OBJ=objs/tasksys.o +TASK_OBJ=objs/tasksys3d.o -CXX=clang++ +CXX=g++ -fopenmp CXXFLAGS+=-Iobjs/ -O2 -CC=clang +CC=gcc -fopenmp CCFLAGS+=-Iobjs/ -O2 LIBS=-lm $(TASK_LIB) -lstdc++ diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp index c9c2fa7b..d7b524a8 100644 --- a/examples/tasksys.cpp +++ b/examples/tasksys.cpp @@ -59,7 +59,9 @@ #define ISPC_USE_PTHREADS #define ISPC_USE_PTHREADS_FULLY_SUBSCRIBED #define ISPC_USE_CILK +*/ #define ISPC_USE_OMP +/* #define ISPC_USE_TBB_TASK_GROUP #define ISPC_USE_TBB_PARALLEL_FOR From f89bad1e945bfeb3f19c3190d87dafcd2e75e405 Mon Sep 17 00:00:00 2001 From: egaburov Date: Wed, 23 Oct 2013 12:51:06 +0200 Subject: [PATCH 03/40] launch now passes the right info into tasking --- ast.cpp | 3 ++- builtins/util.m4 | 2 +- ctx.cpp | 6 ++++-- ctx.h | 2 +- expr.cpp | 31 +++++++++++++++++---------- expr.h | 5 +++-- lex.ll | 7 ++++++ parse.yy | 56 ++++++++++++++++++++++++++++++++++++++++++++---- 8 files changed, 90 insertions(+), 22 deletions(-) diff --git a/ast.cpp b/ast.cpp index 83ee207d..60b20a80 100644 --- a/ast.cpp +++ b/ast.cpp @@ -223,7 +223,8 @@ WalkAST(ASTNode *node, ASTPreCallBackFunc preFunc, ASTPostCallBackFunc postFunc, else if ((fce = dynamic_cast(node)) != NULL) { fce->func = (Expr *)WalkAST(fce->func, preFunc, postFunc, data); fce->args = (ExprList *)WalkAST(fce->args, preFunc, postFunc, data); - fce->launchCountExpr = (Expr *)WalkAST(fce->launchCountExpr, preFunc, + for (int k = 0; k < 3; k++) + fce->launchCountExpr[0] = (Expr *)WalkAST(fce->launchCountExpr[0], preFunc, postFunc, data); } else if ((ie = dynamic_cast(node)) != NULL) { diff --git a/builtins/util.m4 b/builtins/util.m4 index 11501780..c90e8adc 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1813,7 +1813,7 @@ define(`stdlib_core', ` declare i32 @__fast_masked_vload() declare i8* @ISPCAlloc(i8**, i64, i32) nounwind -declare void @ISPCLaunch(i8**, i8*, i8*, i32) nounwind +declare void @ISPCLaunch(i8**, i8*, i8*, i32,i32,i32) nounwind declare void @ISPCSync(i8*) nounwind declare void @ISPCInstrument(i8*, i8*, i32, i64) nounwind diff --git a/ctx.cpp b/ctx.cpp index c50d22f9..3aee776a 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -3502,7 +3502,7 @@ FunctionEmitContext::ReturnInst() { llvm::Value * FunctionEmitContext::LaunchInst(llvm::Value *callee, std::vector &argVals, - llvm::Value *launchCount) { + llvm::Value *launchCount[3]){ if (callee == NULL) { AssertPos(currentPos, m->errorCount > 0); return NULL; @@ -3563,7 +3563,9 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee, args.push_back(launchGroupHandlePtr); args.push_back(fptr); args.push_back(voidmem); - args.push_back(launchCount); + args.push_back(launchCount[0]); + args.push_back(launchCount[1]); + args.push_back(launchCount[2]); return CallInst(flaunch, NULL, args, ""); } diff --git a/ctx.h b/ctx.h index 58f9aae3..4dd30053 100644 --- a/ctx.h +++ b/ctx.h @@ -542,7 +542,7 @@ public: he given argument values. */ llvm::Value *LaunchInst(llvm::Value *callee, std::vector &argVals, - llvm::Value *launchCount); + llvm::Value *launchCount[3]); void SyncInst(); diff --git a/expr.cpp b/expr.cpp index c92503e0..60d9ce66 100644 --- a/expr.cpp +++ b/expr.cpp @@ -3540,11 +3540,13 @@ SelectExpr::Print() const { // FunctionCallExpr FunctionCallExpr::FunctionCallExpr(Expr *f, ExprList *a, SourcePos p, - bool il, Expr *lce) + bool il, Expr *lce[3]) : Expr(p), isLaunch(il) { func = f; args = a; - launchCountExpr = lce; + launchCountExpr[0] = lce[0]; + launchCountExpr[1] = lce[1]; + launchCountExpr[2] = lce[2]; } @@ -3662,9 +3664,13 @@ FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const { llvm::Value *retVal = NULL; ctx->SetDebugPos(pos); if (ft->isTask) { - AssertPos(pos, launchCountExpr != NULL); - llvm::Value *launchCount = launchCountExpr->GetValue(ctx); - if (launchCount != NULL) + AssertPos(pos, launchCountExpr[0] != NULL); + llvm::Value *launchCount[3] = + { launchCountExpr[0]->GetValue(ctx), + launchCountExpr[1]->GetValue(ctx), + launchCountExpr[2]->GetValue(ctx) }; + + if (launchCount[0] != NULL) ctx->LaunchInst(callee, argVals, launchCount); } else @@ -3787,14 +3793,17 @@ FunctionCallExpr::TypeCheck() { if (!isLaunch) Error(pos, "\"launch\" expression needed to call function " "with \"task\" qualifier."); - if (!launchCountExpr) + for (int k = 0; k < 3; k++) + { + if (!launchCountExpr[k]) return NULL; - launchCountExpr = - TypeConvertExpr(launchCountExpr, AtomicType::UniformInt32, - "task launch count"); - if (launchCountExpr == NULL) + launchCountExpr[k] = + TypeConvertExpr(launchCountExpr[k], AtomicType::UniformInt32, + "task launch count"); + if (launchCountExpr[k] == NULL) return NULL; + } } else { if (isLaunch) { @@ -3802,7 +3811,7 @@ FunctionCallExpr::TypeCheck() { "qualified function."); return NULL; } - AssertPos(pos, launchCountExpr == NULL); + AssertPos(pos, launchCountExpr[0] == NULL); } } else { diff --git a/expr.h b/expr.h index f8b96abd..0d46191b 100644 --- a/expr.h +++ b/expr.h @@ -246,7 +246,8 @@ public: class FunctionCallExpr : public Expr { public: FunctionCallExpr(Expr *func, ExprList *args, SourcePos p, - bool isLaunch = false, Expr *launchCountExpr = NULL); + bool isLaunch = false, + Expr *launchCountExpr[3] = (Expr*[3]){NULL, NULL, NULL}); llvm::Value *GetValue(FunctionEmitContext *ctx) const; llvm::Value *GetLValue(FunctionEmitContext *ctx) const; @@ -261,7 +262,7 @@ public: Expr *func; ExprList *args; bool isLaunch; - Expr *launchCountExpr; + Expr *launchCountExpr[3]; }; diff --git a/lex.ll b/lex.ll index 87a80145..b5db747d 100644 --- a/lex.ll +++ b/lex.ll @@ -76,6 +76,7 @@ static int allTokens[] = { TOKEN_TASK, TOKEN_TRUE, TOKEN_TYPEDEF, TOKEN_UNIFORM, TOKEN_UNMASKED, TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE, TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT, + TOKEN_TRIPLECHEVRON_OPEN, TOKEN_TRIPLECHEVRON_CLOSE, TOKEN_FLOAT_CONSTANT, TOKEN_DOUBLE_CONSTANT, TOKEN_INT8_CONSTANT, TOKEN_UINT8_CONSTANT, TOKEN_INT16_CONSTANT, TOKEN_UINT16_CONSTANT, @@ -151,6 +152,8 @@ void ParserInit() { tokenToName[TOKEN_WHILE] = "while"; tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\""; tokenToName[TOKEN_DOTDOTDOT] = "..."; + tokenToName[TOKEN_TRIPLECHEVRON_OPEN] = "<<<"; + tokenToName[TOKEN_TRIPLECHEVRON_CLOSE] = ">>>"; tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT"; tokenToName[TOKEN_DOUBLE_CONSTANT] = "TOKEN_DOUBLE_CONSTANT"; tokenToName[TOKEN_INT8_CONSTANT] = "TOKEN_INT8_CONSTANT"; @@ -266,6 +269,8 @@ void ParserInit() { tokenNameRemap["TOKEN_WHILE"] = "\'while\'"; tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\""; tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'"; + tokenNameRemap["TOKEN_TRIPLECHEVRON_OPEN"] = "\'<<<\'"; + tokenNameRemap["TOKEN_TRIPLECHEVRON_CLOSE"] = "\'>>>\'"; tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant"; tokenNameRemap["TOKEN_DOUBLE_CONSTANT"] = "double constant"; tokenNameRemap["TOKEN_INT8_CONSTANT"] = "int8 constant"; @@ -418,6 +423,8 @@ void { RT; return TOKEN_VOID; } while { RT; return TOKEN_WHILE; } \"C\" { RT; return TOKEN_STRING_C_LITERAL; } \.\.\. { RT; return TOKEN_DOTDOTDOT; } +\<\<\< { RT; return TOKEN_TRIPLECHEVRON_OPEN; } +\>\>\> { RT; return TOKEN_TRIPLECHEVRON_CLOSE; } "operator*" { return TOKEN_IDENTIFIER; } "operator+" { return TOKEN_IDENTIFIER; } diff --git a/parse.yy b/parse.yy index 1de4644f..dfb50134 100644 --- a/parse.yy +++ b/parse.yy @@ -204,6 +204,7 @@ struct ForeachDimension { %token TOKEN_CASE TOKEN_DEFAULT TOKEN_IF TOKEN_ELSE TOKEN_SWITCH %token TOKEN_WHILE TOKEN_DO TOKEN_LAUNCH TOKEN_FOREACH TOKEN_FOREACH_TILED %token TOKEN_FOREACH_UNIQUE TOKEN_FOREACH_ACTIVE TOKEN_DOTDOTDOT +%token TOKEN_TRIPLECHEVRON_OPEN TOKEN_TRIPLECHEVRON_CLOSE %token TOKEN_FOR TOKEN_GOTO TOKEN_CONTINUE TOKEN_BREAK TOKEN_RETURN %token TOKEN_CIF TOKEN_CDO TOKEN_CFOR TOKEN_CWHILE %token TOKEN_SYNC TOKEN_PRINT TOKEN_ASSERT @@ -353,17 +354,64 @@ launch_expression : TOKEN_LAUNCH postfix_expression '(' argument_expression_list ')' { ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @2); - $$ = new FunctionCallExpr($2, $4, Union(@2, @5), true, oneExpr); + Expr *launchCount[3] = {oneExpr, oneExpr, oneExpr}; + $$ = new FunctionCallExpr($2, $4, Union(@2, @5), true, launchCount); } | TOKEN_LAUNCH postfix_expression '(' ')' { ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @2); - $$ = new FunctionCallExpr($2, new ExprList(Union(@3,@4)), Union(@2, @4), true, oneExpr); + Expr *launchCount[3] = {oneExpr, oneExpr, oneExpr}; + $$ = new FunctionCallExpr($2, new ExprList(Union(@3,@4)), Union(@2, @4), true, launchCount); } | TOKEN_LAUNCH '[' expression ']' postfix_expression '(' argument_expression_list ')' - { $$ = new FunctionCallExpr($5, $7, Union(@5,@8), true, $3); } + { + ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @5); + Expr *launchCount[3] = {$3, oneExpr, oneExpr}; + $$ = new FunctionCallExpr($5, $7, Union(@5,@8), true, launchCount); + } + | TOKEN_LAUNCH TOKEN_TRIPLECHEVRON_OPEN assignment_expression TOKEN_TRIPLECHEVRON_CLOSE postfix_expression '(' argument_expression_list ')' + { + ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @5); + Expr *launchCount[3] = {$3, oneExpr, oneExpr}; + $$ = new FunctionCallExpr($5, $7, Union(@5,@8), true, launchCount); + } | TOKEN_LAUNCH '[' expression ']' postfix_expression '(' ')' - { $$ = new FunctionCallExpr($5, new ExprList(Union(@5,@6)), Union(@5,@7), true, $3); } + { + ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @5); + Expr *launchCount[3] = {$3, oneExpr, oneExpr}; + $$ = new FunctionCallExpr($5, new ExprList(Union(@5,@6)), Union(@5,@7), true, launchCount); + } + | TOKEN_LAUNCH TOKEN_TRIPLECHEVRON_OPEN assignment_expression TOKEN_TRIPLECHEVRON_CLOSE postfix_expression '(' ')' + { + ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @5); + Expr *launchCount[3] = {$3, oneExpr, oneExpr}; + $$ = new FunctionCallExpr($5, new ExprList(Union(@5,@6)), Union(@5,@7), true, launchCount); + } + + | TOKEN_LAUNCH TOKEN_TRIPLECHEVRON_OPEN assignment_expression ',' assignment_expression TOKEN_TRIPLECHEVRON_CLOSE postfix_expression '(' argument_expression_list ')' + { + ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @7); + Expr *launchCount[3] = {$3, $5, oneExpr}; + $$ = new FunctionCallExpr($7, $9, Union(@7,@10), true, launchCount); + } + | TOKEN_LAUNCH TOKEN_TRIPLECHEVRON_OPEN assignment_expression ',' assignment_expression TOKEN_TRIPLECHEVRON_CLOSE postfix_expression '(' ')' + { + ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @7); + Expr *launchCount[3] = {$3, $5, oneExpr}; + $$ = new FunctionCallExpr($7, new ExprList(Union(@7,@8)), Union(@7,@9), true, launchCount); + } + + | TOKEN_LAUNCH TOKEN_TRIPLECHEVRON_OPEN assignment_expression ',' assignment_expression ',' assignment_expression TOKEN_TRIPLECHEVRON_CLOSE postfix_expression '(' argument_expression_list ')' + { + Expr *launchCount[3] = {$3, $5, $7}; + $$ = new FunctionCallExpr($9, $11, Union(@9,@12), true, launchCount); + } + | TOKEN_LAUNCH TOKEN_TRIPLECHEVRON_OPEN assignment_expression ',' assignment_expression ',' assignment_expression TOKEN_TRIPLECHEVRON_CLOSE postfix_expression '(' ')' + { + Expr *launchCount[3] = {$3, $5, $7}; + $$ = new FunctionCallExpr($9, new ExprList(Union(@9,@10)), Union(@9,@11), true, launchCount); + } + | TOKEN_LAUNCH '<' postfix_expression '(' argument_expression_list ')' '>' { From e6c8765891af519b3b10607d90dde27ae16a53f4 Mon Sep 17 00:00:00 2001 From: egaburov Date: Wed, 23 Oct 2013 13:18:22 +0200 Subject: [PATCH 04/40] fixed tasksys.cpp for 3d tasking --- examples/common.mk | 8 +- examples/mandelbrot_tasks3d/.gitignore | 2 + examples/mandelbrot_tasks3d/Makefile | 8 + .../mandelbrot_tasks.vcxproj | 180 ++++++++++++++++++ .../mandelbrot_tasks3d/mandelbrot_tasks3d.cpp | 146 ++++++++++++++ .../mandelbrot_tasks3d.ispc | 100 ++++++++++ .../mandelbrot_tasks_serial.cpp | 68 +++++++ examples/tasksys.cpp | 62 ++++-- 8 files changed, 557 insertions(+), 17 deletions(-) create mode 100644 examples/mandelbrot_tasks3d/.gitignore create mode 100644 examples/mandelbrot_tasks3d/Makefile create mode 100644 examples/mandelbrot_tasks3d/mandelbrot_tasks.vcxproj create mode 100644 examples/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp create mode 100644 examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc create mode 100644 examples/mandelbrot_tasks3d/mandelbrot_tasks_serial.cpp diff --git a/examples/common.mk b/examples/common.mk index 252c1196..db7b8eee 100644 --- a/examples/common.mk +++ b/examples/common.mk @@ -1,11 +1,11 @@ -TASK_CXX=../tasksys3d.cpp +TASK_CXX=../tasksys.cpp TASK_LIB=-lpthread -TASK_OBJ=objs/tasksys3d.o +TASK_OBJ=objs/tasksys.o -CXX=g++ -fopenmp +CXX=clang++ CXXFLAGS+=-Iobjs/ -O2 -CC=gcc -fopenmp +CC=clang CCFLAGS+=-Iobjs/ -O2 LIBS=-lm $(TASK_LIB) -lstdc++ diff --git a/examples/mandelbrot_tasks3d/.gitignore b/examples/mandelbrot_tasks3d/.gitignore new file mode 100644 index 00000000..c2471c27 --- /dev/null +++ b/examples/mandelbrot_tasks3d/.gitignore @@ -0,0 +1,2 @@ +mandelbrot +*.ppm diff --git a/examples/mandelbrot_tasks3d/Makefile b/examples/mandelbrot_tasks3d/Makefile new file mode 100644 index 00000000..3dd44d65 --- /dev/null +++ b/examples/mandelbrot_tasks3d/Makefile @@ -0,0 +1,8 @@ + +EXAMPLE=mandelbrot_tasks3d +CPP_SRC=mandelbrot_tasks3d.cpp mandelbrot_tasks_serial.cpp +ISPC_SRC=mandelbrot_tasks3d.ispc +ISPC_IA_TARGETS=avx,sse2,sse4 +ISPC_ARM_TARGETS=neon + +include ../common.mk diff --git a/examples/mandelbrot_tasks3d/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks3d/mandelbrot_tasks.vcxproj new file mode 100644 index 00000000..3a8fca79 --- /dev/null +++ b/examples/mandelbrot_tasks3d/mandelbrot_tasks.vcxproj @@ -0,0 +1,180 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {E80DA7D4-AB22-4648-A068-327307156BE6} + Win32Proj + mandelbrot_tasks + + + + Application + true + Unicode + + + Application + true + Unicode + + + Application + false + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + + + + + + + true + $(ProjectDir)..\..;$(ExecutablePath) + mandelbrot_tasks + + + true + $(ProjectDir)..\..;$(ExecutablePath) + mandelbrot_tasks + + + false + $(ProjectDir)..\..;$(ExecutablePath) + mandelbrot_tasks + + + false + $(ProjectDir)..\..;$(ExecutablePath) + mandelbrot_tasks + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + true + Fast + + + Console + true + + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + true + Fast + + + Console + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + Fast + + + Console + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(TargetDir) + Fast + + + Console + true + true + true + + + + + + + + + + Document + ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 + + ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 + + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h + ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 + + ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 + + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h + $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h + + + + + + diff --git a/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp b/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp new file mode 100644 index 00000000..9cbb966a --- /dev/null +++ b/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp @@ -0,0 +1,146 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include +#include +#include +#include "../timing.h" +#include "mandelbrot_tasks3d_ispc.h" +using namespace ispc; + +extern void mandelbrot_serial(float x0, float y0, float x1, float y1, + int width, int height, int maxIterations, + int output[]); + +/* Write a PPM image file with the image of the Mandelbrot set */ +static void +writePPM(int *buf, int width, int height, const char *fn) { + FILE *fp = fopen(fn, "wb"); + fprintf(fp, "P6\n"); + fprintf(fp, "%d %d\n", width, height); + fprintf(fp, "255\n"); + for (int i = 0; i < width*height; ++i) { + // Map the iteration count to colors by just alternating between + // two greys. + char c = (buf[i] & 0x1) ? 240 : 20; + for (int j = 0; j < 3; ++j) + fputc(c, fp); + } + fclose(fp); + printf("Wrote image file %s\n", fn); +} + + +static void usage() { + fprintf(stderr, "usage: mandelbrot [--scale=]\n"); + exit(1); +} + +int main(int argc, char *argv[]) { + unsigned int width = 1536; + unsigned int height = 1024; + float x0 = -2; + float x1 = 1; + float y0 = -1; + float y1 = 1; + + if (argc == 1) + ; + else if (argc == 2) { + if (strncmp(argv[1], "--scale=", 8) == 0) { + float scale = atof(argv[1] + 8); + if (scale == 0.f) + usage(); + width *= scale; + height *= scale; + // round up to multiples of 16 + width = (width + 0xf) & ~0xf; + height = (height + 0xf) & ~0xf; + } + else + usage(); + } + else + usage(); + + int maxIterations = 512; + int *buf = new int[width*height]; + + // + // Compute the image using the ispc implementation; report the minimum + // time of three runs. + // + double minISPC = 1e30; + for (int i = 0; i < 3; ++i) { + // Clear out the buffer + for (unsigned int i = 0; i < width * height; ++i) + buf[i] = 0; + reset_and_start_timer(); + mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf); + double dt = get_elapsed_mcycles(); + minISPC = std::min(minISPC, dt); + } + + printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC); + writePPM(buf, width, height, "mandelbrot-ispc.ppm"); + + + // + // And run the serial implementation 3 times, again reporting the + // minimum time. + // + double minSerial = 1e30; + for (int i = 0; i < 3; ++i) { + // Clear out the buffer + for (unsigned int i = 0; i < width * height; ++i) + buf[i] = 0; + reset_and_start_timer(); + mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf); + double dt = get_elapsed_mcycles(); + minSerial = std::min(minSerial, dt); + } + + printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial); + writePPM(buf, width, height, "mandelbrot-serial.ppm"); + + printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC); + + return 0; +} diff --git a/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc b/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc new file mode 100644 index 00000000..60473a7f --- /dev/null +++ b/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc @@ -0,0 +1,100 @@ +/* + Copyright (c) 2010-2012, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +static inline int +mandel(float c_re, float c_im, int count) { + float z_re = c_re, z_im = c_im; + int i; + for (i = 0; i < count; ++i) { + if (z_re * z_re + z_im * z_im > 4.) + break; + + float new_re = z_re*z_re - z_im*z_im; + float new_im = 2.f * z_re * z_im; + unmasked { + z_re = c_re + new_re; + z_im = c_im + new_im; + } + } + + return i; +} + + +/* Task to compute the Mandelbrot iterations for a single scanline. + */ +task void +mandelbrot_scanline(uniform float x0, uniform float dx, + uniform float y0, uniform float dy, + uniform int width, uniform int height, + uniform int xspan, uniform int yspan, + uniform int maxIterations, uniform int output[]) { +#if 0 + print("taskIndex = % : % \n", taskIndex); + print("taskIndex_x= % : % \n", taskIndex_x); + print("taskIndex_y= % : % \n", taskIndex_y); + print(" --- \n"); +#endif + const uniform int xstart = taskIndex_x * xspan; + const uniform int xend = min(xstart + xspan, width); + + const uniform int ystart = taskIndex_y * yspan; + const uniform int yend = min(ystart + yspan, height); + + + foreach (yi = ystart ... yend, xi = xstart ... xend) { + float x = x0 + xi * dx; + float y = y0 + yi * dy; + + int index = yi * width + xi; + output[index] = mandel(x, y, maxIterations); + } + +} + +#if 1 +export void +mandelbrot_ispc(uniform float x0, uniform float y0, + uniform float x1, uniform float y1, + uniform int width, uniform int height, + uniform int maxIterations, uniform int output[]) { + uniform float dx = (x1 - x0) / width; + uniform float dy = (y1 - y0) / height; + const uniform int xspan = 16; + const uniform int yspan = 16; + + launch <<>> + mandelbrot_scanline(x0, dx, y0, dy, width, height, xspan, yspan, + maxIterations, output); +} +#endif diff --git a/examples/mandelbrot_tasks3d/mandelbrot_tasks_serial.cpp b/examples/mandelbrot_tasks3d/mandelbrot_tasks_serial.cpp new file mode 100644 index 00000000..a76fb5ca --- /dev/null +++ b/examples/mandelbrot_tasks3d/mandelbrot_tasks_serial.cpp @@ -0,0 +1,68 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +static int mandel(float c_re, float c_im, int count) { + float z_re = c_re, z_im = c_im; + int i; + for (i = 0; i < count; ++i) { + if (z_re * z_re + z_im * z_im > 4.f) + break; + + float new_re = z_re*z_re - z_im*z_im; + float new_im = 2.f * z_re * z_im; + z_re = c_re + new_re; + z_im = c_im + new_im; + } + + return i; +} + +void mandelbrot_serial(float x0, float y0, float x1, float y1, + int width, int height, int maxIterations, + int output[]) +{ + float dx = (x1 - x0) / width; + float dy = (y1 - y0) / height; + + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; ++i) { + float x = x0 + i * dx; + float y = y0 + j * dy; + + int index = (j * width + i); + output[index] = mandel(x, y, maxIterations); + } + } +} + diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp index d7b524a8..fed368dc 100644 --- a/examples/tasksys.cpp +++ b/examples/tasksys.cpp @@ -59,9 +59,7 @@ #define ISPC_USE_PTHREADS #define ISPC_USE_PTHREADS_FULLY_SUBSCRIBED #define ISPC_USE_CILK -*/ #define ISPC_USE_OMP -/* #define ISPC_USE_TBB_TASK_GROUP #define ISPC_USE_TBB_PARALLEL_FOR @@ -172,21 +170,39 @@ // Signature of ispc-generated 'task' functions typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount, - int taskIndex, int taskCount); + int taskIndex, int taskCount, + int taskIndex_x, int taskIndex_y, int taskIndex_z, + int taskCount_x, int taskCount_y, int taskCount_z); // Small structure used to hold the data for each task struct TaskInfo { TaskFuncType func; void *data; int taskIndex, taskCount; + int taskCount3d[3]; #if defined(ISPC_IS_WINDOWS) event taskEvent; #endif + int taskIndex_x() const + { + return taskIndex % taskCount3d[0]; + } + int taskIndex_y() const + { + return ( taskIndex / taskCount3d[0] ) % taskCount3d[1]; + } + int taskIndex_z() const + { + return taskIndex / ( taskCount3d[0]*taskCount3d[1] ); + } + int taskCount_x() const { return taskCount3d[0]; } + int taskCount_y() const { return taskCount3d[1]; } + int taskCount_z() const { return taskCount3d[2]; } }; // ispc expects these functions to have C linkage / not be mangled extern "C" { - void ISPCLaunch(void **handlePtr, void *f, void *data, int count); + void ISPCLaunch(void **handlePtr, void *f, void *data, int countx,int county, int countz); void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment); void ISPCSync(void *handle); } @@ -520,7 +536,9 @@ lRunTask(void *ti) { // Actually run the task taskInfo->func(taskInfo->data, threadIndex, threadCount, - taskInfo->taskIndex, taskInfo->taskCount); + taskInfo->taskIndex, taskInfo->taskCount, + taskInfo->taskIndex_x(), taskInfo->taskIndex_y(), taskInfo->taskIndex_z(), + taskInfo->taskCount_x(), taskInfo->taskCount_y(), taskInfo->taskCount_z()); } @@ -561,7 +579,9 @@ lRunTask(LPVOID param) { // will cause bugs in code that uses those. int threadIndex = 0; int threadCount = 1; - ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount); + ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount, + ti->taskIndex_x(), ti->taskIndex_y(), ti->taskIndex_z(), + ti->taskCount_x(), ti->taskCount_y(), ti->taskCount_z()); // Signal the event that this task is done ti->taskEvent.set(); @@ -662,7 +682,9 @@ lTaskEntry(void *arg) { DBG(fprintf(stderr, "running task %d from group %p\n", taskNumber, tg)); TaskInfo *myTask = tg->GetTaskInfo(taskNumber); myTask->func(myTask->data, threadIndex, threadCount, myTask->taskIndex, - myTask->taskCount); + myTask->taskCount, + myTask->taskIndex_x(), myTask->taskIndex_y(), myTask->taskIndex_z(), + myTask->taskCount_x(), myTask->taskCount_y(), myTask->taskCount_z()); // // Decrement the "number of unfinished tasks" counter in the task @@ -863,7 +885,9 @@ TaskGroup::Sync() { // Do work for _myTask_ // // FIXME: bogus values for thread index/thread count here as well.. - myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount); + myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount, + myTask->taskIndex_x(), myTask->taskIndex_y(), myTask->taskIndex_z(), + myTask->taskCount_x(), myTask->taskCount_y(), myTask->taskCount_z()); // // Decrement the number of unfinished tasks counter @@ -893,7 +917,9 @@ TaskGroup::Launch(int baseIndex, int count) { // Actually run the task. // Cilk does not expose the task -> thread mapping so we pretend it's 1:1 - ti->func(ti->data, ti->taskIndex, ti->taskCount, ti->taskIndex, ti->taskCount); + ti->func(ti->data, ti->taskIndex, ti->taskCount, ti->taskIndex, ti->taskCount, + ti->taskIndex_x(), ti->taskIndex_y(), ti->taskIndex_z(), + ti->taskCount_x(), ti->taskCount_y(), ti->taskCount_z()); } } @@ -922,7 +948,9 @@ TaskGroup::Launch(int baseIndex, int count) { // Actually run the task. int threadIndex = omp_get_thread_num(); int threadCount = omp_get_num_threads(); - ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount); + ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount, + ti->taskIndex_x(), ti->taskIndex_y(), ti->taskIndex_z(), + ti->taskCount_x(), ti->taskCount_y(), ti->taskCount_z()); } } @@ -953,7 +981,9 @@ TaskGroup::Launch(int baseIndex, int count) { int threadIndex = ti->taskIndex; int threadCount = ti->taskCount; - ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount); + ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount, + ti->taskIndex_x(), ti->taskIndex_y(), ti->taskIndex_z(), + ti->taskCount_x(), ti->taskCount_y(), ti->taskCount_z()); }); } @@ -980,7 +1010,9 @@ TaskGroup::Launch(int baseIndex, int count) { // TBB does not expose the task -> thread mapping so we pretend it's 1:1 int threadIndex = ti->taskIndex; int threadCount = ti->taskCount; - ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount); + ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount, + ti->taskIndex_x(), ti->taskIndex_y(), ti->taskIndex_z(), + ti->taskCount_x(), ti->taskCount_y(), ti->taskCount_z()); }); } } @@ -1033,7 +1065,8 @@ FreeTaskGroup(TaskGroup *tg) { /////////////////////////////////////////////////////////////////////////// void -ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) { +ISPCLaunch(void **taskGroupPtr, void *func, void *data, int countx, int county, int countz) { + const int count = countx*county*countz; TaskGroup *taskGroup; if (*taskGroupPtr == NULL) { InitTaskSystem(); @@ -1050,6 +1083,9 @@ ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) { ti->data = data; ti->taskIndex = i; ti->taskCount = count; + ti->taskCount3d[0] = countx; + ti->taskCount3d[1] = county; + ti->taskCount3d[2] = countz; } taskGroup->Launch(baseIndex, count); } From 43761173ec8f653531d73b346ff2d190f9206dba Mon Sep 17 00:00:00 2001 From: Evghenii Date: Thu, 24 Oct 2013 13:16:23 +0200 Subject: [PATCH 05/40] changed notation, task[Index,Count]_[x,y,z] -> task[Index,Count][1,2,3]. Change launch <<< nx,ny,nz >>> into launch [nx,ny,nz] or equivalent launch [nz][ny][nx]. Programmer can pick the one the is liked the most --- .../mandelbrot_tasks3d.ispc | 19 +++-- examples/tasksys.cpp | 58 +++++++------- func.cpp | 64 ++++++++-------- func.h | 6 +- lex.ll | 7 -- parse.yy | 76 +++++++++++-------- type.cpp | 12 +-- 7 files changed, 122 insertions(+), 120 deletions(-) diff --git a/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc b/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc index 60473a7f..8bdf6f7a 100644 --- a/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc +++ b/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc @@ -59,16 +59,10 @@ mandelbrot_scanline(uniform float x0, uniform float dx, uniform int width, uniform int height, uniform int xspan, uniform int yspan, uniform int maxIterations, uniform int output[]) { -#if 0 - print("taskIndex = % : % \n", taskIndex); - print("taskIndex_x= % : % \n", taskIndex_x); - print("taskIndex_y= % : % \n", taskIndex_y); - print(" --- \n"); -#endif - const uniform int xstart = taskIndex_x * xspan; + const uniform int xstart = taskIndex1 * xspan; const uniform int xend = min(xstart + xspan, width); - const uniform int ystart = taskIndex_y * yspan; + const uniform int ystart = taskIndex2 * yspan; const uniform int yend = min(ystart + yspan, height); @@ -90,10 +84,15 @@ mandelbrot_ispc(uniform float x0, uniform float y0, uniform int maxIterations, uniform int output[]) { uniform float dx = (x1 - x0) / width; uniform float dy = (y1 - y0) / height; - const uniform int xspan = 16; + const uniform int xspan = 16; /* make sure it is big enough to avoid false-sharing */ const uniform int yspan = 16; - launch <<>> + +#if 1 + launch [width/xspan, height/yspan] +#else + launch [height/yspan][width/xspan] +#endif mandelbrot_scanline(x0, dx, y0, dy, width, height, xspan, yspan, maxIterations, output); } diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp index fed368dc..5ef72ed9 100644 --- a/examples/tasksys.cpp +++ b/examples/tasksys.cpp @@ -171,8 +171,8 @@ // Signature of ispc-generated 'task' functions typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount, int taskIndex, int taskCount, - int taskIndex_x, int taskIndex_y, int taskIndex_z, - int taskCount_x, int taskCount_y, int taskCount_z); + int taskIndex1, int taskIndex2, int taskIndex3, + int taskCount1, int taskCount2, int taskCount3); // Small structure used to hold the data for each task struct TaskInfo { @@ -183,21 +183,21 @@ struct TaskInfo { #if defined(ISPC_IS_WINDOWS) event taskEvent; #endif - int taskIndex_x() const + int taskIndex1() const { return taskIndex % taskCount3d[0]; } - int taskIndex_y() const + int taskIndex2() const { return ( taskIndex / taskCount3d[0] ) % taskCount3d[1]; } - int taskIndex_z() const + int taskIndex3() const { return taskIndex / ( taskCount3d[0]*taskCount3d[1] ); } - int taskCount_x() const { return taskCount3d[0]; } - int taskCount_y() const { return taskCount3d[1]; } - int taskCount_z() const { return taskCount3d[2]; } + int taskCount1() const { return taskCount3d[0]; } + int taskCount2() const { return taskCount3d[1]; } + int taskCount3() const { return taskCount3d[2]; } }; // ispc expects these functions to have C linkage / not be mangled @@ -537,8 +537,8 @@ lRunTask(void *ti) { // Actually run the task taskInfo->func(taskInfo->data, threadIndex, threadCount, taskInfo->taskIndex, taskInfo->taskCount, - taskInfo->taskIndex_x(), taskInfo->taskIndex_y(), taskInfo->taskIndex_z(), - taskInfo->taskCount_x(), taskInfo->taskCount_y(), taskInfo->taskCount_z()); + taskInfo->taskIndex1(), taskInfo->taskIndex2(), taskInfo->taskIndex3(), + taskInfo->taskCount1(), taskInfo->taskCount2(), taskInfo->taskCount3()); } @@ -580,8 +580,8 @@ lRunTask(LPVOID param) { int threadIndex = 0; int threadCount = 1; ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount, - ti->taskIndex_x(), ti->taskIndex_y(), ti->taskIndex_z(), - ti->taskCount_x(), ti->taskCount_y(), ti->taskCount_z()); + ti->taskIndex1(), ti->taskIndex2(), ti->taskIndex3(), + ti->taskCount1(), ti->taskCount2(), ti->taskCount3()); // Signal the event that this task is done ti->taskEvent.set(); @@ -683,8 +683,8 @@ lTaskEntry(void *arg) { TaskInfo *myTask = tg->GetTaskInfo(taskNumber); myTask->func(myTask->data, threadIndex, threadCount, myTask->taskIndex, myTask->taskCount, - myTask->taskIndex_x(), myTask->taskIndex_y(), myTask->taskIndex_z(), - myTask->taskCount_x(), myTask->taskCount_y(), myTask->taskCount_z()); + myTask->taskIndex1(), myTask->taskIndex2(), myTask->taskIndex3(), + myTask->taskCount1(), myTask->taskCount2(), myTask->taskCount3()); // // Decrement the "number of unfinished tasks" counter in the task @@ -886,8 +886,8 @@ TaskGroup::Sync() { // // FIXME: bogus values for thread index/thread count here as well.. myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount, - myTask->taskIndex_x(), myTask->taskIndex_y(), myTask->taskIndex_z(), - myTask->taskCount_x(), myTask->taskCount_y(), myTask->taskCount_z()); + myTask->taskIndex1(), myTask->taskIndex2(), myTask->taskIndex3(), + myTask->taskCount1(), myTask->taskCount2(), myTask->taskCount3()); // // Decrement the number of unfinished tasks counter @@ -918,8 +918,8 @@ TaskGroup::Launch(int baseIndex, int count) { // Actually run the task. // Cilk does not expose the task -> thread mapping so we pretend it's 1:1 ti->func(ti->data, ti->taskIndex, ti->taskCount, ti->taskIndex, ti->taskCount, - ti->taskIndex_x(), ti->taskIndex_y(), ti->taskIndex_z(), - ti->taskCount_x(), ti->taskCount_y(), ti->taskCount_z()); + ti->taskIndex1(), ti->taskIndex2(), ti->taskIndex3(), + ti->taskCount1(), ti->taskCount2(), ti->taskCount3()); } } @@ -949,8 +949,8 @@ TaskGroup::Launch(int baseIndex, int count) { int threadIndex = omp_get_thread_num(); int threadCount = omp_get_num_threads(); ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount, - ti->taskIndex_x(), ti->taskIndex_y(), ti->taskIndex_z(), - ti->taskCount_x(), ti->taskCount_y(), ti->taskCount_z()); + ti->taskIndex1(), ti->taskIndex2(), ti->taskIndex3(), + ti->taskCount1(), ti->taskCount2(), ti->taskCount3()); } } @@ -982,8 +982,8 @@ TaskGroup::Launch(int baseIndex, int count) { int threadCount = ti->taskCount; ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount, - ti->taskIndex_x(), ti->taskIndex_y(), ti->taskIndex_z(), - ti->taskCount_x(), ti->taskCount_y(), ti->taskCount_z()); + ti->taskIndex1(), ti->taskIndex2(), ti->taskIndex3(), + ti->taskCount1(), ti->taskCount2(), ti->taskCount3()); }); } @@ -1011,8 +1011,8 @@ TaskGroup::Launch(int baseIndex, int count) { int threadIndex = ti->taskIndex; int threadCount = ti->taskCount; ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount, - ti->taskIndex_x(), ti->taskIndex_y(), ti->taskIndex_z(), - ti->taskCount_x(), ti->taskCount_y(), ti->taskCount_z()); + ti->taskIndex1(), ti->taskIndex2(), ti->taskIndex3(), + ti->taskCount1(), ti->taskCount2(), ti->taskCount3()); }); } } @@ -1065,8 +1065,8 @@ FreeTaskGroup(TaskGroup *tg) { /////////////////////////////////////////////////////////////////////////// void -ISPCLaunch(void **taskGroupPtr, void *func, void *data, int countx, int county, int countz) { - const int count = countx*county*countz; +ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count1, int count2, int count3) { + const int count = count1*count2*count3; TaskGroup *taskGroup; if (*taskGroupPtr == NULL) { InitTaskSystem(); @@ -1083,9 +1083,9 @@ ISPCLaunch(void **taskGroupPtr, void *func, void *data, int countx, int county, ti->data = data; ti->taskIndex = i; ti->taskCount = count; - ti->taskCount3d[0] = countx; - ti->taskCount3d[1] = county; - ti->taskCount3d[2] = countz; + ti->taskCount3d[0] = count1; + ti->taskCount3d[1] = count2; + ti->taskCount3d[2] = count3; } taskGroup->Launch(baseIndex, count); } diff --git a/func.cpp b/func.cpp index dea45afc..086be6fe 100644 --- a/func.cpp +++ b/func.cpp @@ -133,26 +133,26 @@ Function::Function(Symbol *s, Stmt *c) { taskCountSym = m->symbolTable->LookupVariable("taskCount"); Assert(taskCountSym); - taskIndexSym_x = m->symbolTable->LookupVariable("taskIndex_x"); - Assert(taskIndexSym_x); - taskIndexSym_y = m->symbolTable->LookupVariable("taskIndex_y"); - Assert(taskIndexSym_y); - taskIndexSym_z = m->symbolTable->LookupVariable("taskIndex_z"); - Assert(taskIndexSym_z); + taskIndexSym1 = m->symbolTable->LookupVariable("taskIndex1"); + Assert(taskIndexSym1); + taskIndexSym2 = m->symbolTable->LookupVariable("taskIndex2"); + Assert(taskIndexSym2); + taskIndexSym3 = m->symbolTable->LookupVariable("taskIndex3"); + Assert(taskIndexSym3); - taskCountSym_x = m->symbolTable->LookupVariable("taskCount_x"); - Assert(taskCountSym_x); - taskCountSym_y = m->symbolTable->LookupVariable("taskCount_y"); - Assert(taskCountSym_y); - taskCountSym_z = m->symbolTable->LookupVariable("taskCount_z"); - Assert(taskCountSym_z); + taskCountSym1 = m->symbolTable->LookupVariable("taskCount1"); + Assert(taskCountSym1); + taskCountSym2 = m->symbolTable->LookupVariable("taskCount2"); + Assert(taskCountSym2); + taskCountSym3 = m->symbolTable->LookupVariable("taskCount3"); + Assert(taskCountSym3); } else { threadIndexSym = threadCountSym = taskIndexSym = taskCountSym = NULL; - taskIndexSym_x = taskIndexSym_y = taskIndexSym_z = NULL; - taskCountSym_x = taskCountSym_y = taskCountSym_z = NULL; + taskIndexSym1 = taskIndexSym2 = taskIndexSym3 = NULL; + taskCountSym1 = taskCountSym2 = taskCountSym3 = NULL; } } @@ -244,12 +244,12 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, llvm::Value *threadCount = argIter++; llvm::Value *taskIndex = argIter++; llvm::Value *taskCount = argIter++; - llvm::Value *taskIndex_x = argIter++; - llvm::Value *taskIndex_y = argIter++; - llvm::Value *taskIndex_z = argIter++; - llvm::Value *taskCount_x = argIter++; - llvm::Value *taskCount_y = argIter++; - llvm::Value *taskCount_z = argIter++; + llvm::Value *taskIndex1 = argIter++; + llvm::Value *taskIndex2 = argIter++; + llvm::Value *taskIndex3 = argIter++; + llvm::Value *taskCount1 = argIter++; + llvm::Value *taskCount2 = argIter++; + llvm::Value *taskCount3 = argIter++; // Copy the function parameter values from the structure into local // storage @@ -282,19 +282,19 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, taskCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount"); ctx->StoreInst(taskCount, taskCountSym->storagePtr); - taskIndexSym_x->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex_x"); - ctx->StoreInst(taskIndex_x, taskIndexSym_x->storagePtr); - taskIndexSym_y->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex_y"); - ctx->StoreInst(taskIndex_y, taskIndexSym_y->storagePtr); - taskIndexSym_z->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex_z"); - ctx->StoreInst(taskIndex_z, taskIndexSym_z->storagePtr); + taskIndexSym1->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex1"); + ctx->StoreInst(taskIndex1, taskIndexSym1->storagePtr); + taskIndexSym2->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex2"); + ctx->StoreInst(taskIndex2, taskIndexSym2->storagePtr); + taskIndexSym3->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex3"); + ctx->StoreInst(taskIndex3, taskIndexSym3->storagePtr); - taskCountSym_x->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount_x"); - ctx->StoreInst(taskCount_x, taskCountSym_x->storagePtr); - taskCountSym_y->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount_y"); - ctx->StoreInst(taskCount_y, taskCountSym_y->storagePtr); - taskCountSym_z->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount_z"); - ctx->StoreInst(taskCount_z, taskCountSym_z->storagePtr); + taskCountSym1->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount1"); + ctx->StoreInst(taskCount1, taskCountSym1->storagePtr); + taskCountSym2->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount2"); + ctx->StoreInst(taskCount2, taskCountSym2->storagePtr); + taskCountSym3->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount3"); + ctx->StoreInst(taskCount3, taskCountSym3->storagePtr); } else { // Regular, non-task function diff --git a/func.h b/func.h index ee44a6c5..4181bba5 100644 --- a/func.h +++ b/func.h @@ -61,9 +61,9 @@ private: Symbol *maskSymbol; Symbol *threadIndexSym, *threadCountSym; Symbol *taskIndexSym, *taskCountSym; - Symbol *taskIndexSym_x, *taskCountSym_x; - Symbol *taskIndexSym_y, *taskCountSym_y; - Symbol *taskIndexSym_z, *taskCountSym_z; + Symbol *taskIndexSym1, *taskCountSym1; + Symbol *taskIndexSym2, *taskCountSym2; + Symbol *taskIndexSym3, *taskCountSym3; }; #endif // ISPC_FUNC_H diff --git a/lex.ll b/lex.ll index b5db747d..87a80145 100644 --- a/lex.ll +++ b/lex.ll @@ -76,7 +76,6 @@ static int allTokens[] = { TOKEN_TASK, TOKEN_TRUE, TOKEN_TYPEDEF, TOKEN_UNIFORM, TOKEN_UNMASKED, TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE, TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT, - TOKEN_TRIPLECHEVRON_OPEN, TOKEN_TRIPLECHEVRON_CLOSE, TOKEN_FLOAT_CONSTANT, TOKEN_DOUBLE_CONSTANT, TOKEN_INT8_CONSTANT, TOKEN_UINT8_CONSTANT, TOKEN_INT16_CONSTANT, TOKEN_UINT16_CONSTANT, @@ -152,8 +151,6 @@ void ParserInit() { tokenToName[TOKEN_WHILE] = "while"; tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\""; tokenToName[TOKEN_DOTDOTDOT] = "..."; - tokenToName[TOKEN_TRIPLECHEVRON_OPEN] = "<<<"; - tokenToName[TOKEN_TRIPLECHEVRON_CLOSE] = ">>>"; tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT"; tokenToName[TOKEN_DOUBLE_CONSTANT] = "TOKEN_DOUBLE_CONSTANT"; tokenToName[TOKEN_INT8_CONSTANT] = "TOKEN_INT8_CONSTANT"; @@ -269,8 +266,6 @@ void ParserInit() { tokenNameRemap["TOKEN_WHILE"] = "\'while\'"; tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\""; tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'"; - tokenNameRemap["TOKEN_TRIPLECHEVRON_OPEN"] = "\'<<<\'"; - tokenNameRemap["TOKEN_TRIPLECHEVRON_CLOSE"] = "\'>>>\'"; tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant"; tokenNameRemap["TOKEN_DOUBLE_CONSTANT"] = "double constant"; tokenNameRemap["TOKEN_INT8_CONSTANT"] = "int8 constant"; @@ -423,8 +418,6 @@ void { RT; return TOKEN_VOID; } while { RT; return TOKEN_WHILE; } \"C\" { RT; return TOKEN_STRING_C_LITERAL; } \.\.\. { RT; return TOKEN_DOTDOTDOT; } -\<\<\< { RT; return TOKEN_TRIPLECHEVRON_OPEN; } -\>\>\> { RT; return TOKEN_TRIPLECHEVRON_CLOSE; } "operator*" { return TOKEN_IDENTIFIER; } "operator+" { return TOKEN_IDENTIFIER; } diff --git a/parse.yy b/parse.yy index dfb50134..653bba62 100644 --- a/parse.yy +++ b/parse.yy @@ -204,7 +204,6 @@ struct ForeachDimension { %token TOKEN_CASE TOKEN_DEFAULT TOKEN_IF TOKEN_ELSE TOKEN_SWITCH %token TOKEN_WHILE TOKEN_DO TOKEN_LAUNCH TOKEN_FOREACH TOKEN_FOREACH_TILED %token TOKEN_FOREACH_UNIQUE TOKEN_FOREACH_ACTIVE TOKEN_DOTDOTDOT -%token TOKEN_TRIPLECHEVRON_OPEN TOKEN_TRIPLECHEVRON_CLOSE %token TOKEN_FOR TOKEN_GOTO TOKEN_CONTINUE TOKEN_BREAK TOKEN_RETURN %token TOKEN_CIF TOKEN_CDO TOKEN_CFOR TOKEN_CWHILE %token TOKEN_SYNC TOKEN_PRINT TOKEN_ASSERT @@ -363,54 +362,65 @@ launch_expression Expr *launchCount[3] = {oneExpr, oneExpr, oneExpr}; $$ = new FunctionCallExpr($2, new ExprList(Union(@3,@4)), Union(@2, @4), true, launchCount); } - | TOKEN_LAUNCH '[' expression ']' postfix_expression '(' argument_expression_list ')' + + | TOKEN_LAUNCH '[' assignment_expression ']' postfix_expression '(' argument_expression_list ')' { ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @5); Expr *launchCount[3] = {$3, oneExpr, oneExpr}; $$ = new FunctionCallExpr($5, $7, Union(@5,@8), true, launchCount); } - | TOKEN_LAUNCH TOKEN_TRIPLECHEVRON_OPEN assignment_expression TOKEN_TRIPLECHEVRON_CLOSE postfix_expression '(' argument_expression_list ')' - { - ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @5); - Expr *launchCount[3] = {$3, oneExpr, oneExpr}; - $$ = new FunctionCallExpr($5, $7, Union(@5,@8), true, launchCount); - } - | TOKEN_LAUNCH '[' expression ']' postfix_expression '(' ')' - { - ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @5); - Expr *launchCount[3] = {$3, oneExpr, oneExpr}; - $$ = new FunctionCallExpr($5, new ExprList(Union(@5,@6)), Union(@5,@7), true, launchCount); - } - | TOKEN_LAUNCH TOKEN_TRIPLECHEVRON_OPEN assignment_expression TOKEN_TRIPLECHEVRON_CLOSE postfix_expression '(' ')' + | TOKEN_LAUNCH '[' assignment_expression ']' postfix_expression '(' ')' { ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @5); Expr *launchCount[3] = {$3, oneExpr, oneExpr}; $$ = new FunctionCallExpr($5, new ExprList(Union(@5,@6)), Union(@5,@7), true, launchCount); } - | TOKEN_LAUNCH TOKEN_TRIPLECHEVRON_OPEN assignment_expression ',' assignment_expression TOKEN_TRIPLECHEVRON_CLOSE postfix_expression '(' argument_expression_list ')' + | TOKEN_LAUNCH '[' assignment_expression ',' assignment_expression ']' postfix_expression '(' argument_expression_list ')' { ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @7); Expr *launchCount[3] = {$3, $5, oneExpr}; $$ = new FunctionCallExpr($7, $9, Union(@7,@10), true, launchCount); } - | TOKEN_LAUNCH TOKEN_TRIPLECHEVRON_OPEN assignment_expression ',' assignment_expression TOKEN_TRIPLECHEVRON_CLOSE postfix_expression '(' ')' + | TOKEN_LAUNCH '[' assignment_expression ',' assignment_expression ']' postfix_expression '(' ')' { ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @7); Expr *launchCount[3] = {$3, $5, oneExpr}; $$ = new FunctionCallExpr($7, new ExprList(Union(@7,@8)), Union(@7,@9), true, launchCount); } + | TOKEN_LAUNCH '[' assignment_expression ']' '[' assignment_expression ']' postfix_expression '(' argument_expression_list ')' + { + ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @8); + Expr *launchCount[3] = {$6, $3, oneExpr}; + $$ = new FunctionCallExpr($8, $10, Union(@8,@11), true, launchCount); + } + | TOKEN_LAUNCH '[' assignment_expression ']' '[' assignment_expression ']' postfix_expression '(' ')' + { + ConstExpr *oneExpr = new ConstExpr(AtomicType::UniformInt32, (int32_t)1, @8); + Expr *launchCount[3] = {$6, $3, oneExpr}; + $$ = new FunctionCallExpr($8, new ExprList(Union(@8,@9)), Union(@8,@10), true, launchCount); + } - | TOKEN_LAUNCH TOKEN_TRIPLECHEVRON_OPEN assignment_expression ',' assignment_expression ',' assignment_expression TOKEN_TRIPLECHEVRON_CLOSE postfix_expression '(' argument_expression_list ')' + | TOKEN_LAUNCH '[' assignment_expression ',' assignment_expression ',' assignment_expression ']' postfix_expression '(' argument_expression_list ')' { Expr *launchCount[3] = {$3, $5, $7}; $$ = new FunctionCallExpr($9, $11, Union(@9,@12), true, launchCount); } - | TOKEN_LAUNCH TOKEN_TRIPLECHEVRON_OPEN assignment_expression ',' assignment_expression ',' assignment_expression TOKEN_TRIPLECHEVRON_CLOSE postfix_expression '(' ')' + | TOKEN_LAUNCH '[' assignment_expression ',' assignment_expression ',' assignment_expression ']' postfix_expression '(' ')' { Expr *launchCount[3] = {$3, $5, $7}; $$ = new FunctionCallExpr($9, new ExprList(Union(@9,@10)), Union(@9,@11), true, launchCount); } + | TOKEN_LAUNCH '[' assignment_expression ']' '[' assignment_expression ']' '[' assignment_expression ']' postfix_expression '(' argument_expression_list ')' + { + Expr *launchCount[3] = {$9, $6, $3}; + $$ = new FunctionCallExpr($11, $13, Union(@11,@14), true, launchCount); + } + | TOKEN_LAUNCH '[' assignment_expression ']' '[' assignment_expression ']' '[' assignment_expression ']' postfix_expression '(' ')' + { + Expr *launchCount[3] = {$9, $6, $3}; + $$ = new FunctionCallExpr($11, new ExprList(Union(@11,@12)), Union(@11,@13), true, launchCount); + } | TOKEN_LAUNCH '<' postfix_expression '(' argument_expression_list ')' '>' @@ -425,13 +435,13 @@ launch_expression "around function call expression."); $$ = NULL; } - | TOKEN_LAUNCH '[' expression ']' '<' postfix_expression '(' argument_expression_list ')' '>' + | TOKEN_LAUNCH '[' assignment_expression ']' '<' postfix_expression '(' argument_expression_list ')' '>' { Error(Union(@5, @10), "\"launch\" expressions no longer take '<' '>' " "around function call expression."); $$ = NULL; } - | TOKEN_LAUNCH '[' expression ']' '<' postfix_expression '(' ')' '>' + | TOKEN_LAUNCH '[' assignment_expression ']' '<' postfix_expression '(' ')' '>' { Error(Union(@5, @9), "\"launch\" expressions no longer take '<' '>' " "around function call expression."); @@ -2266,20 +2276,20 @@ static void lAddThreadIndexCountToSymbolTable(SourcePos pos) { Symbol *taskCountSym = new Symbol("taskCount", pos, type); m->symbolTable->AddVariable(taskCountSym); - Symbol *taskIndexSym_x = new Symbol("taskIndex_x", pos, type); - m->symbolTable->AddVariable(taskIndexSym_x); - Symbol *taskIndexSym_y = new Symbol("taskIndex_y", pos, type); - m->symbolTable->AddVariable(taskIndexSym_y); - Symbol *taskIndexSym_z = new Symbol("taskIndex_z", pos, type); - m->symbolTable->AddVariable(taskIndexSym_z); + Symbol *taskIndexSym1 = new Symbol("taskIndex1", pos, type); + m->symbolTable->AddVariable(taskIndexSym1); + Symbol *taskIndexSym2 = new Symbol("taskIndex2", pos, type); + m->symbolTable->AddVariable(taskIndexSym2); + Symbol *taskIndexSym3 = new Symbol("taskIndex3", pos, type); + m->symbolTable->AddVariable(taskIndexSym3); - Symbol *taskCountSym_x = new Symbol("taskCount_x", pos, type); - m->symbolTable->AddVariable(taskCountSym_x); - Symbol *taskCountSym_y = new Symbol("taskCount_y", pos, type); - m->symbolTable->AddVariable(taskCountSym_y); - Symbol *taskCountSym_z = new Symbol("taskCount_z", pos, type); - m->symbolTable->AddVariable(taskCountSym_z); + Symbol *taskCountSym1 = new Symbol("taskCount1", pos, type); + m->symbolTable->AddVariable(taskCountSym1); + Symbol *taskCountSym2 = new Symbol("taskCount2", pos, type); + m->symbolTable->AddVariable(taskCountSym2); + Symbol *taskCountSym3 = new Symbol("taskCount3", pos, type); + m->symbolTable->AddVariable(taskCountSym3); } diff --git a/type.cpp b/type.cpp index d36c63c2..3ae0cab4 100644 --- a/type.cpp +++ b/type.cpp @@ -2961,12 +2961,12 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const { callTypes.push_back(LLVMTypes::Int32Type); // threadCount callTypes.push_back(LLVMTypes::Int32Type); // taskIndex callTypes.push_back(LLVMTypes::Int32Type); // taskCount - callTypes.push_back(LLVMTypes::Int32Type); // taskIndex_x - callTypes.push_back(LLVMTypes::Int32Type); // taskIndex_y - callTypes.push_back(LLVMTypes::Int32Type); // taskIndex_z - callTypes.push_back(LLVMTypes::Int32Type); // taskCount_x - callTypes.push_back(LLVMTypes::Int32Type); // taskCount_y - callTypes.push_back(LLVMTypes::Int32Type); // taskCount_z + callTypes.push_back(LLVMTypes::Int32Type); // taskIndex1 + callTypes.push_back(LLVMTypes::Int32Type); // taskIndex2 + callTypes.push_back(LLVMTypes::Int32Type); // taskIndex3 + callTypes.push_back(LLVMTypes::Int32Type); // taskCount1 + callTypes.push_back(LLVMTypes::Int32Type); // taskCount2 + callTypes.push_back(LLVMTypes::Int32Type); // taskCount3 } else // Otherwise we already have the types of the arguments From c5fc47cc1959cf8fb4b170a6b378614c811591a7 Mon Sep 17 00:00:00 2001 From: egaburov Date: Thu, 24 Oct 2013 14:09:46 +0200 Subject: [PATCH 06/40] tasksys cleaning --- examples/tasksys.cpp | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp index 5ef72ed9..4c85e119 100644 --- a/examples/tasksys.cpp +++ b/examples/tasksys.cpp @@ -178,11 +178,12 @@ typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount, struct TaskInfo { TaskFuncType func; void *data; - int taskIndex, taskCount; + int taskIndex; int taskCount3d[3]; #if defined(ISPC_IS_WINDOWS) event taskEvent; #endif + int taskCount() const { return taskCount3d[0]*taskCount3d[1]*taskCount3d[2]; } int taskIndex1() const { return taskIndex % taskCount3d[0]; @@ -198,7 +199,8 @@ struct TaskInfo { int taskCount1() const { return taskCount3d[0]; } int taskCount2() const { return taskCount3d[1]; } int taskCount3() const { return taskCount3d[2]; } -}; + TaskInfo() { assert(sizeof(TaskInfo) % 32 == 0); } +} __attribute__((aligned(32))); // ispc expects these functions to have C linkage / not be mangled extern "C" { @@ -536,7 +538,7 @@ lRunTask(void *ti) { // Actually run the task taskInfo->func(taskInfo->data, threadIndex, threadCount, - taskInfo->taskIndex, taskInfo->taskCount, + taskInfo->taskIndex, taskInfo->taskCount(), taskInfo->taskIndex1(), taskInfo->taskIndex2(), taskInfo->taskIndex3(), taskInfo->taskCount1(), taskInfo->taskCount2(), taskInfo->taskCount3()); } @@ -579,7 +581,7 @@ lRunTask(LPVOID param) { // will cause bugs in code that uses those. int threadIndex = 0; int threadCount = 1; - ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount, + ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), ti->taskIndex1(), ti->taskIndex2(), ti->taskIndex3(), ti->taskCount1(), ti->taskCount2(), ti->taskCount3()); @@ -682,7 +684,7 @@ lTaskEntry(void *arg) { DBG(fprintf(stderr, "running task %d from group %p\n", taskNumber, tg)); TaskInfo *myTask = tg->GetTaskInfo(taskNumber); myTask->func(myTask->data, threadIndex, threadCount, myTask->taskIndex, - myTask->taskCount, + myTask->taskCount(), myTask->taskIndex1(), myTask->taskIndex2(), myTask->taskIndex3(), myTask->taskCount1(), myTask->taskCount2(), myTask->taskCount3()); @@ -885,7 +887,7 @@ TaskGroup::Sync() { // Do work for _myTask_ // // FIXME: bogus values for thread index/thread count here as well.. - myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount, + myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount(), myTask->taskIndex1(), myTask->taskIndex2(), myTask->taskIndex3(), myTask->taskCount1(), myTask->taskCount2(), myTask->taskCount3()); @@ -917,7 +919,7 @@ TaskGroup::Launch(int baseIndex, int count) { // Actually run the task. // Cilk does not expose the task -> thread mapping so we pretend it's 1:1 - ti->func(ti->data, ti->taskIndex, ti->taskCount, ti->taskIndex, ti->taskCount, + ti->func(ti->data, ti->taskIndex, ti->taskCount(), ti->taskIndex1(), ti->taskIndex2(), ti->taskIndex3(), ti->taskCount1(), ti->taskCount2(), ti->taskCount3()); } @@ -948,7 +950,7 @@ TaskGroup::Launch(int baseIndex, int count) { // Actually run the task. int threadIndex = omp_get_thread_num(); int threadCount = omp_get_num_threads(); - ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount, + ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), ti->taskIndex1(), ti->taskIndex2(), ti->taskIndex3(), ti->taskCount1(), ti->taskCount2(), ti->taskCount3()); } @@ -981,7 +983,7 @@ TaskGroup::Launch(int baseIndex, int count) { int threadIndex = ti->taskIndex; int threadCount = ti->taskCount; - ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount, + ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), ti->taskIndex1(), ti->taskIndex2(), ti->taskIndex3(), ti->taskCount1(), ti->taskCount2(), ti->taskCount3()); }); @@ -1010,7 +1012,7 @@ TaskGroup::Launch(int baseIndex, int count) { // TBB does not expose the task -> thread mapping so we pretend it's 1:1 int threadIndex = ti->taskIndex; int threadCount = ti->taskCount; - ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount, + ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), ti->taskIndex1(), ti->taskIndex2(), ti->taskIndex3(), ti->taskCount1(), ti->taskCount2(), ti->taskCount3()); }); @@ -1082,7 +1084,6 @@ ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count1, int count2, ti->func = (TaskFuncType)func; ti->data = data; ti->taskIndex = i; - ti->taskCount = count; ti->taskCount3d[0] = count1; ti->taskCount3d[1] = count2; ti->taskCount3d[2] = count3; From 383e804ec1e591370899babf1c1aa549995e8bae Mon Sep 17 00:00:00 2001 From: Evghenii Date: Thu, 24 Oct 2013 17:20:56 +0200 Subject: [PATCH 07/40] changed notation form taskIndex1,2,3 -> taskIndex0,1,2 --- .../mandelbrot_tasks3d.ispc | 4 +- examples/tasksys.cpp | 58 +++++++++---------- func.cpp | 24 ++++---- func.h | 2 +- parse.yy | 8 +-- type.cpp | 4 +- 6 files changed, 50 insertions(+), 50 deletions(-) diff --git a/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc b/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc index 8bdf6f7a..395bdca4 100644 --- a/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc +++ b/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc @@ -59,10 +59,10 @@ mandelbrot_scanline(uniform float x0, uniform float dx, uniform int width, uniform int height, uniform int xspan, uniform int yspan, uniform int maxIterations, uniform int output[]) { - const uniform int xstart = taskIndex1 * xspan; + const uniform int xstart = taskIndex0 * xspan; const uniform int xend = min(xstart + xspan, width); - const uniform int ystart = taskIndex2 * yspan; + const uniform int ystart = taskIndex1 * yspan; const uniform int yend = min(ystart + yspan, height); diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp index 4c85e119..6bc60129 100644 --- a/examples/tasksys.cpp +++ b/examples/tasksys.cpp @@ -171,8 +171,8 @@ // Signature of ispc-generated 'task' functions typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount, int taskIndex, int taskCount, - int taskIndex1, int taskIndex2, int taskIndex3, - int taskCount1, int taskCount2, int taskCount3); + int taskIndex0, int taskIndex1, int taskIndex2, + int taskCount0, int taskCount1, int taskCount2); // Small structure used to hold the data for each task struct TaskInfo { @@ -184,21 +184,21 @@ struct TaskInfo { event taskEvent; #endif int taskCount() const { return taskCount3d[0]*taskCount3d[1]*taskCount3d[2]; } - int taskIndex1() const + int taskIndex0() const { return taskIndex % taskCount3d[0]; } - int taskIndex2() const + int taskIndex1() const { return ( taskIndex / taskCount3d[0] ) % taskCount3d[1]; } - int taskIndex3() const + int taskIndex2() const { return taskIndex / ( taskCount3d[0]*taskCount3d[1] ); } - int taskCount1() const { return taskCount3d[0]; } - int taskCount2() const { return taskCount3d[1]; } - int taskCount3() const { return taskCount3d[2]; } + int taskCount0() const { return taskCount3d[0]; } + int taskCount1() const { return taskCount3d[1]; } + int taskCount2() const { return taskCount3d[2]; } TaskInfo() { assert(sizeof(TaskInfo) % 32 == 0); } } __attribute__((aligned(32))); @@ -539,8 +539,8 @@ lRunTask(void *ti) { // Actually run the task taskInfo->func(taskInfo->data, threadIndex, threadCount, taskInfo->taskIndex, taskInfo->taskCount(), - taskInfo->taskIndex1(), taskInfo->taskIndex2(), taskInfo->taskIndex3(), - taskInfo->taskCount1(), taskInfo->taskCount2(), taskInfo->taskCount3()); + taskInfo->taskIndex0(), taskInfo->taskIndex1(), taskInfo->taskIndex2(), + taskInfo->taskCount0(), taskInfo->taskCount1(), taskInfo->taskCount2()); } @@ -582,8 +582,8 @@ lRunTask(LPVOID param) { int threadIndex = 0; int threadCount = 1; ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), - ti->taskIndex1(), ti->taskIndex2(), ti->taskIndex3(), - ti->taskCount1(), ti->taskCount2(), ti->taskCount3()); + ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), + ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); // Signal the event that this task is done ti->taskEvent.set(); @@ -685,8 +685,8 @@ lTaskEntry(void *arg) { TaskInfo *myTask = tg->GetTaskInfo(taskNumber); myTask->func(myTask->data, threadIndex, threadCount, myTask->taskIndex, myTask->taskCount(), - myTask->taskIndex1(), myTask->taskIndex2(), myTask->taskIndex3(), - myTask->taskCount1(), myTask->taskCount2(), myTask->taskCount3()); + myTask->taskIndex0(), myTask->taskIndex1(), myTask->taskIndex2(), + myTask->taskCount0(), myTask->taskCount1(), myTask->taskCount2()); // // Decrement the "number of unfinished tasks" counter in the task @@ -888,8 +888,8 @@ TaskGroup::Sync() { // // FIXME: bogus values for thread index/thread count here as well.. myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount(), - myTask->taskIndex1(), myTask->taskIndex2(), myTask->taskIndex3(), - myTask->taskCount1(), myTask->taskCount2(), myTask->taskCount3()); + myTask->taskIndex0(), myTask->taskIndex1(), myTask->taskIndex2(), + myTask->taskCount0(), myTask->taskCount1(), myTask->taskCount2()); // // Decrement the number of unfinished tasks counter @@ -920,8 +920,8 @@ TaskGroup::Launch(int baseIndex, int count) { // Actually run the task. // Cilk does not expose the task -> thread mapping so we pretend it's 1:1 ti->func(ti->data, ti->taskIndex, ti->taskCount(), - ti->taskIndex1(), ti->taskIndex2(), ti->taskIndex3(), - ti->taskCount1(), ti->taskCount2(), ti->taskCount3()); + ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), + ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); } } @@ -951,8 +951,8 @@ TaskGroup::Launch(int baseIndex, int count) { int threadIndex = omp_get_thread_num(); int threadCount = omp_get_num_threads(); ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), - ti->taskIndex1(), ti->taskIndex2(), ti->taskIndex3(), - ti->taskCount1(), ti->taskCount2(), ti->taskCount3()); + ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), + ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); } } @@ -984,8 +984,8 @@ TaskGroup::Launch(int baseIndex, int count) { int threadCount = ti->taskCount; ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), - ti->taskIndex1(), ti->taskIndex2(), ti->taskIndex3(), - ti->taskCount1(), ti->taskCount2(), ti->taskCount3()); + ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), + ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); }); } @@ -1013,8 +1013,8 @@ TaskGroup::Launch(int baseIndex, int count) { int threadIndex = ti->taskIndex; int threadCount = ti->taskCount; ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), - ti->taskIndex1(), ti->taskIndex2(), ti->taskIndex3(), - ti->taskCount1(), ti->taskCount2(), ti->taskCount3()); + ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), + ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); }); } } @@ -1067,8 +1067,8 @@ FreeTaskGroup(TaskGroup *tg) { /////////////////////////////////////////////////////////////////////////// void -ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count1, int count2, int count3) { - const int count = count1*count2*count3; +ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1, int count2) { + const int count = count0*count1*count2; TaskGroup *taskGroup; if (*taskGroupPtr == NULL) { InitTaskSystem(); @@ -1084,9 +1084,9 @@ ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count1, int count2, ti->func = (TaskFuncType)func; ti->data = data; ti->taskIndex = i; - ti->taskCount3d[0] = count1; - ti->taskCount3d[1] = count2; - ti->taskCount3d[2] = count3; + ti->taskCount3d[0] = count0; + ti->taskCount3d[1] = count1; + ti->taskCount3d[2] = count2; } taskGroup->Launch(baseIndex, count); } diff --git a/func.cpp b/func.cpp index 086be6fe..af2cc05a 100644 --- a/func.cpp +++ b/func.cpp @@ -133,26 +133,26 @@ Function::Function(Symbol *s, Stmt *c) { taskCountSym = m->symbolTable->LookupVariable("taskCount"); Assert(taskCountSym); + taskIndexSym0 = m->symbolTable->LookupVariable("taskIndex0"); + Assert(taskIndexSym0); taskIndexSym1 = m->symbolTable->LookupVariable("taskIndex1"); Assert(taskIndexSym1); taskIndexSym2 = m->symbolTable->LookupVariable("taskIndex2"); Assert(taskIndexSym2); - taskIndexSym3 = m->symbolTable->LookupVariable("taskIndex3"); - Assert(taskIndexSym3); + taskCountSym0 = m->symbolTable->LookupVariable("taskCount0"); + Assert(taskCountSym0); taskCountSym1 = m->symbolTable->LookupVariable("taskCount1"); Assert(taskCountSym1); taskCountSym2 = m->symbolTable->LookupVariable("taskCount2"); Assert(taskCountSym2); - taskCountSym3 = m->symbolTable->LookupVariable("taskCount3"); - Assert(taskCountSym3); } else { threadIndexSym = threadCountSym = taskIndexSym = taskCountSym = NULL; - taskIndexSym1 = taskIndexSym2 = taskIndexSym3 = NULL; - taskCountSym1 = taskCountSym2 = taskCountSym3 = NULL; + taskIndexSym0 = taskIndexSym1 = taskIndexSym2 = NULL; + taskCountSym0 = taskCountSym1 = taskCountSym2 = NULL; } } @@ -244,12 +244,12 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, llvm::Value *threadCount = argIter++; llvm::Value *taskIndex = argIter++; llvm::Value *taskCount = argIter++; + llvm::Value *taskIndex0 = argIter++; llvm::Value *taskIndex1 = argIter++; llvm::Value *taskIndex2 = argIter++; - llvm::Value *taskIndex3 = argIter++; + llvm::Value *taskCount0 = argIter++; llvm::Value *taskCount1 = argIter++; llvm::Value *taskCount2 = argIter++; - llvm::Value *taskCount3 = argIter++; // Copy the function parameter values from the structure into local // storage @@ -282,19 +282,19 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, taskCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount"); ctx->StoreInst(taskCount, taskCountSym->storagePtr); + taskIndexSym0->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex0"); + ctx->StoreInst(taskIndex0, taskIndexSym0->storagePtr); taskIndexSym1->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex1"); ctx->StoreInst(taskIndex1, taskIndexSym1->storagePtr); taskIndexSym2->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex2"); ctx->StoreInst(taskIndex2, taskIndexSym2->storagePtr); - taskIndexSym3->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex3"); - ctx->StoreInst(taskIndex3, taskIndexSym3->storagePtr); + taskCountSym0->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount0"); + ctx->StoreInst(taskCount0, taskCountSym0->storagePtr); taskCountSym1->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount1"); ctx->StoreInst(taskCount1, taskCountSym1->storagePtr); taskCountSym2->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount2"); ctx->StoreInst(taskCount2, taskCountSym2->storagePtr); - taskCountSym3->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount3"); - ctx->StoreInst(taskCount3, taskCountSym3->storagePtr); } else { // Regular, non-task function diff --git a/func.h b/func.h index 4181bba5..88a96dbc 100644 --- a/func.h +++ b/func.h @@ -61,9 +61,9 @@ private: Symbol *maskSymbol; Symbol *threadIndexSym, *threadCountSym; Symbol *taskIndexSym, *taskCountSym; + Symbol *taskIndexSym0, *taskCountSym0; Symbol *taskIndexSym1, *taskCountSym1; Symbol *taskIndexSym2, *taskCountSym2; - Symbol *taskIndexSym3, *taskCountSym3; }; #endif // ISPC_FUNC_H diff --git a/parse.yy b/parse.yy index 653bba62..9a0377c5 100644 --- a/parse.yy +++ b/parse.yy @@ -2276,20 +2276,20 @@ static void lAddThreadIndexCountToSymbolTable(SourcePos pos) { Symbol *taskCountSym = new Symbol("taskCount", pos, type); m->symbolTable->AddVariable(taskCountSym); + Symbol *taskIndexSym0 = new Symbol("taskIndex0", pos, type); + m->symbolTable->AddVariable(taskIndexSym0); Symbol *taskIndexSym1 = new Symbol("taskIndex1", pos, type); m->symbolTable->AddVariable(taskIndexSym1); Symbol *taskIndexSym2 = new Symbol("taskIndex2", pos, type); m->symbolTable->AddVariable(taskIndexSym2); - Symbol *taskIndexSym3 = new Symbol("taskIndex3", pos, type); - m->symbolTable->AddVariable(taskIndexSym3); + Symbol *taskCountSym0 = new Symbol("taskCount0", pos, type); + m->symbolTable->AddVariable(taskCountSym0); Symbol *taskCountSym1 = new Symbol("taskCount1", pos, type); m->symbolTable->AddVariable(taskCountSym1); Symbol *taskCountSym2 = new Symbol("taskCount2", pos, type); m->symbolTable->AddVariable(taskCountSym2); - Symbol *taskCountSym3 = new Symbol("taskCount3", pos, type); - m->symbolTable->AddVariable(taskCountSym3); } diff --git a/type.cpp b/type.cpp index 3ae0cab4..516276f0 100644 --- a/type.cpp +++ b/type.cpp @@ -2961,12 +2961,12 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const { callTypes.push_back(LLVMTypes::Int32Type); // threadCount callTypes.push_back(LLVMTypes::Int32Type); // taskIndex callTypes.push_back(LLVMTypes::Int32Type); // taskCount + callTypes.push_back(LLVMTypes::Int32Type); // taskIndex0 callTypes.push_back(LLVMTypes::Int32Type); // taskIndex1 callTypes.push_back(LLVMTypes::Int32Type); // taskIndex2 - callTypes.push_back(LLVMTypes::Int32Type); // taskIndex3 + callTypes.push_back(LLVMTypes::Int32Type); // taskCount0 callTypes.push_back(LLVMTypes::Int32Type); // taskCount1 callTypes.push_back(LLVMTypes::Int32Type); // taskCount2 - callTypes.push_back(LLVMTypes::Int32Type); // taskCount3 } else // Otherwise we already have the types of the arguments From 84a7a5d1cbb85bb29c210828056326b83cea2d1b Mon Sep 17 00:00:00 2001 From: Evghenii Date: Sat, 26 Oct 2013 16:16:28 +0200 Subject: [PATCH 08/40] added tests for 3d launch --- test_static.cpp | 15 +++++++++------ tests/launch-8.ispc | 42 ++++++++++++++++++++++++++++++++++++++++++ tests/launch-9.ispc | 42 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 6 deletions(-) create mode 100644 tests/launch-8.ispc create mode 100644 tests/launch-9.ispc diff --git a/test_static.cpp b/test_static.cpp index 8985fdb3..fceeb64e 100644 --- a/test_static.cpp +++ b/test_static.cpp @@ -62,17 +62,20 @@ extern "C" { extern void f_di(float *result, double *a, int *b); extern void result(float *val); - void ISPCLaunch(void **handlePtr, void *f, void *d, int); + void ISPCLaunch(void **handlePtr, void *f, void *d, int,int,int); void ISPCSync(void *handle); void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment); } - -void ISPCLaunch(void **handle, void *f, void *d, int count) { + +void ISPCLaunch(void **handle, void *f, void *d, int count0, int count1, int count2) { *handle = (void *)0xdeadbeef; - typedef void (*TaskFuncType)(void *, int, int, int, int); + typedef void (*TaskFuncType)(void *, int, int, int, int, int,int,int, int,int,int); TaskFuncType func = (TaskFuncType)f; - for (int i = 0; i < count; ++i) - func(d, 0, 1, i, count); + int count = count0*count1*count2, idx = 0; + for (int k = 0; k < count2; ++k) + for (int j = 0; j < count1; ++j) + for (int i = 0; i < count0; ++i) + func(d, 0, 1, idx++, count, i,j,k,count0,count1,count2); } void ISPCSync(void *) { diff --git a/tests/launch-8.ispc b/tests/launch-8.ispc new file mode 100644 index 00000000..a0b976e4 --- /dev/null +++ b/tests/launch-8.ispc @@ -0,0 +1,42 @@ + +export uniform int width() { return programCount; } + + +#define N0 10 +#define N1 20 +#define N2 50 +static uniform float array[N2][N1][N0]; + +task void x(const float f) { + uniform int j; + + assert(taskCount == N0*N1*N2); + assert(taskCount0 == N0); + assert(taskCount1 == N1); + assert(taskCount2 == N2); + assert(taskIndex == taskIndex0 + N0*(taskIndex1 + N1*taskIndex2)); + assert(taskIndex0 < N0); + assert(taskIndex1 < N1); + assert(taskIndex2 < N2); + + const uniform int i0 = taskIndex0; + const uniform int i1 = taskIndex1; + const uniform int i2 = taskIndex2; + const uniform int i = taskIndex; + array[i2][i1][i0] = i / 10000.; + cfor (j = 0; j < 10000; ++j) + array[i2][i1][i0] = sin(array[i2][i1][i0]); + if (array[i2][i1][i0] < .02) + array[i2][i1][i0] = i; +} +export void f_f(uniform float RET[], uniform float fFOO[]) { + float f = fFOO[programIndex]; + launch[N2][N1][N0] x(f); + sync; + RET[programIndex] = array[N2-1][N1-1][N0-1]; +} + + +export void result(uniform float RET[]) { + RET[programIndex] = 9999.000000; +} diff --git a/tests/launch-9.ispc b/tests/launch-9.ispc new file mode 100644 index 00000000..761b070c --- /dev/null +++ b/tests/launch-9.ispc @@ -0,0 +1,42 @@ + +export uniform int width() { return programCount; } + + +#define N0 10 +#define N1 20 +#define N2 50 +static uniform float array[N2][N1][N0]; + +task void x(const float f) { + uniform int j; + + assert(taskCount == N0*N1*N2); + assert(taskCount0 == N0); + assert(taskCount1 == N1); + assert(taskCount2 == N2); + assert(taskIndex == taskIndex0 + N0*(taskIndex1 + N1*taskIndex2)); + assert(taskIndex0 < N0); + assert(taskIndex1 < N1); + assert(taskIndex2 < N2); + + const uniform int i0 = taskIndex0; + const uniform int i1 = taskIndex1; + const uniform int i2 = taskIndex2; + const uniform int i = taskIndex; + array[i2][i1][i0] = i / 10000.; + cfor (j = 0; j < 10000; ++j) + array[i2][i1][i0] = sin(array[i2][i1][i0]); + if (array[i2][i1][i0] < .02) + array[i2][i1][i0] = i; +} +export void f_f(uniform float RET[], uniform float fFOO[]) { + float f = fFOO[programIndex]; + launch[N0,N1,N2] x(f); + sync; + RET[programIndex] = array[N2-1][N1-1][N0-1]; +} + + +export void result(uniform float RET[]) { + RET[programIndex] = 9999.000000; +} From 2d2d14744b2baa32c9129664c2f3816df5b915ff Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Wed, 4 Dec 2013 19:00:02 +0400 Subject: [PATCH 09/40] Fixing --opt=force-aligned-memory for LLVM 3.3+ --- ctx.cpp | 30 +++++++++++++++++++++++++----- ispc.cpp | 26 ++++++++++++++++++++++++++ ispc.h | 9 +++++++++ opt.cpp | 16 ++++++++++++---- 4 files changed, 72 insertions(+), 9 deletions(-) diff --git a/ctx.cpp b/ctx.cpp index c1a7e61a..e5c60363 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -316,7 +316,11 @@ FunctionEmitContext::FunctionEmitContext(Function *func, Symbol *funSym, llvm::BasicBlock *offBB = llvm::BasicBlock::Create(*g->ctx, "entry", (llvm::Function *)offFunc, 0); - new llvm::StoreInst(LLVMMaskAllOff, globalAllOnMaskPtr, offBB); + llvm::StoreInst *inst = + new llvm::StoreInst(LLVMMaskAllOff, globalAllOnMaskPtr, offBB); + if (g->opt.forceAlignedMemory) { + inst->setAlignment(g->target->getNativeVectorAlignment()); + } llvm::ReturnInst::Create(*g->ctx, offBB); } @@ -2437,7 +2441,13 @@ FunctionEmitContext::LoadInst(llvm::Value *ptr, const char *name) { if (name == NULL) name = LLVMGetName(ptr, "_load"); - llvm::Instruction *inst = new llvm::LoadInst(ptr, name, bblock); + llvm::LoadInst *inst = new llvm::LoadInst(ptr, name, bblock); + + if (g->opt.forceAlignedMemory && + llvm::dyn_cast(pt->getElementType())) { + inst->setAlignment(g->target->getNativeVectorAlignment()); + } + AddDebugPos(inst); return inst; } @@ -2719,7 +2729,7 @@ FunctionEmitContext::AllocaInst(llvm::Type *llvmType, inst = new llvm::AllocaInst(llvmType, name ? name : "", bblock); // If no alignment was specified but we have an array of a uniform - // type, then align it to 4 * the native vector width; it's not + // type, then align it to the native vector alignment; it's not // unlikely that this array will be loaded into varying variables with // what will be aligned accesses if the uniform -> varying load is done // in regular chunks. @@ -2727,7 +2737,7 @@ FunctionEmitContext::AllocaInst(llvm::Type *llvmType, llvm::dyn_cast(llvmType); if (align == 0 && arrayType != NULL && !llvm::isa(arrayType->getElementType())) - align = 4 * g->target->getNativeVectorWidth(); + align = g->target->getNativeVectorAlignment(); if (align != 0) inst->setAlignment(align); @@ -2986,7 +2996,17 @@ FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr) { return; } - llvm::Instruction *inst = new llvm::StoreInst(value, ptr, bblock); + llvm::PointerType *pt = + llvm::dyn_cast(ptr->getType()); + AssertPos(currentPos, pt != NULL); + + llvm::StoreInst *inst = new llvm::StoreInst(value, ptr, bblock); + + if (g->opt.forceAlignedMemory && + llvm::dyn_cast(pt->getElementType())) { + inst->setAlignment(g->target->getNativeVectorAlignment()); + } + AddDebugPos(inst); } diff --git a/ispc.cpp b/ispc.cpp index 36d31580..b1790dc3 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -191,6 +191,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : m_tf_attributes(NULL), #endif m_nativeVectorWidth(-1), + m_nativeVectorAlignment(-1), m_dataTypeWidth(-1), m_vectorWidth(-1), m_generatePIC(pic), @@ -309,6 +310,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "sse2-i32x4")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 32; this->m_vectorWidth = 4; this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" @@ -325,6 +327,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "sse2-i32x8")) { this->m_isa = Target::SSE2; this->m_nativeVectorWidth = 4; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+sse,+sse2,-sse3,-sse4a,-ssse3,-popcnt" @@ -341,6 +344,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "sse4-i32x4")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 32; this->m_vectorWidth = 4; // TODO: why not sse42 and popcnt? @@ -359,6 +363,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "sse4-i32x8")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 4; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" @@ -374,6 +379,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : else if (!strcasecmp(isa, "sse4-i8x16")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 16; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 8; this->m_vectorWidth = 16; this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" @@ -389,6 +395,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : else if (!strcasecmp(isa, "sse4-i16x8")) { this->m_isa = Target::SSE4; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 16; this->m_vectorWidth = 8; this->m_attributes = "+sse,+sse2,+sse3,-sse4a,+ssse3,-popcnt,+cmov" @@ -405,6 +412,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "generic-x4")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 4; + this->m_nativeVectorAlignment = 16; this->m_vectorWidth = 4; this->m_maskingIsFree = true; this->m_maskBitCount = 1; @@ -416,6 +424,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "generic-x8")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 32; this->m_vectorWidth = 8; this->m_maskingIsFree = true; this->m_maskBitCount = 1; @@ -427,6 +436,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "generic-x16")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 16; + this->m_nativeVectorAlignment = 64; this->m_vectorWidth = 16; this->m_maskingIsFree = true; this->m_maskBitCount = 1; @@ -438,6 +448,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "generic-x32")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 32; + this->m_nativeVectorAlignment = 64; this->m_vectorWidth = 32; this->m_maskingIsFree = true; this->m_maskBitCount = 1; @@ -449,6 +460,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "generic-x64")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 64; + this->m_nativeVectorAlignment = 64; this->m_vectorWidth = 64; this->m_maskingIsFree = true; this->m_maskBitCount = 1; @@ -460,6 +472,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "generic-x1")) { this->m_isa = Target::GENERIC; this->m_nativeVectorWidth = 1; + this->m_nativeVectorAlignment = 16; this->m_vectorWidth = 1; this->m_maskingIsFree = false; this->m_maskBitCount = 32; @@ -467,6 +480,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : else if (!strcasecmp(isa, "avx1-i32x4")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 32; this->m_vectorWidth = 4; this->m_attributes = "+avx,+popcnt,+cmov"; @@ -478,6 +492,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "avx1-i32x8")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+avx,+popcnt,+cmov"; @@ -488,6 +503,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "avx1-i64x4")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 64; this->m_vectorWidth = 4; this->m_attributes = "+avx,+popcnt,+cmov"; @@ -499,6 +515,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "avx1-i32x16")) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 32; this->m_vectorWidth = 16; this->m_attributes = "+avx,+popcnt,+cmov"; @@ -509,6 +526,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "avx1.1-i32x8")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+avx,+popcnt,+cmov,+f16c" @@ -530,6 +548,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "avx1.1-i32x16")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 32; this->m_vectorWidth = 16; this->m_attributes = "+avx,+popcnt,+cmov,+f16c" @@ -550,6 +569,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : else if (!strcasecmp(isa, "avx1.1-i64x4")) { this->m_isa = Target::AVX11; this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 64; this->m_vectorWidth = 4; this->m_attributes = "+avx,+popcnt,+cmov,+f16c" @@ -571,6 +591,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "avx2-i32x8")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 32; this->m_vectorWidth = 8; this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" @@ -596,6 +617,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "avx2-i32x16")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 16; + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 32; this->m_vectorWidth = 16; this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" @@ -620,6 +642,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : else if (!strcasecmp(isa, "avx2-i64x4")) { this->m_isa = Target::AVX2; this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ + this->m_nativeVectorAlignment = 32; this->m_dataTypeWidth = 64; this->m_vectorWidth = 4; this->m_attributes = "+avx2,+popcnt,+cmov,+f16c" @@ -645,6 +668,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : else if (!strcasecmp(isa, "neon-i8x16")) { this->m_isa = Target::NEON8; this->m_nativeVectorWidth = 16; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 8; this->m_vectorWidth = 16; this->m_attributes = "+neon,+fp16"; @@ -655,6 +679,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : else if (!strcasecmp(isa, "neon-i16x8")) { this->m_isa = Target::NEON16; this->m_nativeVectorWidth = 8; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 16; this->m_vectorWidth = 8; this->m_attributes = "+neon,+fp16"; @@ -666,6 +691,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : !strcasecmp(isa, "neon-i32x4")) { this->m_isa = Target::NEON32; this->m_nativeVectorWidth = 4; + this->m_nativeVectorAlignment = 16; this->m_dataTypeWidth = 32; this->m_vectorWidth = 4; this->m_attributes = "+neon,+fp16"; diff --git a/ispc.h b/ispc.h index b319d656..4b333861 100644 --- a/ispc.h +++ b/ispc.h @@ -260,6 +260,8 @@ public: int getNativeVectorWidth() const {return m_nativeVectorWidth;} + int getNativeVectorAlignment() const {return m_nativeVectorAlignment;} + int getDataTypeWidth() const {return m_dataTypeWidth;} int getVectorWidth() const {return m_vectorWidth;} @@ -332,6 +334,13 @@ private: SSE, 8 for AVX, etc.) */ int m_nativeVectorWidth; + /** Native vector alignment in bytes. Theoretically this may be derived + from the vector size, but it's better to manage directly the alignement. + It allows easier experimenting and better fine tuning for particular + platform. This information is primatily used when + --opt=force-aligned-memory is used. */ + int m_nativeVectorAlignment; + /** Data type with in bits. Typically it's 32, but could be 8, 16 or 64. For generic it's -1, which means undefined. */ int m_dataTypeWidth; diff --git a/opt.cpp b/opt.cpp index 3e320b4b..9059c746 100644 --- a/opt.cpp +++ b/opt.cpp @@ -904,7 +904,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) { lCopyMetadata(castPtr, callInst); int align; if (g->opt.forceAlignedMemory) - align = 0; + align = g->target->getNativeVectorAlignment(); else align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8; name = LLVMGetName(callInst->getArgOperand(0), "_load"); @@ -946,7 +946,7 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) { new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL); int align; if (g->opt.forceAlignedMemory) - align = 0; + align = g->target->getNativeVectorAlignment(); else align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8; storeInst->setAlignment(align); @@ -2758,7 +2758,8 @@ lImproveMaskedStore(llvm::CallInst *callInst) { lCopyMetadata(lvalue, callInst); llvm::Instruction *store = new llvm::StoreInst(rvalue, lvalue, false /* not volatile */, - g->opt.forceAlignedMemory ? 0 : info->align); + g->opt.forceAlignedMemory ? + g->target->getNativeVectorAlignment() : info->align); lCopyMetadata(store, callInst); llvm::ReplaceInstWithInst(callInst, store); return true; @@ -2821,7 +2822,8 @@ lImproveMaskedLoad(llvm::CallInst *callInst, callInst); llvm::Instruction *load = new llvm::LoadInst(ptr, callInst->getName(), false /* not volatile */, - g->opt.forceAlignedMemory ? 0 : info->align, + g->opt.forceAlignedMemory ? + g->target->getNativeVectorAlignment() : info->align, (llvm::Instruction *)NULL); lCopyMetadata(load, callInst); llvm::ReplaceInstWithInst(callInst, load); @@ -3226,6 +3228,9 @@ lEmitLoads(llvm::Value *basePtr, std::vector &loadOps, } case 4: { // 4-wide vector load + if (g->opt.forceAlignedMemory) { + align = g->target->getNativeVectorAlignment(); + } llvm::VectorType *vt = llvm::VectorType::get(LLVMTypes::Int32Type, 4); loadOps[i].load = lGEPAndLoad(basePtr, start, align, @@ -3234,6 +3239,9 @@ lEmitLoads(llvm::Value *basePtr, std::vector &loadOps, } case 8: { // 8-wide vector load + if (g->opt.forceAlignedMemory) { + align = g->target->getNativeVectorAlignment(); + } llvm::VectorType *vt = llvm::VectorType::get(LLVMTypes::Int32Type, 8); loadOps[i].load = lGEPAndLoad(basePtr, start, align, From 040ef0bc491a956dd3fbcae31f1f456f25e785ec Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Wed, 4 Dec 2013 16:29:15 +0400 Subject: [PATCH 10/40] Adding r196261 patch to previous vzeroupper fix for 3.3 --- ...93261_bug17631_196261_win_vzeroupper.patch | 115 ++++++++++++++++++ .../3_3_r193261_bug17631_win_vzeroupper.patch | 69 ----------- 2 files changed, 115 insertions(+), 69 deletions(-) create mode 100644 llvm_patches/3_3_r193261_bug17631_196261_win_vzeroupper.patch delete mode 100644 llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch diff --git a/llvm_patches/3_3_r193261_bug17631_196261_win_vzeroupper.patch b/llvm_patches/3_3_r193261_bug17631_196261_win_vzeroupper.patch new file mode 100644 index 00000000..8f0a790b --- /dev/null +++ b/llvm_patches/3_3_r193261_bug17631_196261_win_vzeroupper.patch @@ -0,0 +1,115 @@ +From b9b016cda57d8afc26a150de7ee329b54a994c85 Mon Sep 17 00:00:00 2001 +From: Michael Liao +Date: Mon, 21 Oct 2013 17:47:58 -0700 +Subject: [PATCH] Fix PR17631 + +- Skip instructions added in prolog. For specific targets, prolog may + insert helper function calls (e.g. _chkstk will be called when + there're more than 4K bytes allocated on stack). However, these + helpers don't use/def YMM/XMM registers. + It also include second fix for the problem: r196261+r196391. + +diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp +index 477f75a..0d37a7d 100644 +--- lib/Target/X86/X86VZeroUpper.cpp ++++ lib/Target/X86/X86VZeroUpper.cpp +@@ -121,7 +121,7 @@ + } + + static bool clobbersAllYmmRegs(const MachineOperand &MO) { +- for (unsigned reg = X86::YMM0; reg < X86::YMM15; ++reg) { ++ for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { + if (!MO.clobbersPhysReg(reg)) + return false; + } +@@ -143,6 +143,21 @@ + return false; + } + ++/// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this ++/// instruction. ++static bool clobbersAnyYmmReg(MachineInstr *MI) { ++ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { ++ const MachineOperand &MO = MI->getOperand(i); ++ if (!MO.isRegMask()) ++ continue; ++ for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { ++ if (MO.clobbersPhysReg(reg)) ++ return true; ++ } ++ } ++ return false; ++} ++ + /// runOnMachineFunction - Loop over all of the basic blocks, inserting + /// vzero upper instructions before function calls. + bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { +@@ -226,8 +241,9 @@ + bool BBHasCall = false; + + for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { ++ DebugLoc dl = I->getDebugLoc(); + MachineInstr *MI = I; +- DebugLoc dl = I->getDebugLoc(); ++ + bool isControlFlow = MI->isCall() || MI->isReturn(); + + // Shortcut: don't need to check regular instructions in dirty state. +@@ -246,6 +262,14 @@ + if (!isControlFlow) + continue; + ++ // If the call won't clobber any YMM register, skip it as well. It usually ++ // happens on helper function calls (such as '_chkstk', '_ftol2') where ++ // standard calling convention is not used (RegMask is not used to mark ++ // register clobbered and register usage (def/imp-def/use) is well-dfined ++ // and explicitly specified. ++ if (MI->isCall() && !clobbersAnyYmmReg(MI)) ++ continue; ++ + BBHasCall = true; + + // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX +diff --git a/test/CodeGen/X86/pr17631.ll b/test/CodeGen/X86/pr17631.ll +new file mode 100644 +index 0000000..a572ff2 +--- /dev/null ++++ test/CodeGen/X86/pr17631.ll +@@ -0,0 +1,34 @@ ++; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s ++ ++%struct_type = type { [64 x <8 x float>], <8 x float> } ++ ++; Function Attrs: nounwind readnone ++declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) ++ ++; Function Attrs: nounwind ++define i32 @equal(<8 x i32> %A) { ++allocas: ++ %first_alloc = alloca [64 x <8 x i32>] ++ %second_alloc = alloca %struct_type ++ ++ %A1 = bitcast <8 x i32> %A to <8 x float> ++ %A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1) ++ ret i32 %A2 ++} ++ ++; CHECK: equal ++; CHECK-NOT: vzeroupper ++; CHECK: _chkstk ++; CHECK: ret ++ ++define <8 x float> @foo(<8 x float> %y, i64* %p, double %x) { ++ %i = fptoui double %x to i64 ++ store i64 %i, i64* %p ++ %ret = fadd <8 x float> %y, %y ++ ret <8 x float> %ret ++} ++ ++; CHECK: foo ++; CHECK-NOT: vzeroupper ++; CHECK: _ftol2 ++; CHECK: ret +-- +1.8.1.2 + diff --git a/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch b/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch deleted file mode 100644 index b6abb1d3..00000000 --- a/llvm_patches/3_3_r193261_bug17631_win_vzeroupper.patch +++ /dev/null @@ -1,69 +0,0 @@ -From b9b016cda57d8afc26a150de7ee329b54a994c85 Mon Sep 17 00:00:00 2001 -From: Michael Liao -Date: Mon, 21 Oct 2013 17:47:58 -0700 -Subject: [PATCH] Fix PR17631 - -- Skip instructions added in prolog. For specific targets, prolog may - insert helper function calls (e.g. _chkstk will be called when - there're more than 4K bytes allocated on stack). However, these - helpers don't use/def YMM/XMM registers. ---- - lib/Target/X86/X86VZeroUpper.cpp | 11 ++++++++++- - test/CodeGen/X86/pr17631.ll | 22 ++++++++++++++++++++++ - 2 files changed, 32 insertions(+), 1 deletion(-) - create mode 100644 test/CodeGen/X86/pr17631.ll - -diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp -index 477f75a..0d37a7d 100644 ---- lib/Target/X86/X86VZeroUpper.cpp -+++ lib/Target/X86/X86VZeroUpper.cpp -@@ -231,8 +231,17 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF, - bool BBHasCall = false; - - for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) { -- MachineInstr *MI = I; - DebugLoc dl = I->getDebugLoc(); -+ MachineInstr *MI = I; -+ -+ // Don't need to check instructions added in prolog. -+ // In prolog, special function calls may be added for specific targets -+ // (e.g. on Windows, a prolog helper '_chkstk' is called when the local -+ // variables exceed 4K bytes on stack.) These helpers won't use/def YMM/XMM -+ // registers. -+ if (MI->getFlag(MachineInstr::FrameSetup)) -+ continue; -+ - bool isControlFlow = MI->isCall() || MI->isReturn(); - - // Shortcut: don't need to check regular instructions in dirty state. -diff --git a/test/CodeGen/X86/pr17631.ll b/test/CodeGen/X86/pr17631.ll -new file mode 100644 -index 0000000..a572ff2 ---- /dev/null -+++ test/CodeGen/X86/pr17631.ll -@@ -0,0 +1,22 @@ -+; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-pc-win32 | FileCheck %s -+ -+%struct_type = type { [64 x <8 x float>], <8 x float> } -+ -+; Function Attrs: nounwind readnone -+declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) -+ -+; Function Attrs: nounwind -+define i32 @equal(<8 x i32> %A) { -+allocas: -+ %first_alloc = alloca [64 x <8 x i32>] -+ %second_alloc = alloca %struct_type -+ -+ %A1 = bitcast <8 x i32> %A to <8 x float> -+ %A2 = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %A1) -+ ret i32 %A2 -+} -+ -+; CHECK: equal -+; CHECK-NOT: vzeroupper -+; CHECK: _chkstk -+; CHECK: ret --- -1.8.1.2 - From 5012ba34b4b5952866ca7d4fe512c7cbd77be3e4 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Thu, 5 Dec 2013 19:38:46 +0400 Subject: [PATCH 11/40] increase data for examples --- examples/mandelbrot_tasks/mandelbrot_tasks.cpp | 4 ++-- examples/noise/noise.cpp | 4 ++-- examples/stencil/stencil.cpp | 2 +- perf.ini | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp index 698daf0f..1c4d2ca5 100644 --- a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp @@ -74,8 +74,8 @@ static void usage() { } int main(int argc, char *argv[]) { - unsigned int width = 1536; - unsigned int height = 1024; + unsigned int width = 1536 * 8; + unsigned int height = 1024 * 8; float x0 = -2; float x1 = 1; float y0 = -1; diff --git a/examples/noise/noise.cpp b/examples/noise/noise.cpp index 123f98c7..86b4f761 100644 --- a/examples/noise/noise.cpp +++ b/examples/noise/noise.cpp @@ -66,8 +66,8 @@ writePPM(float *buf, int width, int height, const char *fn) { int main() { - unsigned int width = 768; - unsigned int height = 768; + unsigned int width = 768 * 4; + unsigned int height = 768 * 4; float x0 = -10; float x1 = 10; float y0 = -10; diff --git a/examples/stencil/stencil.cpp b/examples/stencil/stencil.cpp index 593d901f..9cd12674 100644 --- a/examples/stencil/stencil.cpp +++ b/examples/stencil/stencil.cpp @@ -67,7 +67,7 @@ void InitData(int Nx, int Ny, int Nz, float *A[2], float *vsq) { int main() { - int Nx = 256, Ny = 256, Nz = 256; + int Nx = 256 * 2, Ny = 256 * 2, Nz = 256 * 2; int width = 4; float *Aserial[2], *Aispc[2]; Aserial[0] = new float [Nx * Ny * Nz]; diff --git a/perf.ini b/perf.ini index 249c25f4..b44a2853 100755 --- a/perf.ini +++ b/perf.ini @@ -10,7 +10,7 @@ %**************************************************************************************************** AOBench aobench -10 512 512 +3 2048 2048 #*** Deferred Shading deferred @@ -41,7 +41,7 @@ options #*** Ray Tracer rt -sponza +sponza --scale=6.0 #*** 3D Stencil stencil From 2951cad365f7dbaf4bc3f73b0a822ddd4a40357b Mon Sep 17 00:00:00 2001 From: evghenii Date: Mon, 9 Dec 2013 13:10:26 +0100 Subject: [PATCH 12/40] added description for multi-dimensional tasking --- docs/ispc.rst | 63 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 12 deletions(-) diff --git a/docs/ispc.rst b/docs/ispc.rst index eac9b24e..3aab730b 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -3012,8 +3012,8 @@ Intel® Cilk(tm), Intel® Thread Building Blocks or another task system), and for tasks to use ``ispc`` for SPMD parallelism across the vector lanes as appropriate. Alternatively, ``ispc`` also has support for launching tasks from ``ispc`` code. The approach is similar to Intel® Cilk's task launch -feature. (See the ``examples/mandelbrot_tasks`` example to see it used in -a small example.) +feature. (See the ``examples/mandelbrot_tasks`` and +``examples/mandelbrot_tasks3d`` examples to see it used in a small example.) Any function that is launched as a task must be declared with the ``task`` qualifier: @@ -3108,6 +3108,38 @@ executing the current task. The ``threadIndex`` can be used for accessing data that is private to the current thread and thus doesn't require synchronization to access under parallel execution. +The tasking system also supports multi-dimensional partitioning (currently up +to three dimensions). To launch a 3D grid of tasks, for example with ``N0``, +``N1`` and ``N2`` tasks in x-, y- and z-dimension respectively + +:: + + float data[N2][N1][N0] + task void foo_task() + { + data[taskIndex2][taskIndex1][threadIndex0] = taskIndex; + } + +we use the following ``launch`` expressions: + +:: + + launch [N2][N1][N0] foo_task() + +or + +:: + + launch [N0,N1,N2] foo_task() + +Value of ``taskIndex`` is equal to ``taskIndex0 + taskCount0*(taskIndex1 + +taskCount1*taskIndex2)`` and it ranges from ``0`` to ``taskCount-1``, where +``taskCount = taskCount0*taskCount1*taskCount2``. If ``N1`` or/and ``N2`` are +not specified in the ``launch`` expression, a value of ``1`` is assumed. +Finally, for an one-dimensional grid of tasks, ``taskIndex`` is equivalent to +``taskIndex0`` and ``taskCount`` is equivalent to ``taskCount0``. + + Task Parallelism: Runtime Requirements -------------------------------------- @@ -3138,7 +3170,7 @@ manage tasks in ``ispc``: :: void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment); - void ISPCLaunch(void **handlePtr, void *f, void *data, int count); + void ISPCLaunch(void **handlePtr, void *f, void *data, int count0, int count1, int count2); void ISPCSync(void *handle); All three of these functions take an opaque handle (or a pointer to an @@ -3175,16 +3207,20 @@ tasks. Each ``launch`` statement in ``ispc`` code causes a call to after the handle pointer to the function are relatively straightforward; the ``void *f`` parameter holds a pointer to a function to call to run the work for this task, ``data`` holds a pointer to data to pass to this -function, and ``count`` is the number of instances of this function to -enqueue for asynchronous execution. (In other words, ``count`` corresponds -to the value ``n`` in a multiple-task launch statement like ``launch[n]``.) +function, and ``count0``, ``count1`` and ``count2`` are the number of instances +of this function to enqueue for asynchronous execution. (In other words, +``count0``, ``count1`` and ``count2`` correspond to the value ``n0``, ``n1`` +and ``n2`` in a multiple-task launch statement like ``launch[n2][n1][n0]`` or +``launch [n0,n1,n2]`` respectively.) The signature of the provided function pointer ``f`` is :: void (*TaskFuncPtr)(void *data, int threadIndex, int threadCount, - int taskIndex, int taskCount) + int taskIndex, int taskCount, + int taskIndex0, int taskIndex1, int taskIndex2, + int taskCount0, int taskCount1, int taskCount2); When this function pointer is called by one of the hardware threads managed by the task system, the ``data`` pointer passed to ``ISPCLaunch()`` should @@ -3194,11 +3230,14 @@ number of hardware threads that have been spawned to run tasks and uniquely identifying the hardware thread that is running the task. (These values can be used to index into thread-local storage.) -The value of ``taskCount`` should be the number of tasks launched in the -``launch`` statement that caused the call to ``ISPCLaunch()`` and each of -the calls to this function should be given a unique value of ``taskIndex`` -between zero and ``taskCount``, to distinguish which of the instances -of the set of launched tasks is running. +The value of ``taskCount`` should be the total number of tasks launched in the +``launch`` statement (it must be equal to ``taskCount0*taskCount1*taskCount2``) +that caused the call to ``ISPCLaunch()`` and each of the calls to this function +should be given a unique value of ``taskIndex``, ``taskIndex0``, ``taskIndex1`` +and ``taskIndex2`` between zero and ``taskCount``, ``taskCount0``, +``taskCount1`` and ``taskCount2`` respectively, with ``taskIndex = taskIndex0 ++ taskCount0*(taskIndex1 + taskCount1*taskIndex2)``, to distinguish which of +the instances of the set of launched tasks is running. From 98c56c214a389208d6f5bccfde331ccc37755daf Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Fri, 6 Dec 2013 18:57:35 +0400 Subject: [PATCH 13/40] changing of examples --- examples/aobench/ao.cpp | 25 +++++++++----- examples/deferred/main.cpp | 22 +++++++++---- examples/mandelbrot/mandelbrot.cpp | 23 +++++++++++-- .../mandelbrot_tasks/mandelbrot_tasks.cpp | 27 ++++++++------- examples/noise/noise.cpp | 26 ++++++++++++--- examples/rt/rt.cpp | 33 ++++++++++--------- examples/stencil/stencil.cpp | 31 +++++++++++++---- examples/volume_rendering/volume.cpp | 19 ++++++++--- perf.ini | 14 +++++--- perf.py | 21 +++++++----- 10 files changed, 166 insertions(+), 75 deletions(-) diff --git a/examples/aobench/ao.cpp b/examples/aobench/ao.cpp index 2286316d..b4e2833d 100644 --- a/examples/aobench/ao.cpp +++ b/examples/aobench/ao.cpp @@ -60,7 +60,7 @@ using namespace ispc; extern void ao_serial(int w, int h, int nsubsamples, float image[]); -static unsigned int test_iterations; +static unsigned int test_iterations[] = {3, 7, 1}; static unsigned int width, height; static unsigned char *img; static float *fimg; @@ -106,16 +106,20 @@ savePPM(const char *fname, int w, int h) int main(int argc, char **argv) { - if (argc != 4) { + if (argc < 3) { printf ("%s\n", argv[0]); - printf ("Usage: ao [num test iterations] [width] [height]\n"); + printf ("Usage: ao [width] [height] [ispc iterations] [tasks iterations] [serial iterations]\n"); getchar(); exit(-1); } else { - test_iterations = atoi(argv[1]); - width = atoi (argv[2]); - height = atoi (argv[3]); + if (argc == 6) { + for (int i = 0; i < 3; i++) { + test_iterations[i] = atoi(argv[3 + i]); + } + } + width = atoi (argv[1]); + height = atoi (argv[2]); } // Allocate space for output images @@ -127,13 +131,14 @@ int main(int argc, char **argv) // time for any of them. // double minTimeISPC = 1e30; - for (unsigned int i = 0; i < test_iterations; i++) { + for (unsigned int i = 0; i < test_iterations[0]; i++) { memset((void *)fimg, 0, sizeof(float) * width * height * 3); assert(NSUBSAMPLES == 2); reset_and_start_timer(); ao_ispc(width, height, NSUBSAMPLES, fimg); double t = get_elapsed_mcycles(); + printf("@time of ISPC run:\t\t\t[%.3f] million cycles\n", t); minTimeISPC = std::min(minTimeISPC, t); } @@ -147,13 +152,14 @@ int main(int argc, char **argv) // minimum time for any of them. // double minTimeISPCTasks = 1e30; - for (unsigned int i = 0; i < test_iterations; i++) { + for (unsigned int i = 0; i < test_iterations[1]; i++) { memset((void *)fimg, 0, sizeof(float) * width * height * 3); assert(NSUBSAMPLES == 2); reset_and_start_timer(); ao_ispc_tasks(width, height, NSUBSAMPLES, fimg); double t = get_elapsed_mcycles(); + printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", t); minTimeISPCTasks = std::min(minTimeISPCTasks, t); } @@ -167,11 +173,12 @@ int main(int argc, char **argv) // minimum time. // double minTimeSerial = 1e30; - for (unsigned int i = 0; i < test_iterations; i++) { + for (unsigned int i = 0; i < test_iterations[2]; i++) { memset((void *)fimg, 0, sizeof(float) * width * height * 3); reset_and_start_timer(); ao_serial(width, height, NSUBSAMPLES, fimg); double t = get_elapsed_mcycles(); + printf("@time of serial run:\t\t\t\t[%.3f] million cycles\n", t); minTimeSerial = std::min(minTimeSerial, t); } diff --git a/examples/deferred/main.cpp b/examples/deferred/main.cpp index 4f2be879..d7f62f50 100644 --- a/examples/deferred/main.cpp +++ b/examples/deferred/main.cpp @@ -62,10 +62,16 @@ /////////////////////////////////////////////////////////////////////////// int main(int argc, char** argv) { - if (argc != 2) { - printf("usage: deferred_shading \n"); + if (argc < 2) { + printf("usage: deferred_shading [tasks iterations] [serial iterations]\n"); return 1; } + static unsigned int test_iterations[] = {5, 3, 500}; //last value is for nframes, it is scale. + if (argc == 5) { + for (int i = 0; i < 3; i++) { + test_iterations[i] = atoi(argv[2 + i]); + } + } InputData *input = CreateInputDataFromFile(argv[1]); if (!input) { @@ -81,9 +87,9 @@ int main(int argc, char** argv) { InitDynamicCilk(input); #endif // __cilk - int nframes = 5; + int nframes = test_iterations[2]; double ispcCycles = 1e30; - for (int i = 0; i < 5; ++i) { + for (int i = 0; i < test_iterations[0]; ++i) { framebuffer.clear(); reset_and_start_timer(); for (int j = 0; j < nframes; ++j) @@ -91,6 +97,7 @@ int main(int argc, char** argv) { VISUALIZE_LIGHT_COUNT, framebuffer.r, framebuffer.g, framebuffer.b); double mcycles = get_elapsed_mcycles() / nframes; + printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", mcycles); ispcCycles = std::min(ispcCycles, mcycles); } printf("[ispc static + tasks]:\t\t[%.3f] million cycles to render " @@ -98,14 +105,16 @@ int main(int argc, char** argv) { input->header.framebufferWidth, input->header.framebufferHeight); WriteFrame("deferred-ispc-static.ppm", input, framebuffer); + nframes = 3; #ifdef __cilk double dynamicCilkCycles = 1e30; - for (int i = 0; i < 5; ++i) { + for (int i = 0; i < test_iterations[1]; ++i) { framebuffer.clear(); reset_and_start_timer(); for (int j = 0; j < nframes; ++j) DispatchDynamicCilk(input, &framebuffer); double mcycles = get_elapsed_mcycles() / nframes; + printf("@time of serial run:\t\t\t[%.3f] million cycles\n", mcycles); dynamicCilkCycles = std::min(dynamicCilkCycles, mcycles); } printf("[ispc + Cilk dynamic]:\t\t[%.3f] million cycles to render image\n", @@ -114,12 +123,13 @@ int main(int argc, char** argv) { #endif // __cilk double serialCycles = 1e30; - for (int i = 0; i < 5; ++i) { + for (int i = 0; i < test_iterations[1]; ++i) { framebuffer.clear(); reset_and_start_timer(); for (int j = 0; j < nframes; ++j) DispatchDynamicC(input, &framebuffer); double mcycles = get_elapsed_mcycles() / nframes; + printf("@time of serial run:\t\t\t[%.3f] million cycles\n", mcycles); serialCycles = std::min(serialCycles, mcycles); } printf("[C++ serial dynamic, 1 core]:\t[%.3f] million cycles to render image\n", diff --git a/examples/mandelbrot/mandelbrot.cpp b/examples/mandelbrot/mandelbrot.cpp index d2bebb96..fafc00d0 100644 --- a/examples/mandelbrot/mandelbrot.cpp +++ b/examples/mandelbrot/mandelbrot.cpp @@ -42,6 +42,7 @@ #include #include "../timing.h" #include "mandelbrot_ispc.h" +#include using namespace ispc; extern void mandelbrot_serial(float x0, float y0, float x1, float y1, @@ -67,7 +68,8 @@ writePPM(int *buf, int width, int height, const char *fn) { } -int main() { +int main(int argc, char *argv[]) { + static unsigned int test_iterations[] = {3, 3}; unsigned int width = 768; unsigned int height = 512; float x0 = -2; @@ -75,6 +77,19 @@ int main() { float y0 = -1; float y1 = 1; + if (argc > 1) { + if (strncmp(argv[1], "--scale=", 8) == 0) { + float scale = atof(argv[1] + 8); + width *= scale; + height *= scale; + } + } + if ((argc == 3) || (argc == 4)) { + for (int i = 0; i < 2; i++) { + test_iterations[i] = atoi(argv[argc - 2 + i]); + } + } + int maxIterations = 256; int *buf = new int[width*height]; @@ -83,10 +98,11 @@ int main() { // time of three runs. // double minISPC = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[0]; ++i) { reset_and_start_timer(); mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf); double dt = get_elapsed_mcycles(); + printf("@time of ISPC run:\t\t\t[%.3f] million cycles\n", dt); minISPC = std::min(minISPC, dt); } @@ -102,10 +118,11 @@ int main() { // minimum time. // double minSerial = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[1]; ++i) { reset_and_start_timer(); mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf); double dt = get_elapsed_mcycles(); + printf("@time of serial run:\t\t\t[%.3f] million cycles\n", dt); minSerial = std::min(minSerial, dt); } diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp index 1c4d2ca5..32db45bc 100644 --- a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp @@ -69,21 +69,20 @@ writePPM(int *buf, int width, int height, const char *fn) { static void usage() { - fprintf(stderr, "usage: mandelbrot [--scale=]\n"); + fprintf(stderr, "usage: mandelbrot [--scale=] [tasks iterations] [serial iterations]\n"); exit(1); } int main(int argc, char *argv[]) { - unsigned int width = 1536 * 8; - unsigned int height = 1024 * 8; + static unsigned int test_iterations[] = {7, 1}; + unsigned int width = 1536; + unsigned int height = 1024; float x0 = -2; float x1 = 1; float y0 = -1; float y1 = 1; - if (argc == 1) - ; - else if (argc == 2) { + if (argc > 1) { if (strncmp(argv[1], "--scale=", 8) == 0) { float scale = atof(argv[1] + 8); if (scale == 0.f) @@ -94,11 +93,13 @@ int main(int argc, char *argv[]) { width = (width + 0xf) & ~0xf; height = (height + 0xf) & ~0xf; } - else - usage(); } - else - usage(); + if ((argc == 3) || (argc == 4)) { + for (int i = 0; i < 2; i++) { + test_iterations[i] = atoi(argv[argc - 2 + i]); + } + } + int maxIterations = 512; int *buf = new int[width*height]; @@ -108,13 +109,14 @@ int main(int argc, char *argv[]) { // time of three runs. // double minISPC = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[0]; ++i) { // Clear out the buffer for (unsigned int i = 0; i < width * height; ++i) buf[i] = 0; reset_and_start_timer(); mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf); double dt = get_elapsed_mcycles(); + printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", dt); minISPC = std::min(minISPC, dt); } @@ -127,13 +129,14 @@ int main(int argc, char *argv[]) { // minimum time. // double minSerial = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[1]; ++i) { // Clear out the buffer for (unsigned int i = 0; i < width * height; ++i) buf[i] = 0; reset_and_start_timer(); mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf); double dt = get_elapsed_mcycles(); + printf("@time of serial run:\t\t\t[%.3f] million cycles\n", dt); minSerial = std::min(minSerial, dt); } diff --git a/examples/noise/noise.cpp b/examples/noise/noise.cpp index 86b4f761..0664bbd9 100644 --- a/examples/noise/noise.cpp +++ b/examples/noise/noise.cpp @@ -42,6 +42,7 @@ #include #include "../timing.h" #include "noise_ispc.h" +#include using namespace ispc; extern void noise_serial(float x0, float y0, float x1, float y1, @@ -65,14 +66,27 @@ writePPM(float *buf, int width, int height, const char *fn) { } -int main() { - unsigned int width = 768 * 4; - unsigned int height = 768 * 4; +int main(int argc, char *argv[]) { + static unsigned int test_iterations[] = {3, 1}; + unsigned int width = 768; + unsigned int height = 768; float x0 = -10; float x1 = 10; float y0 = -10; float y1 = 10; + if (argc > 1) { + if (strncmp(argv[1], "--scale=", 8) == 0) { + float scale = atof(argv[1] + 8); + width *= scale; + height *= scale; + } + } + if ((argc == 3) || (argc == 4)) { + for (int i = 0; i < 2; i++) { + test_iterations[i] = atoi(argv[argc - 2 + i]); + } + } float *buf = new float[width*height]; // @@ -80,10 +94,11 @@ int main() { // time of three runs. // double minISPC = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[0]; ++i) { reset_and_start_timer(); noise_ispc(x0, y0, x1, y1, width, height, buf); double dt = get_elapsed_mcycles(); + printf("@time of ISPC run:\t\t\t[%.3f] million cycles\n", dt); minISPC = std::min(minISPC, dt); } @@ -99,10 +114,11 @@ int main() { // minimum time. // double minSerial = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[1]; ++i) { reset_and_start_timer(); noise_serial(x0, y0, x1, y1, width, height, buf); double dt = get_elapsed_mcycles(); + printf("@time of serial run:\t\t\t[%.3f] million cycles\n", dt); minSerial = std::min(minSerial, dt); } diff --git a/examples/rt/rt.cpp b/examples/rt/rt.cpp index 48bcc423..8f61656a 100644 --- a/examples/rt/rt.cpp +++ b/examples/rt/rt.cpp @@ -96,27 +96,27 @@ static void writeImage(int *idImage, float *depthImage, int width, int height, static void usage() { - fprintf(stderr, "rt [--scale=] \n"); + fprintf(stderr, "rt [--scale=] [ispc iterations] [tasks iterations] [serial iterations]\n"); exit(1); } int main(int argc, char *argv[]) { + static unsigned int test_iterations[] = {3, 7, 1}; float scale = 1.f; const char *filename = NULL; - for (int i = 1; i < argc; ++i) { - if (strncmp(argv[i], "--scale=", 8) == 0) { - scale = atof(argv[i] + 8); - if (scale == 0.f) - usage(); + if (argc < 2) usage(); + filename = argv[1]; + if (argc > 2) { + if (strncmp(argv[2], "--scale=", 8) == 0) { + scale = atof(argv[2] + 8); + } + } + if ((argc == 6) || (argc == 5)) { + for (int i = 0; i < 3; i++) { + test_iterations[i] = atoi(argv[argc - 3 + i]); } - else if (filename != NULL) - usage(); - else - filename = argv[i]; } - if (filename == NULL) - usage(); #define READ(var, n) \ if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) { \ @@ -211,11 +211,12 @@ int main(int argc, char *argv[]) { // Run 3 iterations with ispc + 1 core, record the minimum time // double minTimeISPC = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[0]; ++i) { reset_and_start_timer(); raytrace_ispc(width, height, baseWidth, baseHeight, raster2camera, camera2world, image, id, nodes, triangles); double dt = get_elapsed_mcycles(); + printf("@time of ISPC run:\t\t\t[%.3f] million cycles\n", dt); minTimeISPC = std::min(dt, minTimeISPC); } printf("[rt ispc, 1 core]:\t\t[%.3f] million cycles for %d x %d image\n", @@ -230,11 +231,12 @@ int main(int argc, char *argv[]) { // Run 3 iterations with ispc + 1 core, record the minimum time // double minTimeISPCtasks = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[1]; ++i) { reset_and_start_timer(); raytrace_ispc_tasks(width, height, baseWidth, baseHeight, raster2camera, camera2world, image, id, nodes, triangles); double dt = get_elapsed_mcycles(); + printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", dt); minTimeISPCtasks = std::min(dt, minTimeISPCtasks); } printf("[rt ispc + tasks]:\t\t[%.3f] million cycles for %d x %d image\n", @@ -250,11 +252,12 @@ int main(int argc, char *argv[]) { // minimum time. // double minTimeSerial = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[2]; ++i) { reset_and_start_timer(); raytrace_serial(width, height, baseWidth, baseHeight, raster2camera, camera2world, image, id, nodes, triangles); double dt = get_elapsed_mcycles(); + printf("@time of serial run:\t\t\t[%.3f] million cycles\n", dt); minTimeSerial = std::min(dt, minTimeSerial); } printf("[rt serial]:\t\t\t[%.3f] million cycles for %d x %d image\n", diff --git a/examples/stencil/stencil.cpp b/examples/stencil/stencil.cpp index 9cd12674..33abc85c 100644 --- a/examples/stencil/stencil.cpp +++ b/examples/stencil/stencil.cpp @@ -40,6 +40,7 @@ #include #include +#include #include #include "../timing.h" #include "stencil_ispc.h" @@ -66,9 +67,25 @@ void InitData(int Nx, int Ny, int Nz, float *A[2], float *vsq) { } -int main() { - int Nx = 256 * 2, Ny = 256 * 2, Nz = 256 * 2; +int main(int argc, char *argv[]) { + static unsigned int test_iterations[] = {3, 3, 3};//the last two numbers must be equal here + int Nx = 256, Ny = 256, Nz = 256; int width = 4; + + if (argc > 1) { + if (strncmp(argv[1], "--scale=", 8) == 0) { + float scale = atof(argv[1] + 8); + Nx *= scale; + Ny *= scale; + Nz *= scale; + } + } + if ((argc == 4) || (argc == 5)) { + for (int i = 0; i < 3; i++) { + test_iterations[i] = atoi(argv[argc - 3 + i]); + } + } + float *Aserial[2], *Aispc[2]; Aserial[0] = new float [Nx * Ny * Nz]; Aserial[1] = new float [Nx * Ny * Nz]; @@ -79,18 +96,18 @@ int main() { float coeff[4] = { 0.5, -.25, .125, -.0625 }; InitData(Nx, Ny, Nz, Aispc, vsq); - // // Compute the image using the ispc implementation on one core; report // the minimum time of three runs. // double minTimeISPC = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[0]; ++i) { reset_and_start_timer(); loop_stencil_ispc(0, 6, width, Nx - width, width, Ny - width, width, Nz - width, Nx, Ny, Nz, coeff, vsq, Aispc[0], Aispc[1]); double dt = get_elapsed_mcycles(); + printf("@time of ISPC run:\t\t\t[%.3f] million cycles\n", dt); minTimeISPC = std::min(minTimeISPC, dt); } @@ -103,12 +120,13 @@ int main() { // the minimum time of three runs. // double minTimeISPCTasks = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[1]; ++i) { reset_and_start_timer(); loop_stencil_ispc_tasks(0, 6, width, Nx - width, width, Ny - width, width, Nz - width, Nx, Ny, Nz, coeff, vsq, Aispc[0], Aispc[1]); double dt = get_elapsed_mcycles(); + printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", dt); minTimeISPCTasks = std::min(minTimeISPCTasks, dt); } @@ -121,12 +139,13 @@ int main() { // minimum time. // double minTimeSerial = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[2]; ++i) { reset_and_start_timer(); loop_stencil_serial(0, 6, width, Nx-width, width, Ny - width, width, Nz - width, Nx, Ny, Nz, coeff, vsq, Aserial[0], Aserial[1]); double dt = get_elapsed_mcycles(); + printf("@time of serial run:\t\t\t[%.3f] million cycles\n", dt); minTimeSerial = std::min(minTimeSerial, dt); } diff --git a/examples/volume_rendering/volume.cpp b/examples/volume_rendering/volume.cpp index 458cd407..b6eda986 100644 --- a/examples/volume_rendering/volume.cpp +++ b/examples/volume_rendering/volume.cpp @@ -135,10 +135,16 @@ loadVolume(const char *fn, int n[3]) { int main(int argc, char *argv[]) { - if (argc != 3) { - fprintf(stderr, "usage: volume \n"); + static unsigned int test_iterations[] = {3, 7, 1}; + if (argc < 3) { + fprintf(stderr, "usage: volume [ispc iterations] [tasks iterations] [serial iterations]\n"); return 1; } + if (argc == 6) { + for (int i = 0; i < 3; i++) { + test_iterations[i] = atoi(argv[3 + i]); + } + } // // Load viewing data and the volume density data @@ -156,11 +162,12 @@ int main(int argc, char *argv[]) { // time of three runs. // double minISPC = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[0]; ++i) { reset_and_start_timer(); volume_ispc(density, n, raster2camera, camera2world, width, height, image); double dt = get_elapsed_mcycles(); + printf("@time of ISPC run:\t\t\t[%.3f] million cycles\n", dt); minISPC = std::min(minISPC, dt); } @@ -176,11 +183,12 @@ int main(int argc, char *argv[]) { // tasks; report the minimum time of three runs. // double minISPCtasks = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[1]; ++i) { reset_and_start_timer(); volume_ispc_tasks(density, n, raster2camera, camera2world, width, height, image); double dt = get_elapsed_mcycles(); + printf("@time of ISPC + TASKS run:\t\t\t[%.3f] million cycles\n", dt); minISPCtasks = std::min(minISPCtasks, dt); } @@ -196,11 +204,12 @@ int main(int argc, char *argv[]) { // minimum time. // double minSerial = 1e30; - for (int i = 0; i < 3; ++i) { + for (int i = 0; i < test_iterations[2]; ++i) { reset_and_start_timer(); volume_serial(density, n, raster2camera, camera2world, width, height, image); double dt = get_elapsed_mcycles(); + printf("@time of serial run:\t\t\t[%.3f] million cycles\n", dt); minSerial = std::min(minSerial, dt); } diff --git a/perf.ini b/perf.ini index b44a2853..eea017de 100755 --- a/perf.ini +++ b/perf.ini @@ -8,26 +8,29 @@ % #*** % [% comment] %**************************************************************************************************** +% All parameters of iteration number must be at the end of command string. Now all of the, are default (3 7 1). AOBench aobench -3 2048 2048 +% --scale= from parameters +2048 2048 #*** Deferred Shading deferred +% --scale= from data and third parameter data/pp1280x720.bin #*** Mandelbrot Set mandelbrot - +--scale=1.0 #*** Mandelbrot Set mandelbrot_tasks - +--scale=8.0 ^ #*** Perlin Noise Function noise - +--scale=4.0 #*** Binomial Options options @@ -45,10 +48,11 @@ sponza --scale=6.0 #*** 3D Stencil stencil - +--scale=2.0 #*** Volume Rendering volume_rendering +% --scale= from data camera.dat density_highres.vol #*** Sort diff --git a/perf.py b/perf.py index d1134990..65895335 100755 --- a/perf.py +++ b/perf.py @@ -99,16 +99,19 @@ def analyse_test(c1, c2, test, b_serial, perf_temp_n): j+=1 if "million cycles" in line: if j == c1: - line = line.replace("]","[") - line = line.split("[") - number = float(line[3]) - if "tasks" in line[1]: - absolute_tasks.append(number) + if line[0] == '@': + print_debug(line, True, perf_log) else: - if "ispc" in line[1]: - absolute_ispc.append(number) - if "serial" in line[1]: - serial.append(number) + line = line.replace("]","[") + line = line.split("[") + number = float(line[3]) + if "tasks" in line[1]: + absolute_tasks.append(number) + else: + if "ispc" in line[1]: + absolute_ispc.append(number) + if "serial" in line[1]: + serial.append(number) if len(ispc) != 0: if len(tasks) != 0: From be21d190e2d35767557382088a3fc3f5547e800a Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 10 Dec 2013 15:26:30 +0400 Subject: [PATCH 14/40] Update of 3.4 branch to rc2 --- alloy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alloy.py b/alloy.py index 657e67bf..83296b46 100755 --- a/alloy.py +++ b/alloy.py @@ -89,7 +89,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, if version_LLVM == "trunk": SVN_PATH="trunk" if version_LLVM == "3.4": - SVN_PATH="tags/RELEASE_34/rc1" + SVN_PATH="tags/RELEASE_34/rc2" version_LLVM = "3_4" if version_LLVM == "3.3": SVN_PATH="tags/RELEASE_33/final" From e618b1e14bed17d95d576847dbc4c383a7d3f4dd Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 10 Dec 2013 16:11:48 +0400 Subject: [PATCH 15/40] Removing patch witch already presents in 3.4/rc2 --- .../3_4_r195476_r195779_i16_sext.patch | 57 ------------------- 1 file changed, 57 deletions(-) delete mode 100644 llvm_patches/3_4_r195476_r195779_i16_sext.patch diff --git a/llvm_patches/3_4_r195476_r195779_i16_sext.patch b/llvm_patches/3_4_r195476_r195779_i16_sext.patch deleted file mode 100644 index 4e2c0f6b..00000000 --- a/llvm_patches/3_4_r195476_r195779_i16_sext.patch +++ /dev/null @@ -1,57 +0,0 @@ -Two stability patches affecting sse4-i16x8 and sse4-i8x16 targets. See PR18014 and PR18054 for more details. - -Index: lib/Target/X86/X86ISelLowering.cpp -=================================================================== ---- lib/Target/X86/X86ISelLowering.cpp (revision 195863) -+++ lib/Target/X86/X86ISelLowering.cpp (working copy) -@@ -13120,19 +13120,27 @@ - // fall through - case MVT::v4i32: - case MVT::v8i16: { -- // (sext (vzext x)) -> (vsext x) - SDValue Op0 = Op.getOperand(0); - SDValue Op00 = Op0.getOperand(0); - SDValue Tmp1; - // Hopefully, this VECTOR_SHUFFLE is just a VZEXT. - if (Op0.getOpcode() == ISD::BITCAST && -- Op00.getOpcode() == ISD::VECTOR_SHUFFLE) -+ Op00.getOpcode() == ISD::VECTOR_SHUFFLE) { -+ // (sext (vzext x)) -> (vsext x) - Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG); -- if (Tmp1.getNode()) { -- SDValue Tmp1Op0 = Tmp1.getOperand(0); -- assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && -- "This optimization is invalid without a VZEXT."); -- return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); -+ if (Tmp1.getNode()) { -+ EVT ExtraEltVT = ExtraVT.getVectorElementType(); -+ // This folding is only valid when the in-reg type is a vector of i8, -+ // i16, or i32. -+ if (ExtraEltVT == MVT::i8 || ExtraEltVT == MVT::i16 || -+ ExtraEltVT == MVT::i32) { -+ SDValue Tmp1Op0 = Tmp1.getOperand(0); -+ assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT && -+ "This optimization is invalid without a VZEXT."); -+ return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0)); -+ } -+ Op0 = Tmp1; -+ } - } - - // If the above didn't work, then just use Shift-Left + Shift-Right. -@@ -17007,6 +17015,15 @@ - if (BitWidth == 1) - return SDValue(); - -+ // Check all uses of that condition operand to check whether it will be -+ // consumed by non-BLEND instructions, which may depend on all bits are set -+ // properly. -+ for (SDNode::use_iterator I = Cond->use_begin(), -+ E = Cond->use_end(); I != E; ++I) -+ if (I->getOpcode() != ISD::VSELECT) -+ // TODO: Add other opcodes eventually lowered into BLEND. -+ return SDValue(); -+ - assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size"); - APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1); - From b19937c4dcb408d246d49ee897fb935674d94998 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Thu, 12 Dec 2013 19:25:02 +0400 Subject: [PATCH 16/40] deleting isPrimitiveType() --- cbackend.cpp | 6 +++--- ispc.cpp | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cbackend.cpp b/cbackend.cpp index 40f87074..3db2d504 100644 --- a/cbackend.cpp +++ b/cbackend.cpp @@ -660,7 +660,7 @@ void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out, llvm::raw_ostream & CWriter::printSimpleType(llvm::raw_ostream &Out, llvm::Type *Ty, bool isSigned, const std::string &NameSoFar) { - assert((Ty->isPrimitiveType() || Ty->isIntegerTy() || Ty->isVectorTy()) && + assert((Ty->isFloatingPointTy() || Ty->isX86_MMXTy() || Ty->isIntegerTy() || Ty->isVectorTy() || Ty->isVoidTy()) && "Invalid type for printSimpleType"); switch (Ty->getTypeID()) { case llvm::Type::VoidTyID: return Out << "void " << NameSoFar; @@ -756,7 +756,7 @@ llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty, #endif ) { - if (Ty->isPrimitiveType() || Ty->isIntegerTy() || Ty->isVectorTy()) { + if (Ty->isFloatingPointTy() || Ty->isX86_MMXTy() || Ty->isIntegerTy() || Ty->isVectorTy() || Ty->isVoidTy()) { printSimpleType(Out, Ty, isSigned, NameSoFar); return Out; } @@ -2737,7 +2737,7 @@ void CWriter::printModuleTypes() { void CWriter::printContainedStructs(llvm::Type *Ty, llvm::SmallPtrSet &Printed) { // Don't walk through pointers. - if (Ty->isPointerTy() || Ty->isPrimitiveType() || Ty->isIntegerTy()) + if (!(Ty->isStructTy() || Ty->isArrayTy())) return; // Print all contained types first. diff --git a/ispc.cpp b/ispc.cpp index b1790dc3..ed326b14 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -944,7 +944,8 @@ Target::GetISATargetString() const { static bool lGenericTypeLayoutIndeterminate(llvm::Type *type) { - if (type->isPrimitiveType() || type->isIntegerTy()) + if (type->isFloatingPointTy() || type->isX86_MMXTy() || type->isVoidTy() || + type->isIntegerTy() || type->isLabelTy() || type->isMetadataTy()) return false; if (type == LLVMTypes::BoolVectorType || From c06ec92d0d79acaa398a9c109baa52525b22cd1d Mon Sep 17 00:00:00 2001 From: evghenii Date: Fri, 13 Dec 2013 11:49:11 +0100 Subject: [PATCH 17/40] added commas, added multi-dimensional tasking to mandelbrot_tasks & removed mandelbrot_task3d. Also adjusted documentaiton a bit --- builtins/util.m4 | 2 +- docs/ispc.rst | 3 +- examples/mandelbrot_tasks/Makefile | 2 +- .../mandelbrot_tasks/mandelbrot_tasks.cpp | 3 +- .../mandelbrot_tasks/mandelbrot_tasks.ispc | 29 ++- examples/mandelbrot_tasks3d/.gitignore | 2 - examples/mandelbrot_tasks3d/Makefile | 8 - .../mandelbrot_tasks.vcxproj | 180 ------------------ .../mandelbrot_tasks3d/mandelbrot_tasks3d.cpp | 146 -------------- .../mandelbrot_tasks3d.ispc | 99 ---------- .../mandelbrot_tasks_serial.cpp | 68 ------- examples/tasksys.cpp | 2 +- test_static.cpp | 2 +- 13 files changed, 28 insertions(+), 518 deletions(-) delete mode 100644 examples/mandelbrot_tasks3d/.gitignore delete mode 100644 examples/mandelbrot_tasks3d/Makefile delete mode 100644 examples/mandelbrot_tasks3d/mandelbrot_tasks.vcxproj delete mode 100644 examples/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp delete mode 100644 examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc delete mode 100644 examples/mandelbrot_tasks3d/mandelbrot_tasks_serial.cpp diff --git a/builtins/util.m4 b/builtins/util.m4 index c90e8adc..1580dc08 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1813,7 +1813,7 @@ define(`stdlib_core', ` declare i32 @__fast_masked_vload() declare i8* @ISPCAlloc(i8**, i64, i32) nounwind -declare void @ISPCLaunch(i8**, i8*, i8*, i32,i32,i32) nounwind +declare void @ISPCLaunch(i8**, i8*, i8*, i32, i32, i32) nounwind declare void @ISPCSync(i8*) nounwind declare void @ISPCInstrument(i8*, i8*, i32, i64) nounwind diff --git a/docs/ispc.rst b/docs/ispc.rst index 3aab730b..04f478dc 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -3012,8 +3012,7 @@ Intel® Cilk(tm), Intel® Thread Building Blocks or another task system), and for tasks to use ``ispc`` for SPMD parallelism across the vector lanes as appropriate. Alternatively, ``ispc`` also has support for launching tasks from ``ispc`` code. The approach is similar to Intel® Cilk's task launch -feature. (See the ``examples/mandelbrot_tasks`` and -``examples/mandelbrot_tasks3d`` examples to see it used in a small example.) +feature. (Check the ``examples/mandelbrot_tasks`` example to see how it is used.) Any function that is launched as a task must be declared with the ``task`` qualifier: diff --git a/examples/mandelbrot_tasks/Makefile b/examples/mandelbrot_tasks/Makefile index 1a565ffd..a50631ab 100644 --- a/examples/mandelbrot_tasks/Makefile +++ b/examples/mandelbrot_tasks/Makefile @@ -2,7 +2,7 @@ EXAMPLE=mandelbrot_tasks CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp ISPC_SRC=mandelbrot_tasks.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 +ISPC_IA_TARGETS=sse2,sse4-x2,avx ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp index 698daf0f..802afde0 100644 --- a/examples/mandelbrot_tasks/mandelbrot_tasks.cpp +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.cpp @@ -38,7 +38,8 @@ #pragma warning (disable: 4305) #endif -#include +#include +#include #include #include #include "../timing.h" diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.ispc b/examples/mandelbrot_tasks/mandelbrot_tasks.ispc index 84d4ccd4..f9b0be4c 100644 --- a/examples/mandelbrot_tasks/mandelbrot_tasks.ispc +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.ispc @@ -57,21 +57,26 @@ task void mandelbrot_scanline(uniform float x0, uniform float dx, uniform float y0, uniform float dy, uniform int width, uniform int height, - uniform int span, + uniform int xspan, uniform int yspan, uniform int maxIterations, uniform int output[]) { - uniform int ystart = taskIndex * span; - uniform int yend = min((taskIndex+1) * span, (unsigned int)height); + const uniform int xstart = taskIndex0 * xspan; + const uniform int xend = min(xstart + xspan, width); - foreach (yi = ystart ... yend, xi = 0 ... width) { + const uniform int ystart = taskIndex1 * yspan; + const uniform int yend = min(ystart + yspan, height); + + + foreach (yi = ystart ... yend, xi = xstart ... xend) { float x = x0 + xi * dx; float y = y0 + yi * dy; int index = yi * width + xi; output[index] = mandel(x, y, maxIterations); } + } - +#if 1 export void mandelbrot_ispc(uniform float x0, uniform float y0, uniform float x1, uniform float y1, @@ -79,8 +84,16 @@ mandelbrot_ispc(uniform float x0, uniform float y0, uniform int maxIterations, uniform int output[]) { uniform float dx = (x1 - x0) / width; uniform float dy = (y1 - y0) / height; - uniform int span = 4; + const uniform int xspan = max(32, programCount*2); /* make sure it is big enough to avoid false-sharing */ + const uniform int yspan = 16; - launch[height/span] mandelbrot_scanline(x0, dx, y0, dy, width, height, span, - maxIterations, output); + +#if 1 + launch [width/xspan, height/yspan] +#else + launch [height/yspan][width/xspan] +#endif + mandelbrot_scanline(x0, dx, y0, dy, width, height, xspan, yspan, + maxIterations, output); } +#endif diff --git a/examples/mandelbrot_tasks3d/.gitignore b/examples/mandelbrot_tasks3d/.gitignore deleted file mode 100644 index c2471c27..00000000 --- a/examples/mandelbrot_tasks3d/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -mandelbrot -*.ppm diff --git a/examples/mandelbrot_tasks3d/Makefile b/examples/mandelbrot_tasks3d/Makefile deleted file mode 100644 index 3dd44d65..00000000 --- a/examples/mandelbrot_tasks3d/Makefile +++ /dev/null @@ -1,8 +0,0 @@ - -EXAMPLE=mandelbrot_tasks3d -CPP_SRC=mandelbrot_tasks3d.cpp mandelbrot_tasks_serial.cpp -ISPC_SRC=mandelbrot_tasks3d.ispc -ISPC_IA_TARGETS=avx,sse2,sse4 -ISPC_ARM_TARGETS=neon - -include ../common.mk diff --git a/examples/mandelbrot_tasks3d/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks3d/mandelbrot_tasks.vcxproj deleted file mode 100644 index 3a8fca79..00000000 --- a/examples/mandelbrot_tasks3d/mandelbrot_tasks.vcxproj +++ /dev/null @@ -1,180 +0,0 @@ - - - - - Debug - Win32 - - - Debug - x64 - - - Release - Win32 - - - Release - x64 - - - - {E80DA7D4-AB22-4648-A068-327307156BE6} - Win32Proj - mandelbrot_tasks - - - - Application - true - Unicode - - - Application - true - Unicode - - - Application - false - true - Unicode - - - Application - false - true - Unicode - - - - - - - - - - - - - - - - - - - true - $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot_tasks - - - true - $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot_tasks - - - false - $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot_tasks - - - false - $(ProjectDir)..\..;$(ExecutablePath) - mandelbrot_tasks - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - - - Level3 - Disabled - WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - true - Fast - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - $(TargetDir) - Fast - - - Console - true - true - true - - - - - - - - - - Document - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2 - - ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=sse2,sse4-x2,avx-x2 - - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - $(TargetDir)%(Filename).obj;$(TargetDir)%(Filename)_sse2.obj;$(TargetDir)%(Filename)_sse4.obj;$(TargetDir)%(Filename)_avx.obj;$(TargetDir)%(Filename)_ispc.h - - - - - - diff --git a/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp b/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp deleted file mode 100644 index 9cbb966a..00000000 --- a/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.cpp +++ /dev/null @@ -1,146 +0,0 @@ -/* - Copyright (c) 2010-2011, Intel Corporation - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#ifdef _MSC_VER -#define _CRT_SECURE_NO_WARNINGS -#define NOMINMAX -#pragma warning (disable: 4244) -#pragma warning (disable: 4305) -#endif - -#include -#include -#include -#include "../timing.h" -#include "mandelbrot_tasks3d_ispc.h" -using namespace ispc; - -extern void mandelbrot_serial(float x0, float y0, float x1, float y1, - int width, int height, int maxIterations, - int output[]); - -/* Write a PPM image file with the image of the Mandelbrot set */ -static void -writePPM(int *buf, int width, int height, const char *fn) { - FILE *fp = fopen(fn, "wb"); - fprintf(fp, "P6\n"); - fprintf(fp, "%d %d\n", width, height); - fprintf(fp, "255\n"); - for (int i = 0; i < width*height; ++i) { - // Map the iteration count to colors by just alternating between - // two greys. - char c = (buf[i] & 0x1) ? 240 : 20; - for (int j = 0; j < 3; ++j) - fputc(c, fp); - } - fclose(fp); - printf("Wrote image file %s\n", fn); -} - - -static void usage() { - fprintf(stderr, "usage: mandelbrot [--scale=]\n"); - exit(1); -} - -int main(int argc, char *argv[]) { - unsigned int width = 1536; - unsigned int height = 1024; - float x0 = -2; - float x1 = 1; - float y0 = -1; - float y1 = 1; - - if (argc == 1) - ; - else if (argc == 2) { - if (strncmp(argv[1], "--scale=", 8) == 0) { - float scale = atof(argv[1] + 8); - if (scale == 0.f) - usage(); - width *= scale; - height *= scale; - // round up to multiples of 16 - width = (width + 0xf) & ~0xf; - height = (height + 0xf) & ~0xf; - } - else - usage(); - } - else - usage(); - - int maxIterations = 512; - int *buf = new int[width*height]; - - // - // Compute the image using the ispc implementation; report the minimum - // time of three runs. - // - double minISPC = 1e30; - for (int i = 0; i < 3; ++i) { - // Clear out the buffer - for (unsigned int i = 0; i < width * height; ++i) - buf[i] = 0; - reset_and_start_timer(); - mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf); - double dt = get_elapsed_mcycles(); - minISPC = std::min(minISPC, dt); - } - - printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC); - writePPM(buf, width, height, "mandelbrot-ispc.ppm"); - - - // - // And run the serial implementation 3 times, again reporting the - // minimum time. - // - double minSerial = 1e30; - for (int i = 0; i < 3; ++i) { - // Clear out the buffer - for (unsigned int i = 0; i < width * height; ++i) - buf[i] = 0; - reset_and_start_timer(); - mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf); - double dt = get_elapsed_mcycles(); - minSerial = std::min(minSerial, dt); - } - - printf("[mandelbrot serial]:\t\t[%.3f] million cycles\n", minSerial); - writePPM(buf, width, height, "mandelbrot-serial.ppm"); - - printf("\t\t\t\t(%.2fx speedup from ISPC + tasks)\n", minSerial/minISPC); - - return 0; -} diff --git a/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc b/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc deleted file mode 100644 index 395bdca4..00000000 --- a/examples/mandelbrot_tasks3d/mandelbrot_tasks3d.ispc +++ /dev/null @@ -1,99 +0,0 @@ -/* - Copyright (c) 2010-2012, Intel Corporation - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -static inline int -mandel(float c_re, float c_im, int count) { - float z_re = c_re, z_im = c_im; - int i; - for (i = 0; i < count; ++i) { - if (z_re * z_re + z_im * z_im > 4.) - break; - - float new_re = z_re*z_re - z_im*z_im; - float new_im = 2.f * z_re * z_im; - unmasked { - z_re = c_re + new_re; - z_im = c_im + new_im; - } - } - - return i; -} - - -/* Task to compute the Mandelbrot iterations for a single scanline. - */ -task void -mandelbrot_scanline(uniform float x0, uniform float dx, - uniform float y0, uniform float dy, - uniform int width, uniform int height, - uniform int xspan, uniform int yspan, - uniform int maxIterations, uniform int output[]) { - const uniform int xstart = taskIndex0 * xspan; - const uniform int xend = min(xstart + xspan, width); - - const uniform int ystart = taskIndex1 * yspan; - const uniform int yend = min(ystart + yspan, height); - - - foreach (yi = ystart ... yend, xi = xstart ... xend) { - float x = x0 + xi * dx; - float y = y0 + yi * dy; - - int index = yi * width + xi; - output[index] = mandel(x, y, maxIterations); - } - -} - -#if 1 -export void -mandelbrot_ispc(uniform float x0, uniform float y0, - uniform float x1, uniform float y1, - uniform int width, uniform int height, - uniform int maxIterations, uniform int output[]) { - uniform float dx = (x1 - x0) / width; - uniform float dy = (y1 - y0) / height; - const uniform int xspan = 16; /* make sure it is big enough to avoid false-sharing */ - const uniform int yspan = 16; - - -#if 1 - launch [width/xspan, height/yspan] -#else - launch [height/yspan][width/xspan] -#endif - mandelbrot_scanline(x0, dx, y0, dy, width, height, xspan, yspan, - maxIterations, output); -} -#endif diff --git a/examples/mandelbrot_tasks3d/mandelbrot_tasks_serial.cpp b/examples/mandelbrot_tasks3d/mandelbrot_tasks_serial.cpp deleted file mode 100644 index a76fb5ca..00000000 --- a/examples/mandelbrot_tasks3d/mandelbrot_tasks_serial.cpp +++ /dev/null @@ -1,68 +0,0 @@ -/* - Copyright (c) 2010-2011, Intel Corporation - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - - -static int mandel(float c_re, float c_im, int count) { - float z_re = c_re, z_im = c_im; - int i; - for (i = 0; i < count; ++i) { - if (z_re * z_re + z_im * z_im > 4.f) - break; - - float new_re = z_re*z_re - z_im*z_im; - float new_im = 2.f * z_re * z_im; - z_re = c_re + new_re; - z_im = c_im + new_im; - } - - return i; -} - -void mandelbrot_serial(float x0, float y0, float x1, float y1, - int width, int height, int maxIterations, - int output[]) -{ - float dx = (x1 - x0) / width; - float dy = (y1 - y0) / height; - - for (int j = 0; j < height; j++) { - for (int i = 0; i < width; ++i) { - float x = x0 + i * dx; - float y = y0 + j * dy; - - int index = (j * width + i); - output[index] = mandel(x, y, maxIterations); - } - } -} - diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp index 6bc60129..b914068e 100644 --- a/examples/tasksys.cpp +++ b/examples/tasksys.cpp @@ -204,7 +204,7 @@ struct TaskInfo { // ispc expects these functions to have C linkage / not be mangled extern "C" { - void ISPCLaunch(void **handlePtr, void *f, void *data, int countx,int county, int countz); + void ISPCLaunch(void **handlePtr, void *f, void *data, int countx, int county, int countz); void *ISPCAlloc(void **handlePtr, int64_t size, int32_t alignment); void ISPCSync(void *handle); } diff --git a/test_static.cpp b/test_static.cpp index fceeb64e..27a5b136 100644 --- a/test_static.cpp +++ b/test_static.cpp @@ -69,7 +69,7 @@ extern "C" { void ISPCLaunch(void **handle, void *f, void *d, int count0, int count1, int count2) { *handle = (void *)0xdeadbeef; - typedef void (*TaskFuncType)(void *, int, int, int, int, int,int,int, int,int,int); + typedef void (*TaskFuncType)(void *, int, int, int, int, int, int, int, int, int, int); TaskFuncType func = (TaskFuncType)f; int count = count0*count1*count2, idx = 0; for (int k = 0; k < count2; ++k) From b506c92d21e68fd859f6835a255be43edcf43fd9 Mon Sep 17 00:00:00 2001 From: evghenii Date: Fri, 13 Dec 2013 13:55:58 +0100 Subject: [PATCH 18/40] restored-x2 --- examples/mandelbrot_tasks/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/mandelbrot_tasks/Makefile b/examples/mandelbrot_tasks/Makefile index a50631ab..1a565ffd 100644 --- a/examples/mandelbrot_tasks/Makefile +++ b/examples/mandelbrot_tasks/Makefile @@ -2,7 +2,7 @@ EXAMPLE=mandelbrot_tasks CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp ISPC_SRC=mandelbrot_tasks.ispc -ISPC_IA_TARGETS=sse2,sse4-x2,avx +ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2 ISPC_ARM_TARGETS=neon include ../common.mk From b5dc78b06ea8f06405d93ce81a675ea5032ed3db Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Mon, 16 Dec 2013 15:11:13 +0400 Subject: [PATCH 19/40] adding support of shl instruction in lExtractConstantOffset optimization --- opt.cpp | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/opt.cpp b/opt.cpp index 9059c746..c75d4225 100644 --- a/opt.cpp +++ b/opt.cpp @@ -1479,6 +1479,33 @@ lExtractConstantOffset(llvm::Value *vec, llvm::Value **constOffset, insertBefore); return; } + else if (bop->getOpcode() == llvm::Instruction::Shl) { + lExtractConstantOffset(op0, &c0, &v0, insertBefore); + lExtractConstantOffset(op1, &c1, &v1, insertBefore); + + // Given the product of constant and variable terms, we have: + // (c0 + v0) * (2^(c1 + v1)) = c0 * 2^c1 * 2^v1 + v0 * 2^c1 * 2^v1 + // We can optimize only if v1 == NULL. + if ((v1 != NULL) || (c0 == NULL) || (c1 == NULL)) { + *constOffset = NULL; + *variableOffset = vec; + } + else if (v0 == NULL) { + *constOffset = vec; + *variableOffset = NULL; + } + else { + *constOffset = + llvm::BinaryOperator::Create(llvm::Instruction::Shl, c0, c1, + LLVMGetName("shl", c0, c1), + insertBefore); + *variableOffset = + llvm::BinaryOperator::Create(llvm::Instruction::Shl, v0, c1, + LLVMGetName("shl", v0, c1), + insertBefore); + } + return; + } else if (bop->getOpcode() == llvm::Instruction::Mul) { lExtractConstantOffset(op0, &c0, &v0, insertBefore); lExtractConstantOffset(op1, &c1, &v1, insertBefore); From 37f3c0926cbb52b3cfedf178bb87eb7fc46fb8f4 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 17 Dec 2013 17:11:57 +0400 Subject: [PATCH 20/40] Adding missing 3.4 handing in alloy.py (for alloy-build) --- alloy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alloy.py b/alloy.py index 83296b46..0055842c 100755 --- a/alloy.py +++ b/alloy.py @@ -584,7 +584,7 @@ def Main(): if os.environ.get("SMTP_ISPC") == None: error("you have no SMTP_ISPC in your environment for option notify", 1) if options.only != "": - test_only_r = " 3.1 3.2 3.3 trunk current build stability performance x86 x86-64 -O0 -O2 native " + test_only_r = " 3.1 3.2 3.3 3.4 trunk current build stability performance x86 x86-64 -O0 -O2 native " test_only = options.only.split(" ") for iterator in test_only: if not (" " + iterator + " " in test_only_r): From 63ecf009ecbe6864511a64c6ddb8215aacae50a3 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Tue, 17 Dec 2013 15:06:29 +0100 Subject: [PATCH 21/40] fix compilation for Visual Studio --- expr.cpp | 11 ++++++++--- expr.h | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/expr.cpp b/expr.cpp index 60d9ce66..9f75ab08 100644 --- a/expr.cpp +++ b/expr.cpp @@ -3544,9 +3544,14 @@ FunctionCallExpr::FunctionCallExpr(Expr *f, ExprList *a, SourcePos p, : Expr(p), isLaunch(il) { func = f; args = a; - launchCountExpr[0] = lce[0]; - launchCountExpr[1] = lce[1]; - launchCountExpr[2] = lce[2]; + if (lce != NULL) + { + launchCountExpr[0] = lce[0]; + launchCountExpr[1] = lce[1]; + launchCountExpr[2] = lce[2]; + } + else + launchCountExpr[0] = launchCountExpr[1] = launchCountExpr[2] = NULL; } diff --git a/expr.h b/expr.h index 0d46191b..e4d7e07b 100644 --- a/expr.h +++ b/expr.h @@ -247,7 +247,7 @@ class FunctionCallExpr : public Expr { public: FunctionCallExpr(Expr *func, ExprList *args, SourcePos p, bool isLaunch = false, - Expr *launchCountExpr[3] = (Expr*[3]){NULL, NULL, NULL}); + Expr *launchCountExpr[3] = NULL); llvm::Value *GetValue(FunctionEmitContext *ctx) const; llvm::Value *GetLValue(FunctionEmitContext *ctx) const; From 59b989d243eb47c1135d464da2badabf5eee7ec2 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Tue, 17 Dec 2013 16:06:20 +0100 Subject: [PATCH 22/40] fix for --target=sse4-i18x16 --- tests/launch-8.ispc | 16 ++++++++-------- tests/launch-9.ispc | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tests/launch-8.ispc b/tests/launch-8.ispc index a0b976e4..eacba673 100644 --- a/tests/launch-8.ispc +++ b/tests/launch-8.ispc @@ -10,14 +10,14 @@ static uniform float array[N2][N1][N0]; task void x(const float f) { uniform int j; - assert(taskCount == N0*N1*N2); - assert(taskCount0 == N0); - assert(taskCount1 == N1); - assert(taskCount2 == N2); - assert(taskIndex == taskIndex0 + N0*(taskIndex1 + N1*taskIndex2)); - assert(taskIndex0 < N0); - assert(taskIndex1 < N1); - assert(taskIndex2 < N2); + assert(taskCount == (int32)N0*N1*N2); + assert(taskCount0 == (int32)N0); + assert(taskCount1 == (int32)N1); + assert(taskCount2 == (int32)N2); + assert(taskIndex == (int32)taskIndex0 + (int32)N0*(taskIndex1 +(int32) N1*taskIndex2)); + assert(taskIndex0 < (int32)N0); + assert(taskIndex1 < (int32)N1); + assert(taskIndex2 < (int32)N2); const uniform int i0 = taskIndex0; const uniform int i1 = taskIndex1; diff --git a/tests/launch-9.ispc b/tests/launch-9.ispc index 761b070c..1952e8e7 100644 --- a/tests/launch-9.ispc +++ b/tests/launch-9.ispc @@ -10,14 +10,14 @@ static uniform float array[N2][N1][N0]; task void x(const float f) { uniform int j; - assert(taskCount == N0*N1*N2); - assert(taskCount0 == N0); - assert(taskCount1 == N1); - assert(taskCount2 == N2); - assert(taskIndex == taskIndex0 + N0*(taskIndex1 + N1*taskIndex2)); - assert(taskIndex0 < N0); - assert(taskIndex1 < N1); - assert(taskIndex2 < N2); + assert(taskCount == (int32)N0*N1*N2); + assert(taskCount0 == (int32)N0); + assert(taskCount1 == (int32)N1); + assert(taskCount2 == (int32)N2); + assert(taskIndex == (int32)taskIndex0 + (int32)N0*(taskIndex1 +(int32) N1*taskIndex2)); + assert(taskIndex0 < (int32)N0); + assert(taskIndex1 < (int32)N1); + assert(taskIndex2 < (int32)N2); const uniform int i0 = taskIndex0; const uniform int i1 = taskIndex1; From 473f1cb4d2f196e20bed159aef7a041053173f80 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Tue, 10 Dec 2013 20:39:24 +0400 Subject: [PATCH 23/40] packed_store_active2 --- builtins.cpp | 1 + builtins/util.m4 | 45 ++++++++++++++++++++++++++++++++++++++ stdlib.ispc | 13 +++++++++++ tests/packed-store2-1.ispc | 16 ++++++++++++++ tests/packed-store2-2.ispc | 21 ++++++++++++++++++ tests/packed-store2-3.ispc | 17 ++++++++++++++ tests/packed-store2.ispc | 15 +++++++++++++ 7 files changed, 128 insertions(+) create mode 100644 tests/packed-store2-1.ispc create mode 100644 tests/packed-store2-2.ispc create mode 100644 tests/packed-store2-3.ispc create mode 100644 tests/packed-store2.ispc diff --git a/builtins.cpp b/builtins.cpp index 2afd92d9..6be41f13 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -488,6 +488,7 @@ lSetInternalFunctions(llvm::Module *module) { "__num_cores", "__packed_load_active", "__packed_store_active", + "__packed_store_active2", "__popcnt_int32", "__popcnt_int64", "__prefetch_read_uniform_1", diff --git a/builtins/util.m4 b/builtins/util.m4 index e1c9bf97..7ce4ab7f 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -3815,6 +3815,51 @@ loopend: done: ret i32 %nextoffset } + +define MASK @__packed_store_active2(i32 * %startptr, %vals, + %full_mask) nounwind alwaysinline { +entry: + %mask = call i64 @__movmsk( %full_mask) + %mask_known = call i1 @__is_compile_time_constant_mask( %full_mask) + br i1 %mask_known, label %known_mask, label %unknown_mask + +known_mask: + %allon = icmp eq i64 %mask, ALL_ON_MASK + br i1 %allon, label %all_on, label %unknown_mask + +all_on: + %vecptr = bitcast i32 *%startptr to * + store %vals, * %vecptr, align 4 + ret MASK WIDTH + +unknown_mask: + br label %loop + +loop: + %offset = phi MASK [ 0, %unknown_mask ], [ %ch_offset, %loop ] + %i = phi i32 [ 0, %unknown_mask ], [ %ch_i, %loop ] + %storeval = extractelement %vals, i32 %i + +;; Offset has value in range from 0 to WIDTH-1. So it does not matter if we +;; zero or sign extending it, while zero extend is free. Also do nothing for +;; i64 MASK, as we need i64 value. +ifelse(MASK, `i64', +` %storeptr = getelementptr i32 *%startptr, MASK %offset', +` %offset1 = zext MASK %offset to i64 + %storeptr = getelementptr i32 *%startptr, i64 %offset1') + store i32 %storeval, i32 *%storeptr + + %mull_mask = extractelement %full_mask, i32 %i + %ch_offset = sub MASK %offset, %mull_mask + + ; are we done yet? + %ch_i = add i32 %i, 1 + %test = icmp ne i32 %ch_i, WIDTH + br i1 %test, label %loop, label %done + +done: + ret MASK %ch_offset +} ') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/stdlib.ispc b/stdlib.ispc index 6768594b..3b17283d 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1209,6 +1209,13 @@ packed_store_active(uniform unsigned int a[], return __packed_store_active(a, vals, (UIntMaskType)__mask); } +static inline uniform int +packed_store_active2(uniform unsigned int a[], + unsigned int vals) { + return __packed_store_active2(a, vals, (UIntMaskType)__mask); +} + + static inline uniform int packed_load_active(uniform int a[], varying int * uniform vals) { return __packed_load_active(a, vals, (IntMaskType)__mask); @@ -1219,6 +1226,12 @@ packed_store_active(uniform int a[], int vals) { return __packed_store_active(a, vals, (IntMaskType)__mask); } +static inline uniform int +packed_store_active2(uniform int a[], int vals) { + return __packed_store_active2(a, vals, (IntMaskType)__mask); +} + + /////////////////////////////////////////////////////////////////////////// // System information diff --git a/tests/packed-store2-1.ispc b/tests/packed-store2-1.ispc new file mode 100644 index 00000000..0ca3230a --- /dev/null +++ b/tests/packed-store2-1.ispc @@ -0,0 +1,16 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + uniform int pack[2+programCount]; + for (uniform int i = 0; i < 2+programCount; ++i) + pack[i] = 0; + packed_store_active2(&pack[2], a); + RET[programIndex] = pack[programIndex]; +} + +export void result(uniform float RET[]) { + RET[programIndex] = programIndex-1; + RET[0] = RET[1] = 0; +} diff --git a/tests/packed-store2-2.ispc b/tests/packed-store2-2.ispc new file mode 100644 index 00000000..c29230ca --- /dev/null +++ b/tests/packed-store2-2.ispc @@ -0,0 +1,21 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + uniform int pack[2+programCount]; + uniform int number; + for (uniform int i = 0; i < 2+programCount; ++i) + pack[i] = 0; + if ((int)a & 1) + number = packed_store_active2(&pack[2], a); + pack[2+number] = 0; + RET[programIndex] = pack[programIndex]; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 0; + uniform int val = 1; + for (uniform int i = 2; i < 2+programCount/2; ++i, val += 2) + RET[i] = val; +} diff --git a/tests/packed-store2-3.ispc b/tests/packed-store2-3.ispc new file mode 100644 index 00000000..9192525e --- /dev/null +++ b/tests/packed-store2-3.ispc @@ -0,0 +1,17 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + uniform int pack[2+programCount]; + for (uniform int i = 0; i < 2+programCount; ++i) + pack[i] = 0; + uniform int count = 0; + if ((int)a & 1) + count += packed_store_active2(&pack[2], a); + RET[programIndex] = count; +} + +export void result(uniform float RET[]) { + RET[programIndex] = (programCount == 1) ? 1 : programCount/2; +} diff --git a/tests/packed-store2.ispc b/tests/packed-store2.ispc new file mode 100644 index 00000000..13973bc3 --- /dev/null +++ b/tests/packed-store2.ispc @@ -0,0 +1,15 @@ + +export uniform int width() { return programCount; } + +export void f_f(uniform float RET[], uniform float aFOO[]) { + float a = aFOO[programIndex]; + uniform unsigned int pack[programCount]; + for (uniform int i = 0; i < programCount; ++i) + pack[i] = 0; + packed_store_active2(pack, (unsigned int)a); + RET[programIndex] = pack[programIndex]; +} + +export void result(uniform float RET[]) { + RET[programIndex] = 1 + programIndex; +} From 5d1cda986947fa276745b63ac9e3dfc5c5dfb1ce Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 19 Dec 2013 13:05:46 +0400 Subject: [PATCH 24/40] Bumping LLVM 3.4 from rc2 to rc3 in alloy.py --- alloy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alloy.py b/alloy.py index 0055842c..01ce4453 100755 --- a/alloy.py +++ b/alloy.py @@ -89,7 +89,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, if version_LLVM == "trunk": SVN_PATH="trunk" if version_LLVM == "3.4": - SVN_PATH="tags/RELEASE_34/rc2" + SVN_PATH="tags/RELEASE_34/rc3" version_LLVM = "3_4" if version_LLVM == "3.3": SVN_PATH="tags/RELEASE_33/final" From bdeaf7e88cce683ea97ed53e44d446ca18fed233 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 19 Dec 2013 16:53:21 +0400 Subject: [PATCH 25/40] Documentation update for overloaded operators and packed_store_active2() --- docs/ispc.rst | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/docs/ispc.rst b/docs/ispc.rst index 7e76f433..22e7637b 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -48,6 +48,8 @@ Contents: + `Updating ISPC Programs For Changes In ISPC 1.1`_ + `Updating ISPC Programs For Changes In ISPC 1.2`_ + `Updating ISPC Programs For Changes In ISPC 1.3`_ + + `Updating ISPC Programs For Changes In ISPC 1.5.0`_ + + `Updating ISPC Programs For Changes In ISPC 1.5.1`_ * `Getting Started with ISPC`_ @@ -97,6 +99,9 @@ Contents: * `Short Vector Types`_ * `Array Types`_ * `Struct Types`_ + + + `Operators Overloading`_ + * `Structure of Array Types`_ + `Declarations and Initializers`_ @@ -279,6 +284,15 @@ Double precision floating point constants are floating point number with 31.4d-1, 1.d, 1.0d, 1d-2. Note that floating point number without suffix is treated as single precision constant. +Updating ISPC Programs For Changes In ISPC 1.5.1 +------------------------------------------------ + +This release adds support for `Operators Overloading`_, so a word ``operator`` +becomes a keyword and it potentially creates a conflict with existing user +function. Also a new library function packed_store_active2() was introduced, +which also may create a conflict with existing user functions. + + Getting Started with ISPC ========================= @@ -2122,7 +2136,35 @@ above code, the value of ``f[index]`` needs to be able to store a different value of ``Foo::a`` for each program instance. However, a ``varying Foo`` still has only a single ``a`` member, since ``a`` was declared with ``uniform`` variability in the declaration of ``Foo``. Therefore, the -indexing operation in the last line results in an error. +indexing operation in the last line results in an error. + + +Operators Overloading +--------------------- + +ISPC has limited support for overloaded operators for ``struct`` types. Only +binary operators are supported currently, namely they are: ``*, /, %, +, -, >> +and <<``. Operators overloading support is similar to the one in C++ language. +To overload an operator for ``struct S``, you need to declare and implement a +function using keyword ``operator``, which accepts two parameters of type +``struct S`` or ``struct S&`` and returns either of these types. For example: + +:: + + struct S { float re, im;}; + struct S operator*(struct S a, struct S b) { + struct S result; + result.re = a.re * b.re - a.im * b.im; + result.im = a.re * b.im + a.im * b.re; + return result; + } + + void foo(struct S a, struct S b) { + struct S mul = a*b; + print("a.re: %\na.im: %\n", a.re, a.im); + print("b.re: %\nb.im: %\n", b.re, b.im); + print("mul.re: %\nmul.im: %\n", mul.re, mul.im); + } Structure of Array Types @@ -4050,6 +4092,14 @@ They return the total number of values stored. unsigned int val) +There are also ``packed_store_active2()`` functions with exactly the same +signatures and the same semantic except that they may write one extra +element to the output array (but still returning the same value as +``packed_store_active()``). These functions suggest different branch free +implementation on most of supported targets, which usuarly (but not always) +performs better than ``packed_store_active()``. It's advised to test function +performance on user's scenarios on particular target hardware before using it. + As an example of how these functions can be used, the following code shows the use of ``packed_store_active()``. From ca6b3dfa1c513063807133a9ad19f737aeeb6ced Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 19 Dec 2013 16:53:41 +0400 Subject: [PATCH 26/40] Vim syntax support for operators --- contrib/ispc.vim | 2 ++ 1 file changed, 2 insertions(+) diff --git a/contrib/ispc.vim b/contrib/ispc.vim index f3cb413b..11808658 100644 --- a/contrib/ispc.vim +++ b/contrib/ispc.vim @@ -18,6 +18,7 @@ syn keyword ispcConditional cif syn keyword ispcRepeat cdo cfor cwhile syn keyword ispcBuiltin programCount programIndex syn keyword ispcType export uniform varying int8 int16 int32 int64 +syn keyword ispcOperator operator "double precision floating point number, with dot, optional exponent syn match cFloat display contained "\d\+\.\d*d[-+]\=\d*\>" @@ -33,6 +34,7 @@ HiLink ispcConditional Conditional HiLink ispcRepeat Repeat HiLink ispcBuiltin Statement HiLink ispcType Type +HiLink ispcOperator Operator delcommand HiLink let b:current_syntax = "ispc" From 15816eb07e6a8701fc27b078e411d191be972602 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Thu, 19 Dec 2013 14:13:55 +0400 Subject: [PATCH 27/40] adding __packed_store_active2 to generic targets --- builtins/target-generic-1.ll | 3 ++- builtins/target-generic-common.ll | 2 ++ examples/intrinsics/generic-16.h | 39 ++++++++++++++++++------------- examples/intrinsics/generic-32.h | 39 ++++++++++++++++++------------- examples/intrinsics/generic-64.h | 39 ++++++++++++++++++------------- examples/intrinsics/knc-i1x16.h | 20 +++++++++------- examples/intrinsics/knc-i1x8.h | 19 ++++++++------- examples/intrinsics/knc.h | 5 ++++ examples/intrinsics/sse4.h | 24 +++++++++++++++++++ 9 files changed, 125 insertions(+), 65 deletions(-) diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 910565dd..c43a12a7 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -3,6 +3,7 @@ define(`MASK',`i32') define(`WIDTH',`1') include(`util.m4') +rdrand_decls() ; Define some basics for a 1-wide target stdlib_core() packed_load_and_store() @@ -655,7 +656,7 @@ define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw declare <1 x float> @__svml_sind(<1 x float>) nounwind readnone alwaysinline declare <1 x float> @__svml_asind(<1 x float>) nounwind readnone alwaysinline declare <1 x float> @__svml_cosd(<1 x float>) nounwind readnone alwaysinline -declare void @__svml_sincosd(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline +declare void @__svml_sincosd(<1 x float>, <1 x double> *, <1 x double> *) nounwind readnone alwaysinline declare <1 x float> @__svml_tand(<1 x float>) nounwind readnone alwaysinline declare <1 x float> @__svml_atand(<1 x float>) nounwind readnone alwaysinline declare <1 x float> @__svml_atan2d(<1 x float>, <1 x float>) nounwind readnone alwaysinline diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 92b7a18e..2b2b21c9 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -371,6 +371,8 @@ declare i32 @__packed_load_active(i32 * nocapture, * nocapture, ) nounwind declare i32 @__packed_store_active(i32 * nocapture, %vals, ) nounwind +declare i32 @__packed_store_active2(i32 * nocapture, %vals, + ) nounwind ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index fa794276..0aa8a3f6 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1472,31 +1472,38 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec16_i32 val, return count; } + +static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec16_i32 val, + __vec16_i1 mask) { + int count = 0; + int32_t *ptr_ = ptr; + for (int i = 0; i < 16; ++i) { + *ptr = val.v[i]; + ptr += mask.v & 1; + mask.v = mask.v >> 1; + } + return ptr - ptr_; +} + + static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, __vec16_i32 *val, __vec16_i1 mask) { - int count = 0; - for (int i = 0; i < 16; ++i) { - if ((mask.v & (1 << i)) != 0) { - val->v[i] = *ptr++; - ++count; - } - } - return count; + return __packed_load_active((int32_t *)ptr, val, mask); } static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, __vec16_i32 val, __vec16_i1 mask) { - int count = 0; - for (int i = 0; i < 16; ++i) { - if ((mask.v & (1 << i)) != 0) { - *ptr++ = val.v[i]; - ++count; - } - } - return count; + return __packed_store_active((int32_t *)ptr, val, mask); +} + + +static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, + __vec16_i32 val, + __vec16_i1 mask) { + return __packed_store_active2((int32_t *)ptr, val, mask); } diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 531ed215..924b049d 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -1523,31 +1523,38 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec32_i32 val, return count; } + +static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec32_i32 val, + __vec32_i1 mask) { + int count = 0; + int32_t *ptr_ = ptr; + for (int i = 0; i < 32; ++i) { + *ptr = val.v[i]; + ptr += mask.v & 1; + mask.v = mask.v >> 1; + } + return ptr - ptr_; +} + + static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, __vec32_i32 *val, __vec32_i1 mask) { - int count = 0; - for (int i = 0; i < 32; ++i) { - if ((mask.v & (1 << i)) != 0) { - val->v[i] = *ptr++; - ++count; - } - } - return count; + return __packed_load_active((int32_t *)ptr, val, mask); } static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, __vec32_i32 val, __vec32_i1 mask) { - int count = 0; - for (int i = 0; i < 32; ++i) { - if ((mask.v & (1 << i)) != 0) { - *ptr++ = val.v[i]; - ++count; - } - } - return count; + return __packed_store_active((int32_t *)ptr, val, mask); +} + + +static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, + __vec32_i32 val, + __vec32_i1 mask) { + return __packed_store_active2((int32_t *)ptr, val, mask); } diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index bbeb007a..b1451c96 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -1656,31 +1656,38 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec64_i32 val, return count; } + +static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec64_i32 val, + __vec64_i1 mask) { + int count = 0; + int32_t *ptr_ = ptr; + for (int i = 0; i < 64; ++i) { + *ptr = val.v[i]; + ptr += mask.v & 1; + mask.v = mask.v >> 1; + } + return ptr - ptr_; +} + + static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, __vec64_i32 *val, __vec64_i1 mask) { - int count = 0; - for (int i = 0; i < 64; ++i) { - if ((mask.v & (1ull << i)) != 0) { - val->v[i] = *ptr++; - ++count; - } - } - return count; + return __packed_load_active((int32_t *) ptr, val, mask); } static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, __vec64_i32 val, __vec64_i1 mask) { - int count = 0; - for (int i = 0; i < 64; ++i) { - if ((mask.v & (1ull << i)) != 0) { - *ptr++ = val.v[i]; - ++count; - } - } - return count; + return __packed_store_active((int32_t *) ptr, val, mask); +} + + +static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, + __vec64_i32 val, + __vec64_i1 mask) { + return __packed_store_active2((int32_t *) ptr, val, mask); } diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index ef14d26e..141c47bb 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -2451,20 +2451,24 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, _ return _mm_countbits_32(uint32_t(mask)); } +static FORCEINLINE int32_t __packed_store_active2(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + return __packed_store_active(p, val, mask); +} + static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec16_i32 *val, __vec16_i1 mask) { - __vec16_i32 v = __load<64>(val); - v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - __store<64>(val, v); - return _mm_countbits_32(uint32_t(mask)); + return __packed_load_active((uint32_t *)p, val, mask); } static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec16_i32 val, __vec16_i1 mask) { - _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - return _mm_countbits_32(uint32_t(mask)); + return __packed_store_active((uint32_t *)p, val, mask); +} + +static FORCEINLINE int32_t __packed_store_active2(int32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + return __packed_store_active(p, val, mask); } /////////////////////////////////////////////////////////////////////////// diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h index d7696117..32f39c4a 100644 --- a/examples/intrinsics/knc-i1x8.h +++ b/examples/intrinsics/knc-i1x8.h @@ -2496,20 +2496,23 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec8_i32 val, _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); return _mm_countbits_32(uint32_t(0xFF & mask)); } +static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec4_i32 val, + __vec4_i1 mask) { + return __packed_store_active(ptr, val, mask); +} static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec8_i32 *val, __vec8_i1 mask) { - __vec8_i32 v = __load<64>(val); - v = _mm512_mask_extloadunpacklo_epi32(v, 0xFF & mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v = _mm512_mask_extloadunpackhi_epi32(v, 0xFF & mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - __store<64>(val, v); - return _mm_countbits_32(uint32_t(0xFF & mask)); + return __packed_load_active((uint32_t *)p, val, mask); } static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec8_i32 val, __vec8_i1 mask) { - _mm512_mask_extpackstorelo_epi32(p, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - return _mm_countbits_32(uint32_t(0xFF & mask)); + return __packed_store_active((uint32_t *)p, val, mask); } +static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec4_i32 val, + __vec4_i1 mask) { + return __packed_store_active(ptr, val, mask); +} + #endif /////////////////////////////////////////////////////////////////////////// diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index 8baef8cb..b0782b6e 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1878,6 +1878,11 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, return _mm_countbits_32(uint32_t(mask)); } +static FORCEINLINE int32_t __packed_store_active2(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + return __packed_store_active(p, val, mask); +} + /////////////////////////////////////////////////////////////////////////// // prefetch /////////////////////////////////////////////////////////////////////////// diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 919716be..5dd424d9 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -3798,6 +3798,25 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec4_i32 val, return count; } +static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec4_i32 val, + __vec4_i1 mask) { + int count = 0; + + ptr[count] = _mm_extract_epi32(val.v, 0); + count -= _mm_extract_ps(mask.v, 0); + + ptr[count] = _mm_extract_epi32(val.v, 1); + count -= _mm_extract_ps(mask.v, 1); + + ptr[count] = _mm_extract_epi32(val.v, 2); + count -= _mm_extract_ps(mask.v, 2); + + ptr[count] = _mm_extract_epi32(val.v, 3); + count -= _mm_extract_ps(mask.v, 3); + + return count; +} + static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, __vec4_i32 *val, __vec4_i1 mask) { return __packed_load_active((int32_t *)ptr, val, mask); @@ -3808,6 +3827,11 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, __vec4_i32 val, return __packed_store_active((int32_t *)ptr, val, mask); } +static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec4_i32 val, + __vec4_i1 mask) { + return __packed_store_active2((int32_t *)ptr, val, mask); +} + /////////////////////////////////////////////////////////////////////////// // aos/soa From f802164ccedbb3108affce5494893dbe12133407 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 19 Dec 2013 18:18:30 +0400 Subject: [PATCH 28/40] Fixing some typos in docs and adding operators to language description --- docs/ispc.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/ispc.rst b/docs/ispc.rst index 22e7637b..9464dcde 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -49,7 +49,7 @@ Contents: + `Updating ISPC Programs For Changes In ISPC 1.2`_ + `Updating ISPC Programs For Changes In ISPC 1.3`_ + `Updating ISPC Programs For Changes In ISPC 1.5.0`_ - + `Updating ISPC Programs For Changes In ISPC 1.5.1`_ + + `Updating ISPC Programs For Changes In ISPC 1.6.0`_ * `Getting Started with ISPC`_ @@ -284,7 +284,7 @@ Double precision floating point constants are floating point number with 31.4d-1, 1.d, 1.0d, 1d-2. Note that floating point number without suffix is treated as single precision constant. -Updating ISPC Programs For Changes In ISPC 1.5.1 +Updating ISPC Programs For Changes In ISPC 1.6.0 ------------------------------------------------ This release adds support for `Operators Overloading`_, so a word ``operator`` @@ -1339,6 +1339,7 @@ in C: * Function overloading by parameter type * Hexadecimal floating-point constants * Dynamic memory allocation with ``new`` and ``delete``. +* Limited support for overloaded operators (`Operators Overloading`_). ``ispc`` also adds a number of new features that aren't in C89, C99, or C++: @@ -4096,7 +4097,7 @@ There are also ``packed_store_active2()`` functions with exactly the same signatures and the same semantic except that they may write one extra element to the output array (but still returning the same value as ``packed_store_active()``). These functions suggest different branch free -implementation on most of supported targets, which usuarly (but not always) +implementation on most of supported targets, which usually (but not always) performs better than ``packed_store_active()``. It's advised to test function performance on user's scenarios on particular target hardware before using it. From 5d51f8c7a7380a7ea868dcd4bd31c6a7a3bd44ec Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 19 Dec 2013 21:05:47 +0400 Subject: [PATCH 29/40] Adding release notes for 1.6.0 --- docs/ReleaseNotes.txt | 44 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/docs/ReleaseNotes.txt b/docs/ReleaseNotes.txt index a8575ea0..b7d0bb17 100644 --- a/docs/ReleaseNotes.txt +++ b/docs/ReleaseNotes.txt @@ -1,3 +1,47 @@ +=== v1.6.0 === (19 December 2013) + +A major new version of ISPC with major improvements in performance and +stability. Linux and MacOS binaries are based on patched version of LLVM 3.3, +while Windows version is based on LLVM 3.4rc3. LLVM 3.4 significantly improves +stability on Win32 platform, so we've decided not to wait for official LLVM 3.4 +release. + +The list of the most significant changes is: + +* New avx1-i32x4 target was added. It may play well for you, if you are focused + on integer computations or FP unit in your hardware is 128 bit wide. + +* Support for calculations in double precision was extended with two new + targets avx1.1-i64x4 and avx2-i64x4. + +* Language support for overloaded operators was added. + +* New library shift() function was added, which is similar to rotate(), but is + non-circular. + +* The language was extended to accept 3 dimensional tasking - a syntactic sugar, + which may facilitate programming of some tasks. + +* Regression, which broke --opt=force-aligned-memory is fixed. + +If you are not using pre-built binaries, you may notice the following changes: + +* VS2012/VS2013 are supported. + +* alloy.py (with -b switch) can build LLVM for you on any platform now + (except MacOS 10.9, but we know about the problem and working on it). + This is a preferred way to build LLVM for ISPC, as all required patches for + better performance and stability will automatically apply. + +* LLVM 3.5 (current trunk) is supported. + +There are also multiple fixes for better performance and stability, most +notable are: + +* Fixed performance problem for x2 targets. + +* Fixed a problem with incorrect vzeroupper insertion on AVX target on Win32. + === v1.5.0 === (27 September 2013) A major new version of ISPC with several new targets and important bug fixes. From 7bf64bc4900564c3098b7acd5a90a170d35da626 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Thu, 19 Dec 2013 17:57:29 +0400 Subject: [PATCH 30/40] changes in examples (windows) --- examples/aobench/aobench.vcxproj | 18 ++++++++++++ examples/common.props | 28 +++++++++---------- examples/deferred/deferred_shading.vcxproj | 18 ++++++++++++ examples/mandelbrot/mandelbrot.vcxproj | 18 ++++++++++++ .../mandelbrot_tasks/mandelbrot_tasks.vcxproj | 18 ++++++++++++ examples/noise/noise.vcxproj | 18 ++++++++++++ examples/options/options.vcxproj | 18 ++++++++++++ examples/rt/rt.vcxproj | 18 ++++++++++++ examples/sort/sort.vcxproj | 18 ++++++++++++ examples/stencil/stencil.vcxproj | 18 ++++++++++++ examples/tasksys.cpp | 9 +++++- examples/volume_rendering/volume.vcxproj | 18 ++++++++++++ 12 files changed, 202 insertions(+), 15 deletions(-) diff --git a/examples/aobench/aobench.vcxproj b/examples/aobench/aobench.vcxproj index c46ee41a..298be2cb 100644 --- a/examples/aobench/aobench.vcxproj +++ b/examples/aobench/aobench.vcxproj @@ -1,5 +1,23 @@  + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB} Win32Proj diff --git a/examples/common.props b/examples/common.props index 7bf37005..3769330b 100644 --- a/examples/common.props +++ b/examples/common.props @@ -146,24 +146,24 @@ ispc $(default_targets) - $(TargetDir)$(ISPC_file).obj - $(Target_out);$(TargetDir)$(ISPC_file)_sse2.obj - $(Target_out);$(TargetDir)$(ISPC_file)_sse4.obj - $(Target_out);$(TargetDir)$(ISPC_file)_avx.obj - $(Target_out);$(TargetDir)$(ISPC_file)_avx11.obj - $(Target_out);$(TargetDir)$(ISPC_file)_avx2.obj + $(ISPC_file).obj + $(Target_out);$(ISPC_file)_sse2.obj + $(Target_out);$(ISPC_file)_sse4.obj + $(Target_out);$(ISPC_file)_avx.obj + $(Target_out);$(ISPC_file)_avx11.obj + $(Target_out);$(ISPC_file)_avx2.obj Document - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=$(Target_str) - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=$(Target_str) - $(Target_out);$(TargetDir)%(Filename)_ispc.h - $(Target_out);$(TargetDir)%(Filename)_ispc.h - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=$(Target_str) - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=$(Target_str) - $(Target_out);$(TargetDir)%(Filename)_ispc.h - $(Target_out);$(TargetDir)%(Filename)_ispc.h + $(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) + $(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str) + $(Target_out) + $(Target_out) + $(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) + $(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str) + $(Target_out) + $(Target_out) diff --git a/examples/deferred/deferred_shading.vcxproj b/examples/deferred/deferred_shading.vcxproj index cd361b26..974e870b 100755 --- a/examples/deferred/deferred_shading.vcxproj +++ b/examples/deferred/deferred_shading.vcxproj @@ -1,5 +1,23 @@ + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {87f53c53-957e-4e91-878a-bc27828fb9eb} Win32Proj diff --git a/examples/mandelbrot/mandelbrot.vcxproj b/examples/mandelbrot/mandelbrot.vcxproj index e7703ad0..7a5f6e03 100644 --- a/examples/mandelbrot/mandelbrot.vcxproj +++ b/examples/mandelbrot/mandelbrot.vcxproj @@ -1,5 +1,23 @@  + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1} Win32Proj diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj index f8b8cfcb..113fc4e8 100644 --- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj @@ -1,5 +1,23 @@ + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {E80DA7D4-AB22-4648-A068-327307156BE6} Win32Proj diff --git a/examples/noise/noise.vcxproj b/examples/noise/noise.vcxproj index 7adc57f3..ff3953ae 100644 --- a/examples/noise/noise.vcxproj +++ b/examples/noise/noise.vcxproj @@ -1,5 +1,23 @@ + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD} Win32Proj diff --git a/examples/options/options.vcxproj b/examples/options/options.vcxproj index af336aa1..d48ac8bc 100644 --- a/examples/options/options.vcxproj +++ b/examples/options/options.vcxproj @@ -1,5 +1,23 @@ + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE} Win32Proj diff --git a/examples/rt/rt.vcxproj b/examples/rt/rt.vcxproj index ea34de56..00b6dd3a 100644 --- a/examples/rt/rt.vcxproj +++ b/examples/rt/rt.vcxproj @@ -1,5 +1,23 @@ + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9} Win32Proj diff --git a/examples/sort/sort.vcxproj b/examples/sort/sort.vcxproj index 43f2b439..b0bdc63d 100644 --- a/examples/sort/sort.vcxproj +++ b/examples/sort/sort.vcxproj @@ -1,5 +1,23 @@  + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2} Win32Proj diff --git a/examples/stencil/stencil.vcxproj b/examples/stencil/stencil.vcxproj index b5f5bb22..fd8564aa 100644 --- a/examples/stencil/stencil.vcxproj +++ b/examples/stencil/stencil.vcxproj @@ -1,5 +1,23 @@ + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {2ef070a1-f62f-4e6a-944b-88d140945c3c} Win32Proj diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp index cfe0b17b..77269f9f 100644 --- a/examples/tasksys.cpp +++ b/examples/tasksys.cpp @@ -175,6 +175,9 @@ typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount, int taskCount0, int taskCount1, int taskCount2); // Small structure used to hold the data for each task +#ifdef _MSC_VER +__declspec(align(16)) +#endif struct TaskInfo { TaskFuncType func; void *data; @@ -200,7 +203,11 @@ struct TaskInfo { int taskCount1() const { return taskCount3d[1]; } int taskCount2() const { return taskCount3d[2]; } TaskInfo() { assert(sizeof(TaskInfo) % 32 == 0); } -} __attribute__((aligned(32))); +} +#ifndef _MSC_VER +__attribute__((aligned(32))); +#endif +; // ispc expects these functions to have C linkage / not be mangled extern "C" { diff --git a/examples/volume_rendering/volume.vcxproj b/examples/volume_rendering/volume.vcxproj index cc738a7e..a1fea5f1 100644 --- a/examples/volume_rendering/volume.vcxproj +++ b/examples/volume_rendering/volume.vcxproj @@ -1,5 +1,23 @@ + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {dee5733a-e93e-449d-9114-9bffcaeb4df9} Win32Proj From f936269a1e3898c0436b1589a093b956a26222af Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 19 Dec 2013 21:14:22 +0400 Subject: [PATCH 31/40] News update for 1.6.0 --- docs/news.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/news.rst b/docs/news.rst index 7d78a662..6a805e48 100644 --- a/docs/news.rst +++ b/docs/news.rst @@ -2,6 +2,16 @@ ispc News ========= +ispc 1.6.0 is Released +---------------------- + +A major update of ``ispc`` has been released. The main focus is on improved +performance and stability. Several new targets were added. There are also +a number of language and library extensions. Released binaries are based on +patched LLVM 3.3 on Linux and MacOS and LLVM 3.4rc3 on Windows. Please refer +to Release Notes for complete set of changes. + + ispc 1.5.0 is Released ---------------------- From 040605a83c3b92b0a3016c5185d86a37c0d0b35b Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 19 Dec 2013 21:17:42 +0400 Subject: [PATCH 32/40] Bumping up ispc version to 1.6.0 --- doxygen.cfg | 2 +- ispc.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doxygen.cfg b/doxygen.cfg index a0ad3176..a1a0b91b 100644 --- a/doxygen.cfg +++ b/doxygen.cfg @@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler" # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 1.5.1dev +PROJECT_NUMBER = 1.6.0 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. diff --git a/ispc.h b/ispc.h index 4b333861..b9b7db17 100644 --- a/ispc.h +++ b/ispc.h @@ -38,7 +38,7 @@ #ifndef ISPC_H #define ISPC_H -#define ISPC_VERSION "1.5.1dev" +#define ISPC_VERSION "1.6.0" #if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5) #error "Only LLVM 3.1, 3.2, 3.3, 3.4 and the 3.5 development branch are supported" From 799e476b484d090263260a3202a3473ce85a21c7 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 19 Dec 2013 22:29:02 +0400 Subject: [PATCH 33/40] Bumping ISPC version to 1.6.1dev --- doxygen.cfg | 2 +- ispc.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doxygen.cfg b/doxygen.cfg index a1a0b91b..9a8f88e5 100644 --- a/doxygen.cfg +++ b/doxygen.cfg @@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler" # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 1.6.0 +PROJECT_NUMBER = 1.6.1dev # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. diff --git a/ispc.h b/ispc.h index b9b7db17..88eb8353 100644 --- a/ispc.h +++ b/ispc.h @@ -38,7 +38,7 @@ #ifndef ISPC_H #define ISPC_H -#define ISPC_VERSION "1.6.0" +#define ISPC_VERSION "1.6.1dev" #if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5) #error "Only LLVM 3.1, 3.2, 3.3, 3.4 and the 3.5 development branch are supported" From 9f933b500b59192194e212333f416e90596742d4 Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Fri, 20 Dec 2013 16:45:27 -0500 Subject: [PATCH 34/40] Add missing __cast_sext(__vec16_i32,__vec16_i1) --- examples/intrinsics/knc.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index b0782b6e..0077ad88 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1260,6 +1260,13 @@ static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i3 return __vec16_i64(val.v, _mm512_setzero_epi32()); } +static FORCEINLINE __vec16_i32 __cast_sext(const __vec16_i32 &, const __vec16_i1 &val) +{ + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(-1); + return _mm512_mask_mov_epi32(ret, val, one); +} + static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val) { __vec16_i32 ret = _mm512_setzero_epi32(); From 949984db185a832acafad2565326c2bdf0def4de Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Mon, 23 Dec 2013 16:31:33 +0400 Subject: [PATCH 35/40] Don't do sext+and optimization for generic targets --- opt.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/opt.cpp b/opt.cpp index c75d4225..ff7ee410 100644 --- a/opt.cpp +++ b/opt.cpp @@ -5153,6 +5153,11 @@ FixBooleanSelectPass::runOnFunction(llvm::Function &F) { // LLVM 3.3 only #if defined(LLVM_3_3) + // Don't optimize generic targets. + if (g->target->getISA() == Target::GENERIC) { + return false; + } + for (llvm::Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { llvm::BasicBlock* bb = &*I; From 34a588511f53517d9ee9a559f1fef492f2c378f5 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 24 Dec 2013 18:38:25 +0400 Subject: [PATCH 36/40] Checkout and install with clang standard library headers on MacOS 10.9 --- alloy.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/alloy.py b/alloy.py index 01ce4453..ee7a1acf 100755 --- a/alloy.py +++ b/alloy.py @@ -129,8 +129,23 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, try_do_LLVM("load clang from http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " ", "svn co " + revision + " http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " clang", from_validation) + os.chdir("..") + if current_OS == "MacOS" and int(current_OS_version.split(".")[0]) >= 13: + # Starting with MacOS 10.9 Maverics, the system doesn't contain headers for standard C++ library and + # the default library is libc++, bit libstdc++. The headers are part of XCode now. But we are checking out + # headers as part of LLVM source tree, so they will be installed in clang location and clang will be able + # to find them. Though they may not match to the library installed in the system, but seems that this should + # not happen. + # Note, that we can also build a libc++ library, but it must be on system default location or should be passed + # to the linker explicitly (either through command line or environment variables). So we are not doing it + # currently to make the build process easier. + os.chdir("projects") + try_do_LLVM("load libcxx http://llvm.org/svn/llvm-project/libcxx/" + SVN_PATH + " ", + "svn co " + revision + " http://llvm.org/svn/llvm-project/libcxx/" + SVN_PATH + " libcxx", + from_validation) + os.chdir("..") if extra == True: - os.chdir("./clang/tools") + os.chdir("tools/clang/tools") try_do_LLVM("load extra clang extra tools ", "svn co " + revision + " http://llvm.org/svn/llvm-project/clang-tools-extra/" + SVN_PATH + " extra", from_validation) @@ -138,7 +153,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, try_do_LLVM("load extra clang compiler-rt ", "svn co " + revision + " http://llvm.org/svn/llvm-project/compiler-rt/" + SVN_PATH + " compiler-rt", from_validation) - os.chdir("../") + os.chdir("..") else: tar = tarball.split(" ") os.makedirs(LLVM_SRC) @@ -563,6 +578,8 @@ def validation_run(only, only_targets, reference_branch, number, notify, update, def Main(): global current_OS + global current_OS_version + current_OS_version = platform.release() if (platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system()) == True: current_OS = "Windows" else: From a69c4527a1b30a783b4fc90d617f56c74fd7d6af Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 24 Dec 2013 18:39:17 +0400 Subject: [PATCH 37/40] Bumping up 3.4 version from rc3 to final --- alloy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alloy.py b/alloy.py index ee7a1acf..57d8df1e 100755 --- a/alloy.py +++ b/alloy.py @@ -89,7 +89,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, if version_LLVM == "trunk": SVN_PATH="trunk" if version_LLVM == "3.4": - SVN_PATH="tags/RELEASE_34/rc3" + SVN_PATH="tags/RELEASE_34/final" version_LLVM = "3_4" if version_LLVM == "3.3": SVN_PATH="tags/RELEASE_33/final" From 5cfd773ec9f3d666fa0a7cfaf29e13507f3ae39e Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Wed, 20 Mar 2013 12:34:08 +0400 Subject: [PATCH 38/40] Adding Alias Analysis phases --- opt.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/opt.cpp b/opt.cpp index ff7ee410..3b51e5fb 100644 --- a/opt.cpp +++ b/opt.cpp @@ -514,6 +514,18 @@ Optimize(llvm::Module *module, int optLevel) { llvm::initializeInstrumentation(*registry); llvm::initializeTarget(*registry); + // Setup to use LLVM default AliasAnalysis + // Ideally, we want call: + // llvm::PassManagerBuilder pm_Builder; + // pm_Builder.OptLevel = optLevel; + // pm_Builder.addInitialAliasAnalysisPasses(optPM); + // but the addInitialAliasAnalysisPasses() is a private function + // so we explicitly enable them here. + // Need to keep sync with future LLVM change + // An alternative is to call populateFunctionPassManager() + optPM.add(llvm::createTypeBasedAliasAnalysisPass(), 190); + optPM.add(llvm::createBasicAliasAnalysisPass()); + optPM.add(llvm::createGlobalDCEPass(), 200); // Early optimizations to try to reduce the total amount of code to From e34f0cc2509ce8e84a022613e37d50d25349cf6d Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Thu, 26 Dec 2013 14:23:46 +0400 Subject: [PATCH 39/40] correction of checking tools in testing environment --- common.py | 3 +++ 1 file changed, 3 insertions(+) mode change 100644 => 100755 common.py diff --git a/common.py b/common.py old mode 100644 new mode 100755 index be3e9526..2a788722 --- a/common.py +++ b/common.py @@ -121,4 +121,7 @@ def check_tools(m): if int(t11[j])input_tools[t][0][j]: + break return ret From 4ef38e16153a09b1d2c130b9791bdfe86d773ec9 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Fri, 27 Dec 2013 18:52:23 +0400 Subject: [PATCH 40/40] Adding some optimization passes between two Alias Analysis passes --- opt.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/opt.cpp b/opt.cpp index 3b51e5fb..f70e522d 100644 --- a/opt.cpp +++ b/opt.cpp @@ -514,6 +514,8 @@ Optimize(llvm::Module *module, int optLevel) { llvm::initializeInstrumentation(*registry); llvm::initializeTarget(*registry); + optPM.add(llvm::createGlobalDCEPass(), 185); + // Setup to use LLVM default AliasAnalysis // Ideally, we want call: // llvm::PassManagerBuilder pm_Builder; @@ -525,12 +527,18 @@ Optimize(llvm::Module *module, int optLevel) { // An alternative is to call populateFunctionPassManager() optPM.add(llvm::createTypeBasedAliasAnalysisPass(), 190); optPM.add(llvm::createBasicAliasAnalysisPass()); - - optPM.add(llvm::createGlobalDCEPass(), 200); + optPM.add(llvm::createCFGSimplificationPass()); + // Here clang has an experimental pass SROAPass instead of + // ScalarReplAggregatesPass. We should add it in the future. + optPM.add(llvm::createScalarReplAggregatesPass()); + optPM.add(llvm::createEarlyCSEPass()); + optPM.add(llvm::createLowerExpectIntrinsicPass()); + optPM.add(llvm::createTypeBasedAliasAnalysisPass()); + optPM.add(llvm::createBasicAliasAnalysisPass()); // Early optimizations to try to reduce the total amount of code to // work with if we can - optPM.add(llvm::createReassociatePass()); + optPM.add(llvm::createReassociatePass(), 200); optPM.add(llvm::createConstantPropagationPass()); optPM.add(llvm::createDeadInstEliminationPass()); optPM.add(llvm::createCFGSimplificationPass());