MAJOR CHANGE --- STOP WITH THIS BRANCH

Evghenii
2014-01-06 13:51:02 +01:00
parent 77113fbffd
commit 546f9cb409
13 changed files with 710 additions and 1151 deletions


@@ -144,7 +144,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
type.cpp util.cpp
HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
opt.h stmt.h sym.h type.h util.h
TARGETS=nvptx64 avx2-i64x4 avx11-i64x4 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
TARGETS=nvptx avx2-i64x4 avx11-i64x4 avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \
generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
ifneq ($(ARM_ENABLED), 0)
@@ -254,15 +254,15 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
@echo Compiling $<
$(CXX) $(CXXFLAGS) -o $@ -c $<
objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/util_ptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
@echo Creating C++ source from builtins definition file $<
m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX $< | python bitcode2cpp.py $< > $@
objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/util_ptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
@echo Creating C++ source from builtins definition file $< \(32 bit version\)
m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=32 $< | python bitcode2cpp.py $< 32bit > $@
objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/util_ptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/util-nvptx.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
@echo Creating C++ source from builtins definition file $< \(64 bit version\)
m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=64 $< | python bitcode2cpp.py $< 64bit > $@
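For context, each objs/builtins-*.cpp rule above pipes the m4-expanded .ll file through bitcode2cpp.py, which wraps the compiled LLVM bitcode in a C++ byte array that gets linked into the compiler. The generated file is roughly of the following shape (a sketch; the symbol names are inferred from the EXPORT_MODULE(builtins_bitcode_nvptx_64bit) call in builtins.cpp below, and 0x42 0x43 0xC0 0xDE is the LLVM bitcode magic):

    // Approximate shape of a generated objs/builtins-nvptx-64bit.cpp (sketch).
    extern const unsigned char builtins_bitcode_nvptx_64bit[];
    extern int builtins_bitcode_nvptx_64bit_length;

    const unsigned char builtins_bitcode_nvptx_64bit[] = {
        0x42, 0x43, 0xc0, 0xde, /* ... remaining LLVM bitcode bytes ... */
    };
    int builtins_bitcode_nvptx_64bit_length = sizeof(builtins_bitcode_nvptx_64bit);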


@@ -693,9 +693,9 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
if (g->target->getISA() != Target::NEON32 &&
g->target->getISA() != Target::NEON16 &&
g->target->getISA() != Target::NEON8 &&
g->target->getISA() != Target::NVPTX64)
g->target->getISA() != Target::NVPTX)
#else
if (g->target->getISA() != Target::NVPTX64)
if (g->target->getISA() != Target::NVPTX)
#endif // !__arm__
{
Assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
@@ -858,14 +858,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
// Next, add the target's custom implementations of the various needed
// builtin functions (e.g. __masked_store_32(), etc).
switch (g->target->getISA()) {
case Target::NVPTX64:
case Target::NVPTX:
{
if (runtime32) {
fprintf(stderr, "W're sorry, but only 64bit targets are supported at this moment .. \n");
fprintf(stderr, "Unforetunatly 32bit targets are supported at the moment .. \n");
assert(0);
}
else {
EXPORT_MODULE(builtins_bitcode_nvptx64_64bit);
EXPORT_MODULE(builtins_bitcode_nvptx_64bit);
}
break;
};
@@ -1138,7 +1138,7 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
}
// define the 'programCount' builtin variable
if (!g->target->isPTX())
if (g->target->getISA() != Target::NVPTX)
{
lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable);
}
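lDefineConstantInt itself is not part of this diff; under the assumption that it creates an internal constant global plus a symbol-table entry, a minimal sketch using ispc's LLVMTypes/LLVMInt32 helpers:

    // Hedged sketch of what defining the 'programCount' builtin plausibly
    // amounts to; the real lDefineConstantInt lives elsewhere in builtins.cpp.
    static void lDefineConstantIntSketch(const char *name, int val, llvm::Module *module,
                                         SymbolTable *symbolTable) {
        llvm::GlobalVariable *gv =
            new llvm::GlobalVariable(*module, LLVMTypes::Int32Type, /* isConstant */ true,
                                     llvm::GlobalValue::InternalLinkage,
                                     LLVMInt32(val), name);
        Symbol *sym = new Symbol(name, SourcePos(), AtomicType::UniformInt32->GetAsConstType());
        sym->storagePtr = gv;            // loads of the symbol read the constant
        symbolTable->AddVariable(sym);
    }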


@@ -105,15 +105,9 @@ define i32 @__lanemask_lt_nvptx() nounwind readnone alwaysinline
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; tasking
define i8* @ISPCAlloc(i8**, i64, i32) nounwind alwaysinline
{
%ptr = inttoptr i64 1 to i8*
ret i8* %ptr
}
;; this call allocates the parameter buffer for a kernel launch
declare i64 @cudaGetParameterBuffer(i64, i64) nounwind
define i8* @ISPCGetParamBuffer(i8**, i64 %align, i64 %size) nounwind alwaysinline
define i8* @ISPCAlloc(i8**, i64 %size, i32 %align32) nounwind alwaysinline
{
entry:
%call = tail call i32 @__tid_x()
@@ -121,6 +115,7 @@ entry:
%sub = add nsw i32 %call1, -1
%and = and i32 %sub, %call
%cmp = icmp eq i32 %and, 0
%align = zext i32 %align32 to i64
br i1 %cmp, label %if.then, label %if.end
if.then:
@@ -224,7 +219,7 @@ define void @ISPCSync(i8*) nounwind alwaysinline
include(`util_ptx.m4')
include(`util-nvptx.m4')
stdlib_core()
packed_load_and_store()
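For context, cudaGetParameterBuffer declared above belongs to the CUDA dynamic-parallelism device runtime, which suggests ISPCAlloc/ISPCLaunch resolve against a device-side runtime along the following lines (a CUDA C++ sketch, not part of this commit; the cuda* signatures are the CUDA 5.x device-runtime ones, while the warp/block shape and everything else are assumptions; compile with nvcc -arch=sm_35 -rdc=true):

    // Hedged sketch of a device-side tasking runtime matching the generated calls.
    extern "C" __device__ void *cudaGetParameterBuffer(size_t align, size_t size);
    extern "C" __device__ cudaError_t cudaLaunchDevice(void *func, void *paramBuf,
                                                       dim3 gridDim, dim3 blockDim,
                                                       unsigned int sharedMem,
                                                       cudaStream_t stream);

    extern "C" __device__ void *ISPCAlloc(void **handle, long long size, int align) {
        // Mirror the IR above: only the first lane of the warp gets a buffer,
        // all other lanes receive NULL.
        if ((threadIdx.x & (warpSize - 1)) != 0)
            return NULL;
        return cudaGetParameterBuffer((size_t)align, (size_t)size);
    }

    extern "C" __device__ void ISPCLaunch(void **handle, void *func, void *paramBuf,
                                          int count0, int count1, int count2) {
        if (paramBuf != NULL)   // only the lane that obtained the buffer launches
            cudaLaunchDevice(func, paramBuf, dim3(count0, count1, count2),
                             dim3(32, 1, 1) /* one warp per task: an assumption */,
                             0, 0);
    }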

ctx.cpp

@@ -1410,7 +1410,7 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
llvm::Value *
FunctionEmitContext::ProgramIndexVector(bool is32bits) {
if (!g->target->isPTX()) //g->target->getISA() != Target::NVPTX64)
if (g->target->getISA() != Target::NVPTX)
{
llvm::SmallVector<llvm::Constant*, 16> array;
for (int i = 0; i < g->target->getVectorWidth() ; ++i) {
@@ -3540,7 +3540,7 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
std::vector<llvm::Value *> &argVals,
llvm::Value *launchCount[3]){
if (!g->target->isPTX())
if (g->target->getISA() != Target::NVPTX)
{
if (callee == NULL) {
AssertPos(currentPos, m->errorCount > 0);
@@ -3608,7 +3608,79 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
args.push_back(launchCount[2]);
return CallInst(flaunch, NULL, args, "");
}
else /* isPTX == true */
else /* NVPTX */
{
if (callee == NULL) {
AssertPos(currentPos, m->errorCount > 0);
return NULL;
}
launchedTasks = true;
AssertPos(currentPos, llvm::isa<llvm::Function>(callee));
std::vector<llvm::Type*> argTypes;
for (unsigned int i = 0; i < argVals.size(); i++)
argTypes.push_back(argVals[i]->getType());
llvm::Type *st = llvm::StructType::get(*g->ctx, argTypes);
llvm::StructType *argStructType = static_cast<llvm::StructType *>(st);
llvm::Value *structSize = g->target->SizeOf(argStructType, bblock);
if (structSize->getType() != LLVMTypes::Int64Type)
structSize = ZExtInst(structSize, LLVMTypes::Int64Type,
"struct_size_to_64");
const int align = 8;
llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
AssertPos(currentPos, falloc != NULL);
std::vector<llvm::Value *> allocArgs;
allocArgs.push_back(launchGroupHandlePtr);
allocArgs.push_back(structSize);
allocArgs.push_back(LLVMInt32(align));
llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr");
llvm::Value *voidi64 = PtrToIntInst(voidmem, "args_i64");
llvm::BasicBlock* if_true = CreateBasicBlock("if_true");
llvm::BasicBlock* if_false = CreateBasicBlock("if_false");
/* check if the pointer returned by ISPCAlloc is not NULL
* --------------
* this is a workaround for not checking the value of programIndex:
* ISPCAlloc returns a NULL pointer for every lane with programIndex > 0.
* Of course, if ISPCAlloc fails to get a parameter buffer, the pointer for
* programIndex = 0 will also be NULL, so this check must stay; the code
* should also be rewritten to make it less opaque.
*/
llvm::Value* cmp1 = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, voidi64, LLVMInt64(0), "cmp1");
BranchInst(if_true, if_false, cmp1);
/**********************/
bblock = if_true;
// label_if_then block:
llvm::Type *pt = llvm::PointerType::getUnqual(st);
llvm::Value *argmem = BitCastInst(voidmem, pt);
for (unsigned int i = 0; i < argVals.size(); ++i)
{
llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg");
// don't need to do masked store here, I think
StoreInst(argVals[i], ptr);
}
BranchInst(if_false);
/**********************/
bblock = if_false;
llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
AssertPos(currentPos, flaunch != NULL);
std::vector<llvm::Value *> args;
args.push_back(launchGroupHandlePtr);
args.push_back(fptr);
args.push_back(voidmem);
args.push_back(launchCount[0]);
args.push_back(launchCount[1]);
args.push_back(launchCount[2]);
llvm::Value *ret = CallInst(flaunch, NULL, args, "");
return ret;
}
#if 0
{
if (callee == NULL) {
AssertPos(currentPos, m->errorCount > 0);
@@ -3684,13 +3756,16 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
args.push_back(launchCount[2]);
return CallInst(flaunch, NULL, args, "");
}
#endif
}
void
FunctionEmitContext::SyncInst() {
if (!g->target->isPTX())
#if 0
if (g->target->getISA() != Target::NVPTX)
{
#endif
llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr);
llvm::Value *nullPtrValue =
llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
@@ -3714,6 +3789,7 @@ FunctionEmitContext::SyncInst() {
BranchInst(bPostSync);
SetCurrentBasicBlock(bPostSync);
#if 0
}
else
{
@@ -3726,6 +3802,7 @@ FunctionEmitContext::SyncInst() {
CallInst(fsync, NULL, launchGroupHandle, "");
StoreInst(nullPtrValue, launchGroupHandlePtr);
}
#endif
}
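Read end to end, the NVPTX branch of LaunchInst above emits IR equivalent to the following C++ (a hedged rendering, not code from the tree; ArgStruct stands for the per-call llvm::StructType built from the argument types, handle for launchGroupHandlePtr):

    // Equivalent of the emitted launch sequence, per the code above.
    void *args = ISPCAlloc(&handle, sizeof(ArgStruct), /* align */ 8);
    if (args != NULL) {                    // "if_true": the launching lane only
        ArgStruct *s = (ArgStruct *)args;
        // plain (unmasked) stores of each argVals[i] into the i-th field of *s
    }
    // "if_false": every lane falls through to the launch call
    ISPCLaunch(&handle, (void *)task_function, args, count0, count1, count2);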


@@ -531,7 +531,7 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
returnType = returnType->ResolveUnboundVariability(Variability::Varying);
bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
if (isTask && g->target->isPTX()) //getISA() == Target::NVPTX64)
if (isTask && g->target->getISA() == Target::NVPTX)
{
// ds->storageClass = SC_EXTERN_C;
ds->typeQualifiers |= TYPEQUAL_UNMASKED;
@@ -547,7 +547,6 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
"qualifiers");
return;
}
// if (!g->target->isPTX())
if (isExternC && isTask) {
Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" "
"qualifiers");


@@ -22,7 +22,7 @@ endif
#
ISPC=ispc
ISPC_FLAGS=-O3 --math-lib=default --target=nvptx64 --opt=fast-math
ISPC_FLAGS=-O3 --math-lib=default --target=nvptx --opt=fast-math
#
#
#
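The equivalent direct invocation after the rename, for reference (file names are illustrative; --emit-asm is ispc's existing flag for assembly output, and module.cpp below skips the ".s" suffix check on the NVPTX target, so a .ptx output name is accepted):

    ispc -O3 --math-lib=default --target=nvptx --opt=fast-math --emit-asm \
        kernel.ispc -o kernel.ptx -h kernel_ispc.h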

func.cpp

@@ -125,7 +125,7 @@ Function::Function(Symbol *s, Stmt *c) {
sym->parentFunction = this;
}
if (type->isTask) {
if (type->isTask && g->target->getISA() != Target::NVPTX) {
threadIndexSym = m->symbolTable->LookupVariable("threadIndex");
Assert(threadIndexSym);
threadCountSym = m->symbolTable->LookupVariable("threadCount");
@@ -237,12 +237,122 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
#endif
const FunctionType *type = CastType<FunctionType>(sym->type);
Assert(type != NULL);
if (type->isTask == true && g->target->getISA() != Target::NVPTX) {
// For tasks, there should always be three parameters: the
// pointer to the structure that holds all of the arguments, the
// thread index, and the thread count variables.
llvm::Function::arg_iterator argIter = function->arg_begin();
llvm::Value *structParamPtr = argIter++;
llvm::Value *threadIndex = argIter++;
llvm::Value *threadCount = argIter++;
llvm::Value *taskIndex = argIter++;
llvm::Value *taskCount = argIter++;
llvm::Value *taskIndex0 = argIter++;
llvm::Value *taskIndex1 = argIter++;
llvm::Value *taskIndex2 = argIter++;
llvm::Value *taskCount0 = argIter++;
llvm::Value *taskCount1 = argIter++;
llvm::Value *taskCount2 = argIter++;
// Copy the function parameter values from the structure into local
// storage
for (unsigned int i = 0; i < args.size(); ++i)
lCopyInTaskParameter(i, structParamPtr, args, ctx);
if (type->isUnmasked == false) {
// Copy in the mask as well.
int nArgs = (int)args.size();
// The mask is the last parameter in the argument structure
llvm::Value *ptr = ctx->AddElementOffset(structParamPtr, nArgs, NULL,
"task_struct_mask");
llvm::Value *ptrval = ctx->LoadInst(ptr, "mask");
ctx->SetFunctionMask(ptrval);
}
// Copy threadIndex and threadCount into stack-allocated storage so
// that their symbols point to something reasonable.
threadIndexSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadIndex");
ctx->StoreInst(threadIndex, threadIndexSym->storagePtr);
threadCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "threadCount");
ctx->StoreInst(threadCount, threadCountSym->storagePtr);
// Copy taskIndex and taskCount into stack-allocated storage so
// that their symbols point to something reasonable.
taskIndexSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex");
ctx->StoreInst(taskIndex, taskIndexSym->storagePtr);
taskCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount");
ctx->StoreInst(taskCount, taskCountSym->storagePtr);
taskIndexSym0->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex0");
ctx->StoreInst(taskIndex0, taskIndexSym0->storagePtr);
taskIndexSym1->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex1");
ctx->StoreInst(taskIndex1, taskIndexSym1->storagePtr);
taskIndexSym2->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex2");
ctx->StoreInst(taskIndex2, taskIndexSym2->storagePtr);
taskCountSym0->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount0");
ctx->StoreInst(taskCount0, taskCountSym0->storagePtr);
taskCountSym1->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount1");
ctx->StoreInst(taskCount1, taskCountSym1->storagePtr);
taskCountSym2->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount2");
ctx->StoreInst(taskCount2, taskCountSym2->storagePtr);
}
else {
// Regular, non-task function
llvm::Function::arg_iterator argIter = function->arg_begin();
for (unsigned int i = 0; i < args.size(); ++i, ++argIter) {
Symbol *sym = args[i];
if (sym == NULL)
// anonymous function parameter
continue;
argIter->setName(sym->name.c_str());
// Allocate stack storage for the parameter and emit code
// to store its value there.
sym->storagePtr = ctx->AllocaInst(argIter->getType(), sym->name.c_str());
ctx->StoreInst(argIter, sym->storagePtr);
ctx->EmitFunctionParameterDebugInfo(sym, i);
}
// If the number of actual function arguments is equal to the
// number of declared arguments in decl->functionParams, then we
// don't have a mask parameter, so set it to be all on. This
// happens for example with 'export'ed functions that the app
// calls.
if (argIter == function->arg_end()) {
Assert(type->isUnmasked || type->isExported);
ctx->SetFunctionMask(LLVMMaskAllOn);
}
else {
Assert(type->isUnmasked == false);
// Otherwise use the mask to set the entry mask value
argIter->setName("__mask");
Assert(argIter->getType() == LLVMTypes::MaskType);
ctx->SetFunctionMask(argIter);
Assert(++argIter == function->arg_end());
}
if (type->isTask == true && g->target->getISA() == Target::NVPTX)
{
llvm::NamedMDNode* annotations =
m->module->getOrInsertNamedMetadata("nvvm.annotations");
llvm::SmallVector<llvm::Value*, 3> av;
av.push_back(function);
av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
av.push_back(LLVMInt32(1));
annotations->addOperand(llvm::MDNode::get(*g->ctx, av));
}
}
#if 0
if (type->isTask == true) {
// For tasks, there should always be three parameters: the
// pointer to the structure that holds all of the arguments, the
// thread index, and the thread count variables.
if (!g->target->isPTX()) //if (g->target->getISA() != Target::NVPTX64)
if (g->target->getISA() != Target::NVPTX)
{
llvm::Function::arg_iterator argIter = function->arg_begin();
llvm::Value *structParamPtr = argIter++;
@@ -341,7 +451,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
Assert(type->isUnmasked || type->isExported);
ctx->SetFunctionMask(LLVMMaskAllOn);
}
else /* for NVPTX64 , function must be unmasked */
else /* for NVPTX, function must be unmasked */
{
assert(0);
Assert(type->isUnmasked == false);
@@ -353,7 +463,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
Assert(++argIter == function->arg_end());
}
if (g->target->isPTX() && g->target->getISA() == Target::NVPTX64)
if (g->target->getISA() == Target::NVPTX)
{
llvm::NamedMDNode* annotations =
m->module->getOrInsertNamedMetadata("nvvm.annotations");
@@ -402,6 +512,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
Assert(++argIter == function->arg_end());
}
}
#endif
// Finally, we can generate code for the function
if (code != NULL) {
@@ -535,26 +646,12 @@ Function::GenerateIR() {
}
// And we can now go ahead and emit the code
/* export functions with the NVPTX64 target should be emitted for the host architecture */
#if 0
const FunctionType *func_type= CastType<FunctionType>(sym->type);
if (g->target->getISA() == Target::NVPTX64 && func_type->isExported)
return;
#endif
#if 0
if (g->target->getISA() != Target::NVPTX64 && g->target->isPTX() && func_type->isTask)
return;
#endif
// if (!(g->target->getISA()==Target::NVPTX64 && func_type->isExported))
{
FunctionEmitContext ec(this, sym, function, firstStmtPos);
emitCode(&ec, function, firstStmtPos);
}
if (m->errorCount == 0) {
// if (!(g->target->getISA() == Target::NVPTX64 && func_type->isExported))
if (llvm::verifyFunction(*function, llvm::ReturnStatusAction) == true) {
if (g->debugPrint)
function->dump();
@@ -566,9 +663,9 @@ Function::GenerateIR() {
// the application can call it
const FunctionType *type = CastType<FunctionType>(sym->type);
Assert(type != NULL);
if (type->isExported) { // && g->target->getISA() != Target::VPTX64) {
if (type->isExported) {
if (!type->isTask) {
if (g->target->isPTX() && g->target->getISA() == Target::NVPTX64)
if (g->target->getISA() == Target::NVPTX)
{
llvm::NamedMDNode* annotations =
m->module->getOrInsertNamedMetadata("nvvm.annotations");
@@ -585,7 +682,7 @@ Function::GenerateIR() {
if (g->mangleFunctionsWithTarget)
functionName += std::string("_") + g->target->GetISAString();
if (g->target->getISA() == Target::NVPTX64)
if (g->target->getISA() == Target::NVPTX)
functionName += std::string("___export");
llvm::Function *appFunction =
llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module);
@@ -615,7 +712,7 @@ Function::GenerateIR() {
FATAL("Function verificication failed");
}
}
if (g->target->isPTX() && g->target->getISA() == Target::NVPTX64)
if (g->target->getISA() == Target::NVPTX)
{
llvm::NamedMDNode* annotations =
m->module->getOrInsertNamedMetadata("nvvm.annotations");
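For reference, these nvvm.annotations entries are what mark a function as a kernel entry for the NVVM/PTX backend; in the textual IR of the output module they look roughly like this (the signature and name are a hypothetical example; exported functions receive the ___export suffix per the mangling code above):

    !nvvm.annotations = !{!0}
    !0 = metadata !{void (float*, i32)* @foo___export, metadata !"kernel", i32 1}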


@@ -174,7 +174,7 @@ static const char *supportedCPUs[] = {
#endif // LLVM 3.4+
};
Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, bool isPTX) :
Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
m_target(NULL),
m_targetMachine(NULL),
#if defined(LLVM_3_1)
@@ -184,7 +184,6 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
#endif
m_valid(false),
m_isa(SSE2),
m_isPTX(isPTX),
m_arch(""),
m_is32Bit(true),
m_cpu(""),
@@ -212,7 +211,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
if (!strcmp(cpu, "core-avx2"))
isa = "avx2-i32x8";
else if (!strcmp(cpu, "sm_35"))
isa = "nvptx64";
isa = "nvptx";
#ifdef ISPC_ARM_ENABLED
else if (!strcmp(cpu, "cortex-a9") ||
!strcmp(cpu, "cortex-a15"))
@@ -249,7 +248,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
cpu = "cortex-a9";
#endif
if (cpu == NULL && !strcmp(isa, "nvptx64"))
if (cpu == NULL && !strcmp(isa, "nvptx"))
cpu = "sm_35";
if (cpu == NULL) {
@@ -280,8 +279,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
this->m_cpu = cpu;
if (arch == NULL) {
if (!strcmp(isa, "nvptx64"))
arch = "nvptx64";
if (!strcmp(isa, "nvptx"))
arch = "nvptx";
#ifdef ISPC_ARM_ENABLED
else if (!strncmp(isa, "neon", 4))
arch = "arm";
@@ -709,10 +708,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
this->m_maskBitCount = 32;
}
#endif
else if (!strcasecmp(isa, "nvptx64"))
else if (!strcasecmp(isa, "nvptx"))
{
this->m_isa = Target::NVPTX64;
this->m_isPTX = true;
this->m_isa = Target::NVPTX;
this->m_nativeVectorWidth = 32;
this->m_nativeVectorAlignment = 32;
this->m_vectorWidth = 1;
@@ -780,7 +778,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-"
"i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-"
"f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32-v4:128:128";
} else if (m_isa == Target::NVPTX64)
} else if (m_isa == Target::NVPTX)
{
dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64";
}
@@ -803,7 +801,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic, boo
// Initialize target-specific "target-feature" attribute.
if (!m_attributes.empty()) {
llvm::AttrBuilder attrBuilder;
if (m_isa != Target::NVPTX64)
if (m_isa != Target::NVPTX)
attrBuilder.addAttribute("target-cpu", this->m_cpu);
attrBuilder.addAttribute("target-features", this->m_attributes);
this->m_tf_attributes = new llvm::AttributeSet(
@@ -838,7 +836,7 @@ Target::SupportedCPUs() {
const char *
Target::SupportedArchs() {
return "nvptx64, "
return "nvptx, "
#ifdef ISPC_ARM_ENABLED
"arm, "
#endif
@@ -848,7 +846,7 @@ Target::SupportedArchs() {
const char *
Target::SupportedTargets() {
return "nvptx64, "
return "nvptx, "
#ifdef ISPC_ARM_ENABLED
"neon-i8x16, neon-i16x8, neon-i32x4, "
#endif
@@ -866,9 +864,9 @@ Target::SupportedTargets() {
std::string
Target::GetTripleString() const {
llvm::Triple triple;
if (m_arch == "nvptx64")
if (m_arch == "nvptx")
{
triple.setTriple("nvptx64");
triple.setTriple("nvptx");
}
#ifdef ISPC_ARM_ENABLED
else if (m_arch == "arm") {
@@ -902,8 +900,8 @@ Target::GetTripleString() const {
const char *
Target::ISAToString(ISA isa) {
switch (isa) {
case Target::NVPTX64:
return "nvptx64";
case Target::NVPTX:
return "nvptx";
#ifdef ISPC_ARM_ENABLED
case Target::NEON8:
return "neon-8";

ispc.h

@@ -179,7 +179,7 @@ public:
flexible/performant of them will appear last in the enumerant. Note
also that __best_available_isa() needs to be updated if ISAs are
added or the enumerant values are reordered. */
enum ISA { NVPTX64,
enum ISA { NVPTX,
#ifdef ISPC_ARM_ENABLED
NEON32, NEON16, NEON8,
#endif
@@ -189,7 +189,7 @@ public:
/** Initializes the given Target pointer for a target of the given
name, if the name is a known target. Returns true if the
target was initialized and false if the name is unknown. */
Target(const char *arch, const char *cpu, const char *isa, bool pic, bool isPTX = false);
Target(const char *arch, const char *cpu, const char *isa, bool pic);
/** Returns a comma-delimited string giving the names of the currently
supported compilation targets. */
@@ -251,7 +251,6 @@ public:
bool isValid() const {return m_valid;}
ISA getISA() const {return m_isa;}
bool isPTX() const {return m_isPTX;}
std::string getArch() const {return m_arch;}
@@ -310,7 +309,6 @@ private:
/** Instruction set being compiled to. */
ISA m_isa;
bool m_isPTX;
/** Target system architecture. (e.g. "x86-64", "x86"). */
std::string m_arch;
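With the isPTX flag gone, selecting the PTX backend is purely ISA-driven. A hedged example of constructing such a target, matching the driver code in module.cpp below (NULL arch/cpu are defaulted to "nvptx"/"sm_35" by the constructor, per ispc.cpp above):

    Target *t = new Target(/* arch */ NULL, /* cpu */ NULL, /* isa */ "nvptx",
                           /* pic */ false);
    if (!t->isValid()) {
        // unknown target name; the driver bails out with an error
    }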


@@ -733,7 +733,7 @@ Module::AddFunctionDeclaration(const std::string &name,
if (storageClass == SC_EXTERN_C) {
// Make sure the user hasn't supplied both an 'extern "C"' and a
// 'task' qualifier with the function
if (functionType->isTask) //&& !g->target->isPTX()) //tISA() != Target::NVPTX64)
if (functionType->isTask)
{
Error(pos, "\"task\" qualifier is illegal with C-linkage extern "
"function \"%s\". Ignoring this function.", name.c_str());
@@ -796,8 +796,8 @@ Module::AddFunctionDeclaration(const std::string &name,
#else // LLVM 3.1 and 3.3+
function->addFnAttr(llvm::Attribute::AlwaysInline);
#endif
/* evghenii: on PTX target this must not be used, cause crash, dunno why */
if (functionType->isTask && g->target->getISA() != Target::NVPTX64)
/* evghenii: on the PTX target the following must not be set; it causes a crash, reason unknown */
if (functionType->isTask && g->target->getISA() != Target::NVPTX)
// This also applies transitively to members I think?
#if defined(LLVM_3_1)
function->setDoesNotAlias(1, true);
@@ -953,7 +953,7 @@ Module::writeOutput(OutputType outputType, const char *outFileName,
const char *fileType = NULL;
switch (outputType) {
case Asm:
if (g->target->getISA() != Target::NVPTX64)
if (g->target->getISA() != Target::NVPTX)
{
if (strcasecmp(suffix, "s"))
fileType = "assembly";
@@ -1053,7 +1053,7 @@ Module::writeBitcode(llvm::Module *module, const char *outFileName) {
}
llvm::raw_fd_ostream fos(fd, (fd != 1), false);
if (g->target->getISA() == Target::NVPTX64)
if (g->target->getISA() == Target::NVPTX)
{
const std::string dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64";
module->setDataLayout(dl_string);
@@ -1925,7 +1925,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre
opts.addMacroDef(g->cppArgs[i].substr(2));
}
}
if (g->target->getISA() == Target::NVPTX64)
if (g->target->getISA() == Target::NVPTX)
{
opts.addMacroDef("__NVPTX__");
opts.addMacroDef("programIndex=laneIndex()");
@@ -2331,135 +2331,9 @@ Module::CompileAndOutput(const char *srcFile,
const char *hostStubFileName,
const char *devStubFileName)
{
char ptxname[] = "nvptx64";
for (int k = 0; k < 7; k++)
ptxname[k] = target[k];
if (0) //target != NULL && strcmp(ptxname,"nvptx64") == 0) // NVPTX64
{
std::vector<std::string> targets = lExtractTargets(target);
Assert(targets.size() > 1);
if (target == NULL || strchr(target, ',') == NULL) {
// We're only compiling to a single target
int errorCount = 0;
const char *suffix_orig = strrchr(outFileName, '.');
++suffix_orig;
assert(suffix_orig!=NULL);
g->PtxString = std::string();
for (int itarget = 0; itarget < 1; itarget++)
{
fprintf(stderr, "compiling nvptx64 : target= %s\n",targets[itarget].c_str());
g->target = new Target(arch, cpu, targets[itarget].c_str(), generatePIC, /* isPTX= */ true);
if (!g->target->isValid())
return 1;
m = new Module(srcFile);
if (m->CompileFile() == 0) {
if (outputType == CXX) {
if (target == NULL || strncmp(target, "generic-", 8) != 0) {
Error(SourcePos(), "When generating C++ output, one of the \"generic-*\" "
"targets must be used.");
return 1;
}
}
else if (outputType == Asm || outputType == Object) {
if (target != NULL && strncmp(target, "generic-", 8) == 0) {
Error(SourcePos(), "When using a \"generic-*\" compilation target, "
"%s output can not be used.",
(outputType == Asm) ? "assembly" : "object file");
return 1;
}
}
assert(outFileName != NULL);
std::string targetOutFileName =
lGetTargetFileName(outFileName, targets[itarget].c_str());
if (outputType == Asm)
{
const char * targetOutFileName_c = targetOutFileName.c_str();
const int suffix = strrchr(targetOutFileName_c, '.') - targetOutFileName_c + 1;
if (itarget == 1 && !strcasecmp(suffix_orig, "ptx"))
{
targetOutFileName[suffix ] = 's';
targetOutFileName[suffix+1] = 0;
}
}
if (outputType != Object)
{
if (!m->writeOutput(outputType, targetOutFileName.c_str(), includeFileName))
return 1;
}
else if (itarget > 0)
{
if (!m->writeOutput(outputType, outFileName, includeFileName))
return 1;
}
if (itarget == 0)
{ /* store ptx into memory */
llvm::PassManager pm;
#if defined(LLVM_3_1)
pm.add(new llvm::TargetData(*g->target->getDataLayout()));
#else
pm.add(new llvm::DataLayout(*g->target->getDataLayout()));
#endif
llvm::raw_string_ostream rso(g->PtxString);
llvm::formatted_raw_ostream fos(rso);
llvm::TargetMachine::CodeGenFileType fileType = llvm::TargetMachine::CGFT_AssemblyFile;
llvm::TargetMachine *targetMachine = g->target->GetTargetMachine();
if (targetMachine->addPassesToEmitFile(pm, fos, fileType)) {
fprintf(stderr, "Fatal error adding passes to emit object file!");
exit(1);
}
llvm::Module *module = m->module;
pm.run(*module);
fos.flush();
assert(!g->PtxString.empty());
#if 0
std::cout << g->PtxString << std::endl;
#endif
}
if (itarget > 0)
{
if (headerFileName != NULL)
if (!m->writeOutput(Module::Header, headerFileName))
return 1;
if (depsFileName != NULL)
if (!m->writeOutput(Module::Deps,depsFileName))
return 1;
if (hostStubFileName != NULL)
if (!m->writeOutput(Module::HostStub,hostStubFileName))
return 1;
if (devStubFileName != NULL)
if (!m->writeOutput(Module::DevStub,devStubFileName))
return 1;
}
}
else
++m->errorCount;
errorCount += m->errorCount;
delete m;
m = NULL;
delete g->target;
g->target = NULL;
}
return errorCount > 0;
}
else if (target == NULL || strchr(target, ',') == NULL) {
// We're only compiling to a single target
const bool isPTX = strcmp(target, "nvptx64") == 0;
g->target = new Target(arch, cpu, target, generatePIC, isPTX);
g->target = new Target(arch, cpu, target, generatePIC);
if (!g->target->isValid())
return 1;
@@ -2525,8 +2399,6 @@ Module::CompileAndOutput(const char *srcFile,
// The user supplied multiple targets
std::vector<std::string> targets = lExtractTargets(target);
Assert(targets.size() > 1);
for (unsigned int i = 0; i < targets.size(); ++i)
assert(strcmp(targets[i].c_str(), "nvptx64") < 0);
if (outFileName != NULL && strcmp(outFileName, "-") == 0) {
Error(SourcePos(), "Multi-target compilation can't generate output "

stmt.cpp

@@ -206,7 +206,7 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
}
if (sym->storageClass == SC_STATIC) {
if (g->target->getISA() == Target::NVPTX64)
if (g->target->getISA() == Target::NVPTX)
if (!sym->type->IsConstType())
Error(initExpr->pos, "Non-constant static variable \"%s\" is not supported with the \"cuda\" target.",
sym->name.c_str());
@@ -1280,7 +1280,7 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx,
llvm::Value *varyingCounterPtr,
const std::vector<int> &spans)
{
if (!g->target->isPTX())
if (g->target->getISA() != Target::NVPTX)
{
// Smear the uniform counter value out to be varying
llvm::Value *counter = ctx->LoadInst(uniformCounterPtr);
@@ -1315,7 +1315,7 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx,
ctx->StoreInst(varyingCounter, varyingCounterPtr);
return varyingCounter;
}
else /* isPTX() == true */
else /* NVPTX target */
{
// Smear the uniform counter value out to be varying
llvm::Value *counter = ctx->LoadInst(uniformCounterPtr);
@@ -1465,8 +1465,6 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
if (ctx->GetCurrentBasicBlock() == NULL || stmts == NULL)
return;
if (!g->target->isPTX())
{
llvm::BasicBlock *bbFullBody = ctx->CreateBasicBlock("foreach_full_body");
llvm::BasicBlock *bbMaskedBody = ctx->CreateBasicBlock("foreach_masked_body");
llvm::BasicBlock *bbExit = ctx->CreateBasicBlock("foreach_exit");
@@ -1493,469 +1491,9 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
std::vector<llvm::Value *> nExtras, alignedEnd, extrasMaskPtrs;
std::vector<int> span(nDims, 0);
lGetSpans(nDims-1, nDims, g->target->getVectorWidth(), isTiled, &span[0]);
for (int i = 0; i < nDims; ++i) {
// Basic blocks that we'll fill in later with the looping logic for
// this dimension.
bbReset.push_back(ctx->CreateBasicBlock("foreach_reset"));
if (i < nDims-1)
// stepping for the innermost dimension is handled specially
bbStep.push_back(ctx->CreateBasicBlock("foreach_step"));
bbTest.push_back(ctx->CreateBasicBlock("foreach_test"));
// Start and end value for this loop dimension
llvm::Value *sv = startExprs[i]->GetValue(ctx);
llvm::Value *ev = endExprs[i]->GetValue(ctx);
if (sv == NULL || ev == NULL)
return;
startVals.push_back(sv);
endVals.push_back(ev);
// nItems = endVal - startVal
llvm::Value *nItems =
ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems");
// nExtras = nItems % (span for this dimension)
// This gives us the number of extra elements we need to deal with
// at the end of the loop for this dimension that don't fit cleanly
// into a vector width.
nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems,
LLVMInt32(span[i]), "nextras"));
// alignedEnd = endVal - nExtras
alignedEnd.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev,
nExtras[i], "aligned_end"));
///////////////////////////////////////////////////////////////////////
// Each dimension has a loop counter that is a uniform value that
// goes from startVal to endVal, in steps of the span for this
// dimension. Its value is only used internally here for looping
// logic and isn't directly available in the user's program code.
uniformCounterPtrs.push_back(ctx->AllocaInst(LLVMTypes::Int32Type,
"counter"));
ctx->StoreInst(startVals[i], uniformCounterPtrs[i]);
// There is also a varying variable that holds the set of index
// values for each dimension in the current loop iteration; this is
// the value that is program-visible.
dimVariables[i]->storagePtr =
ctx->AllocaInst(LLVMTypes::Int32VectorType,
dimVariables[i]->name.c_str());
dimVariables[i]->parentFunction = ctx->GetFunction();
ctx->EmitVariableDebugInfo(dimVariables[i]);
// Each dimension also maintains a mask that represents which of
// the varying elements in the current iteration should be
// processed. (i.e. this is used to disable the lanes that have
// out-of-bounds offsets.)
extrasMaskPtrs.push_back(ctx->AllocaInst(LLVMTypes::MaskType, "extras mask"));
ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]);
}
ctx->StartForeach(FunctionEmitContext::FOREACH_REGULAR);
// On to the outermost loop's test
ctx->BranchInst(bbTest[0]);
///////////////////////////////////////////////////////////////////////////
// foreach_reset: this code runs when we need to reset the counter for
// a given dimension in preparation for running through its loop again,
// after the enclosing level advances its counter.
for (int i = 0; i < nDims; ++i) {
ctx->SetCurrentBasicBlock(bbReset[i]);
if (i == 0)
ctx->BranchInst(bbExit);
else {
ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]);
ctx->StoreInst(startVals[i], uniformCounterPtrs[i]);
ctx->BranchInst(bbStep[i-1]);
}
}
///////////////////////////////////////////////////////////////////////////
// foreach_step: increment the uniform counter by the vector width.
// Note that we don't increment the varying counter here as well but
// just generate its value when we need it in the loop body. Don't do
// this for the innermost dimension, which has a more complex stepping
// structure..
for (int i = 0; i < nDims-1; ++i) {
ctx->SetCurrentBasicBlock(bbStep[i]);
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]);
llvm::Value *newCounter =
ctx->BinaryOperator(llvm::Instruction::Add, counter,
LLVMInt32(span[i]), "new_counter");
ctx->StoreInst(newCounter, uniformCounterPtrs[i]);
ctx->BranchInst(bbTest[i]);
}
///////////////////////////////////////////////////////////////////////////
// foreach_test (for all dimensions other than the innermost...)
std::vector<llvm::Value *> inExtras;
for (int i = 0; i < nDims-1; ++i) {
ctx->SetCurrentBasicBlock(bbTest[i]);
llvm::Value *haveExtras =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT,
endVals[i], alignedEnd[i], "have_extras");
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i], "counter");
llvm::Value *atAlignedEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
counter, alignedEnd[i], "at_aligned_end");
llvm::Value *inEx =
ctx->BinaryOperator(llvm::Instruction::And, haveExtras,
atAlignedEnd, "in_extras");
if (i == 0)
inExtras.push_back(inEx);
else
inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx,
inExtras[i-1], "in_extras_all"));
llvm::Value *varyingCounter =
lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i],
dimVariables[i]->storagePtr, span);
llvm::Value *smearEnd = ctx->BroadcastValue(
endVals[i], LLVMTypes::Int32VectorType, "smear_end");
// Do a vector compare of its value to the end value to generate a
// mask for this last bit of work.
llvm::Value *emask =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
varyingCounter, smearEnd);
emask = ctx->I1VecToBoolVec(emask);
if (i == 0)
ctx->StoreInst(emask, extrasMaskPtrs[i]);
else {
llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]);
llvm::Value *newMask =
ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask,
"extras_mask");
ctx->StoreInst(newMask, extrasMaskPtrs[i]);
}
llvm::Value *notAtEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
counter, endVals[i]);
ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd);
}
///////////////////////////////////////////////////////////////////////////
// foreach_test (for innermost dimension)
//
// All of the outer dimensions are handled generically--basically as a
// for() loop from the start value to the end value, where at each loop
// test, we compute the mask of active elements for the current
// dimension and then update an overall mask that is the AND
// combination of all of the outer ones.
//
// The innermost loop is handled specially, for performance purposes.
// When starting the innermost dimension, we start by checking once
// whether any of the outer dimensions has set the mask to be
// partially-active or not. We follow different code paths for these
// two cases, taking advantage of the knowledge that the mask is all
// on, when this is the case.
//
// In each of these code paths, we start with a loop from the starting
// value to the aligned end value for the innermost dimension; we can
// guarantee that the innermost loop will have an "all on" mask (as far
// as its dimension is concerned) for the duration of this loop. Doing
// so allows us to emit code that assumes the mask is all on (for the
// case where none of the outer dimensions has set the mask to be
// partially on), or allows us to emit code that just uses the mask
// from the outer dimensions directly (for the case where they have).
//
// After this loop, we just need to deal with one vector's worth of
// "ragged extra bits", where the mask used includes the effect of the
// mask for the innermost dimension.
//
// We start out this process by emitting the check that determines
// whether any of the enclosing dimensions is partially active
// (i.e. processing extra elements that don't exactly fit into a
// vector).
llvm::BasicBlock *bbOuterInExtras =
ctx->CreateBasicBlock("outer_in_extras");
llvm::BasicBlock *bbOuterNotInExtras =
ctx->CreateBasicBlock("outer_not_in_extras");
ctx->SetCurrentBasicBlock(bbTest[nDims-1]);
if (inExtras.size())
ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras,
inExtras.back());
else
// for a 1D iteration domain, we certainly don't have any enclosing
// dimensions that are processing extra elements.
ctx->BranchInst(bbOuterNotInExtras);
///////////////////////////////////////////////////////////////////////////
// One or more outer dimensions in extras, so we need to mask for the loop
// body regardless. We break this into two cases, roughly:
// for (counter = start; counter < alignedEnd; counter += step) {
// // mask is all on for inner, so set mask to outer mask
// // run loop body with mask
// }
// // counter == alignedEnd
// if (counter < end) {
// // set mask to outermask & (counter+programCounter < end)
// // run loop body with mask
// }
llvm::BasicBlock *bbAllInnerPartialOuter =
ctx->CreateBasicBlock("all_inner_partial_outer");
llvm::BasicBlock *bbPartial =
ctx->CreateBasicBlock("both_partial");
ctx->SetCurrentBasicBlock(bbOuterInExtras); {
// Update the varying counter value here, since all subsequent
// blocks along this path need it.
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
dimVariables[nDims-1]->storagePtr, span);
// here we just check to see if counter < alignedEnd
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
llvm::Value *beforeAlignedEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
counter, alignedEnd[nDims-1], "before_aligned_end");
ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd);
}
// Below we have a basic block that runs the loop body code for the
// case where the mask is partially but not fully on. This same block
// runs in multiple cases: both for handling any ragged extra data for
// the innermost dimension but also when outer dimensions have set the
// mask to be partially on.
//
// The value stored in stepIndexAfterMaskedBodyPtr is used after each
// execution of the body code to determine whether the innermost index
// value should be incremented by the step (we're running the "for"
// loop of full vectors at the innermost dimension, with outer
// dimensions having set the mask to be partially on), or whether we're
// running once for the ragged extra bits at the end of the innermost
// dimension, in which case we're done with the innermost dimension and
// should step the loop counter for the next enclosing dimension
// instead.
llvm::Value *stepIndexAfterMaskedBodyPtr =
ctx->AllocaInst(LLVMTypes::BoolType, "step_index");
///////////////////////////////////////////////////////////////////////////
// We're in the inner loop part where the only masking is due to outer
// dimensions but the innermost dimension fits fully into a vector's
// width. Set the mask and jump to the masked loop body.
ctx->SetCurrentBasicBlock(bbAllInnerPartialOuter); {
llvm::Value *mask;
if (nDims == 1)
// 1D loop; we shouldn't ever get here anyway
mask = LLVMMaskAllOff;
else
mask = ctx->LoadInst(extrasMaskPtrs[nDims-2]);
ctx->SetInternalMask(mask);
ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr);
ctx->BranchInst(bbMaskedBody);
}
///////////////////////////////////////////////////////////////////////////
// We need to include the effect of the innermost dimension in the mask
// for the final bits here
ctx->SetCurrentBasicBlock(bbPartial); {
llvm::Value *varyingCounter =
ctx->LoadInst(dimVariables[nDims-1]->storagePtr);
llvm::Value *smearEnd = ctx->BroadcastValue(
endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end");
llvm::Value *emask =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
varyingCounter, smearEnd);
emask = ctx->I1VecToBoolVec(emask);
if (nDims == 1) {
ctx->SetInternalMask(emask);
}
else {
llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]);
llvm::Value *newMask =
ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask,
"extras_mask");
ctx->SetInternalMask(newMask);
}
ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr);
ctx->BranchInst(bbMaskedBody);
}
///////////////////////////////////////////////////////////////////////////
// None of the outer dimensions is processing extras; along the lines
// of above, we can express this as:
// for (counter = start; counter < alignedEnd; counter += step) {
// // mask is all on
// // run loop body with mask all on
// }
// // counter == alignedEnd
// if (counter < end) {
// // set mask to (counter+programCounter < end)
// // run loop body with mask
// }
llvm::BasicBlock *bbPartialInnerAllOuter =
ctx->CreateBasicBlock("partial_inner_all_outer");
ctx->SetCurrentBasicBlock(bbOuterNotInExtras); {
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
llvm::Value *beforeAlignedEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
counter, alignedEnd[nDims-1], "before_aligned_end");
ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter,
beforeAlignedEnd);
}
///////////////////////////////////////////////////////////////////////////
// full_body: do a full vector's worth of work. We know that all
// lanes will be running here, so we explicitly set the mask to be 'all
// on'. This ends up being relatively straightforward: just update the
// value of the varying loop counter and have the statements in the
// loop body emit their code.
llvm::BasicBlock *bbFullBodyContinue =
ctx->CreateBasicBlock("foreach_full_continue");
ctx->SetCurrentBasicBlock(bbFullBody); {
ctx->SetInternalMask(LLVMMaskAllOn);
ctx->SetBlockEntryMask(LLVMMaskAllOn);
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
dimVariables[nDims-1]->storagePtr, span);
ctx->SetContinueTarget(bbFullBodyContinue);
ctx->AddInstrumentationPoint("foreach loop body (all on)");
stmts->EmitCode(ctx);
AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL);
ctx->BranchInst(bbFullBodyContinue);
}
ctx->SetCurrentBasicBlock(bbFullBodyContinue); {
ctx->RestoreContinuedLanes();
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]);
llvm::Value *newCounter =
ctx->BinaryOperator(llvm::Instruction::Add, counter,
LLVMInt32(span[nDims-1]), "new_counter");
ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]);
ctx->BranchInst(bbOuterNotInExtras);
}
///////////////////////////////////////////////////////////////////////////
// We're done running blocks with the mask all on; see if the counter is
// less than the end value, in which case we need to run the body one
// more time to get the extra bits.
llvm::BasicBlock *bbSetInnerMask =
ctx->CreateBasicBlock("partial_inner_only");
ctx->SetCurrentBasicBlock(bbPartialInnerAllOuter); {
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
llvm::Value *beforeFullEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
counter, endVals[nDims-1], "before_full_end");
ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd);
}
///////////////////////////////////////////////////////////////////////////
// The outer dimensions are all on, so the mask is just given by the
// mask for the innermost dimension
ctx->SetCurrentBasicBlock(bbSetInnerMask); {
llvm::Value *varyingCounter =
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
dimVariables[nDims-1]->storagePtr, span);
llvm::Value *smearEnd = ctx->BroadcastValue(
endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end");
llvm::Value *emask =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
varyingCounter, smearEnd);
emask = ctx->I1VecToBoolVec(emask);
ctx->SetInternalMask(emask);
ctx->SetBlockEntryMask(emask);
ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr);
ctx->BranchInst(bbMaskedBody);
}
///////////////////////////////////////////////////////////////////////////
// masked_body: set the mask and have the statements emit their
// code again. Note that it's generally worthwhile having two copies
// of the statements' code, since the code above is emitted with the
// mask known to be all-on, which in turn leads to more efficient code
// for that case.
llvm::BasicBlock *bbStepInnerIndex =
ctx->CreateBasicBlock("step_inner_index");
llvm::BasicBlock *bbMaskedBodyContinue =
ctx->CreateBasicBlock("foreach_masked_continue");
ctx->SetCurrentBasicBlock(bbMaskedBody); {
ctx->AddInstrumentationPoint("foreach loop body (masked)");
ctx->SetContinueTarget(bbMaskedBodyContinue);
ctx->DisableGatherScatterWarnings();
ctx->SetBlockEntryMask(ctx->GetFullMask());
stmts->EmitCode(ctx);
ctx->EnableGatherScatterWarnings();
ctx->BranchInst(bbMaskedBodyContinue);
}
ctx->SetCurrentBasicBlock(bbMaskedBodyContinue); {
ctx->RestoreContinuedLanes();
llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr);
ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex);
}
///////////////////////////////////////////////////////////////////////////
// step the innermost index, for the case where we're doing the
// innermost for loop over full vectors.
ctx->SetCurrentBasicBlock(bbStepInnerIndex); {
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]);
llvm::Value *newCounter =
ctx->BinaryOperator(llvm::Instruction::Add, counter,
LLVMInt32(span[nDims-1]), "new_counter");
ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]);
ctx->BranchInst(bbOuterInExtras);
}
///////////////////////////////////////////////////////////////////////////
// foreach_exit: All done. Restore the old mask and clean up
ctx->SetCurrentBasicBlock(bbExit);
ctx->SetInternalMask(oldMask);
ctx->SetFunctionMask(oldFunctionMask);
ctx->EndForeach();
ctx->EndScope();
}
else /* isPTX() == true */
{
llvm::BasicBlock *bbFullBody = ctx->CreateBasicBlock("foreach_full_body");
llvm::BasicBlock *bbMaskedBody = ctx->CreateBasicBlock("foreach_masked_body");
llvm::BasicBlock *bbExit = ctx->CreateBasicBlock("foreach_exit");
llvm::Value *oldMask = ctx->GetInternalMask();
llvm::Value *oldFunctionMask = ctx->GetFunctionMask();
ctx->SetDebugPos(pos);
ctx->StartScope();
ctx->SetInternalMask(LLVMMaskAllOn);
ctx->SetFunctionMask(LLVMMaskAllOn);
// This should be caught during typechecking
AssertPos(pos, startExprs.size() == dimVariables.size() &&
endExprs.size() == dimVariables.size());
int nDims = (int)dimVariables.size();
///////////////////////////////////////////////////////////////////////
// Setup: compute the number of items we have to work on in each
// dimension and a number of derived values.
std::vector<llvm::BasicBlock *> bbReset, bbStep, bbTest;
std::vector<llvm::Value *> startVals, endVals, uniformCounterPtrs;
std::vector<llvm::Value *> nExtras, alignedEnd, extrasMaskPtrs;
std::vector<int> span(nDims, 0);
const int vectorWidth = 32;
const int vectorWidth =
g->target->getISA() == Target::NVPTX ? 32 : g->target->getVectorWidth();
lGetSpans(nDims-1, nDims, vectorWidth, isTiled, &span[0]);
#if 0
for (int i = 0; i < nDims; i++)
{
fprintf(stderr, " i= %d [ %d ] : %d \n",
i, nDims, span[i]);
}
fprintf(stderr, " --- \n");
#endif
for (int i = 0; i < nDims; ++i) {
// Basic blocks that we'll fill in later with the looping logic for
@@ -2380,7 +1918,6 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
ctx->EndForeach();
ctx->EndScope();
}
}
Stmt *


@@ -2925,7 +2925,7 @@ FunctionType::GetReturnTypeString() const {
llvm::FunctionType *
FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const {
if (isTask == true) // && !g->target->isPTX()) //getISA() != Target::NVPTX64)
if (isTask == true)
Assert(removeMask == false);
// Get the LLVM Type *s for the function arguments
@@ -2950,15 +2950,12 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const {
llvmArgTypes.push_back(LLVMTypes::MaskType);
std::vector<llvm::Type *> callTypes;
if (isTask) {
if (isTask && g->target->getISA() != Target::NVPTX) {
// Tasks take three arguments: a pointer to a struct that holds the
// actual task arguments, the thread index, and the total number of
// threads the tasks system has running. (Task arguments are
// marshalled in a struct so that it's easy to allocate space to
// hold them until the task actually runs.)
// if (g->target->getISA() != Target::NVPTX64)
if (!g->target->isPTX())
{
llvm::Type *st = llvm::StructType::get(*ctx, llvmArgTypes);
callTypes.push_back(llvm::PointerType::getUnqual(st));
callTypes.push_back(LLVMTypes::Int32Type); // threadIndex
@@ -2972,22 +2969,11 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const {
callTypes.push_back(LLVMTypes::Int32Type); // taskCount1
callTypes.push_back(LLVMTypes::Int32Type); // taskCount2
}
else
{
if (g->target->getISA() == Target::NVPTX64)
callTypes = llvmArgTypes;
else
{
assert(0); /* evghenii: must be removed in final, just for test for nvptx64 target */
llvm::Type *st = llvm::StructType::get(*ctx, llvmArgTypes);
callTypes.push_back(llvm::PointerType::getUnqual(st));
}
}
}
else
// Otherwise we already have the types of the arguments
callTypes = llvmArgTypes;
if (returnType == NULL) {
Assert(m->errorCount > 0);
return NULL;
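The net effect of this change on task signatures, sketched as C++ comments (the task name and parameter list are hypothetical):

    // Default targets marshal the arguments into a struct and append the
    // launch bookkeeping parameters (cf. the callTypes pushes above):
    //   void my_task(ArgStruct *args,
    //                int threadIndex, int threadCount, int taskIndex, int taskCount,
    //                int taskIndex0, int taskIndex1, int taskIndex2,
    //                int taskCount0, int taskCount1, int taskCount2);
    // On nvptx the kernel instead takes its declared parameters directly:
    //   void my_task(float *out, float scale, int n);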