From 83f22f19394fb6a92567d7fb78c64ab10456514e Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Mon, 12 Sep 2011 12:29:33 -0700 Subject: [PATCH] Add experimental --fast-masked-vload flag for SSE. --- builtins.cpp | 23 +++++++++++++++++++++++ builtins.m4 | 12 +++++++++++- ispc.cpp | 1 + ispc.h | 6 ++++++ main.cpp | 3 +++ 5 files changed, 44 insertions(+), 1 deletion(-) diff --git a/builtins.cpp b/builtins.cpp index c38001f7..c82cb071 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -389,6 +389,27 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module, } + +static void +lDefineConstantIntFunc(const char *name, int val, llvm::Module *module, + SymbolTable *symbolTable) { + std::vector args; + FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos()); + Symbol *sym = new Symbol(name, SourcePos(), ft); + sym->isStatic = true; + + llvm::Function *func = module->getFunction(name); + assert(func != NULL); // it should be declared already... + func->addFnAttr(llvm::Attribute::AlwaysInline); + llvm::BasicBlock *bblock = llvm::BasicBlock::Create(*g->ctx, "entry", func, 0); + llvm::ReturnInst::Create(*g->ctx, LLVMInt32(val), bblock); + + sym->function = func; + symbolTable->AddVariable(sym); +} + + + static void lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) { Symbol *pidx = new Symbol("programIndex", SourcePos(), @@ -492,6 +513,8 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod symbolTable); lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module, symbolTable); + lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module, + symbolTable); if (includeStdlibISPC) { // If the user wants the standard library to be included, parse the diff --git a/builtins.m4 b/builtins.m4 index b3a1da0e..13295b9d 100644 --- a/builtins.m4 +++ b/builtins.m4 @@ -851,6 +851,8 @@ define internal void @__prefetch_read_nt_$1($2 *) alwaysinline { define(`stdlib_core', ` 
+declare i32 @__fast_masked_vload() + declare i8* @ISPCMalloc(i64, i32) nounwind declare i8* @ISPCFree(i8*) nounwind declare void @ISPCLaunch(i8*, i8*) nounwind @@ -1375,14 +1377,22 @@ define(`load_masked', ` define <$1 x $2> @__load_masked_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline { entry: %mm = call i32 @__movmsk(<$1 x i32> %mask) + ; if the first lane and the last lane are on, then it is safe to do a vector load ; of the whole thing--what the lanes in the middle want turns out to not matter... %mm_and = and i32 %mm, eval(1 | (1<<($1-1))) %can_vload = icmp eq i32 %mm_and, eval(1 | (1<<($1-1))) + + %mm_not_zero = icmp ne i32 %mm, 0 + %fast32 = call i32 @__fast_masked_vload() + %fast_i1 = trunc i32 %fast32 to i1 + %vload_fast = and i1 %mm_not_zero, %fast_i1 + %can_vload_maybe_fast = or i1 %vload_fast, %can_vload + ; if we are not able to do a singe vload, we will accumulate lanes in this memory.. %retptr = alloca <$1 x $2> %retptr32 = bitcast <$1 x $2> * %retptr to $2 * - br i1 %can_vload, label %load, label %loop + br i1 %can_vload_maybe_fast, label %load, label %loop load: %ptr = bitcast i8 * %0 to <$1 x $2> * diff --git a/ispc.cpp b/ispc.cpp index f0810c85..cf9e307f 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -255,6 +255,7 @@ Target::GetTargetMachine() const { Opt::Opt() { level = 1; fastMath = false; + fastMaskedVload = false; disableBlendedMaskedStores = false; disableCoherentControlFlow = false; disableUniformControlFlow = false; diff --git a/ispc.h b/ispc.h index ae249e70..8536732d 100644 --- a/ispc.h +++ b/ispc.h @@ -238,6 +238,12 @@ struct Opt { should be performed. This is false by default. */ bool fastMath; + /** Indicates whether a vector load should be issued for masked loads + on platforms that don't have a native masked vector load. (This may + lead to accessing memory up to programCount-1 elements past the end of + arrays, so is unsafe in general.) 
*/ + bool fastMaskedVload; + /** On targets that don't have a masked store instruction but do have a blending instruction, by default, we simulate masked stores by loading the old value, blending, and storing the result. This can diff --git a/main.cpp b/main.cpp index 70fdc0db..9997f5ee 100644 --- a/main.cpp +++ b/main.cpp @@ -74,6 +74,7 @@ static void usage(int ret) { printf(" [--emit-llvm]\t\t\tEmit LLVM bitode file as output\n"); printf(" [--emit-obj]\t\t\tGenerate object file file as output (default)\n"); printf(" [--fast-math]\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n"); + printf(" [--fast-masked-vload]\tFaster masked vector loads on SSE (may go past end of array)\n"); printf(" [-g]\t\t\t\tGenerate debugging information\n"); printf(" [--help]\t\t\t\tPrint help\n"); printf(" [-h /--header-outfile=]\tOutput filename for header\n"); @@ -199,6 +200,8 @@ int main(int Argc, char *Argv[]) { cpu = argv[i] + 6; else if (!strcmp(argv[i], "--fast-math")) g->opt.fastMath = true; + else if (!strcmp(argv[i], "--fast-masked-vload")) + g->opt.fastMaskedVload = true; else if (!strcmp(argv[i], "--debug")) g->debugPrint = true; else if (!strcmp(argv[i], "--instrument"))