Add experimental --fast-masked-vload flag for SSE.

This commit is contained in:
Matt Pharr
2011-09-12 12:29:33 -07:00
parent 6375ed9224
commit 83f22f1939
5 changed files with 44 additions and 1 deletions

View File

@@ -389,6 +389,27 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
} }
static void
lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
SymbolTable *symbolTable) {
std::vector<const Type *> args;
FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());
Symbol *sym = new Symbol(name, SourcePos(), ft);
sym->isStatic = true;
llvm::Function *func = module->getFunction(name);
assert(func != NULL); // it should be declared already...
func->addFnAttr(llvm::Attribute::AlwaysInline);
llvm::BasicBlock *bblock = llvm::BasicBlock::Create(*g->ctx, "entry", func, 0);
llvm::ReturnInst::Create(*g->ctx, LLVMInt32(val), bblock);
sym->function = func;
symbolTable->AddVariable(sym);
}
static void static void
lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) { lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
Symbol *pidx = new Symbol("programIndex", SourcePos(), Symbol *pidx = new Symbol("programIndex", SourcePos(),
@@ -492,6 +513,8 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
symbolTable); symbolTable);
lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module, lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
symbolTable); symbolTable);
lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
symbolTable);
if (includeStdlibISPC) { if (includeStdlibISPC) {
// If the user wants the standard library to be included, parse the // If the user wants the standard library to be included, parse the

View File

@@ -851,6 +851,8 @@ define internal void @__prefetch_read_nt_$1($2 *) alwaysinline {
define(`stdlib_core', ` define(`stdlib_core', `
declare i32 @__fast_masked_vload()
declare i8* @ISPCMalloc(i64, i32) nounwind declare i8* @ISPCMalloc(i64, i32) nounwind
declare i8* @ISPCFree(i8*) nounwind declare i8* @ISPCFree(i8*) nounwind
declare void @ISPCLaunch(i8*, i8*) nounwind declare void @ISPCLaunch(i8*, i8*) nounwind
@@ -1375,14 +1377,22 @@ define(`load_masked', `
define <$1 x $2> @__load_masked_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline { define <$1 x $2> @__load_masked_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
entry: entry:
%mm = call i32 @__movmsk(<$1 x i32> %mask) %mm = call i32 @__movmsk(<$1 x i32> %mask)
; if the first lane and the last lane are on, then it is safe to do a vector load ; if the first lane and the last lane are on, then it is safe to do a vector load
; of the whole thing--what the lanes in the middle want turns out to not matter... ; of the whole thing--what the lanes in the middle want turns out to not matter...
%mm_and = and i32 %mm, eval(1 | (1<<($1-1))) %mm_and = and i32 %mm, eval(1 | (1<<($1-1)))
%can_vload = icmp eq i32 %mm_and, eval(1 | (1<<($1-1))) %can_vload = icmp eq i32 %mm_and, eval(1 | (1<<($1-1)))
%mm_not_zero = icmp ne i32 %mm, 0
%fast32 = call i32 @__fast_masked_vload()
%fast_i1 = trunc i32 %fast32 to i1
%vload_fast = and i1 %mm_not_zero, %fast_i1
%can_vload_maybe_fast = or i1 %vload_fast, %can_vload
; if we are not able to do a single vload, we will accumulate lanes in this memory.. ; if we are not able to do a single vload, we will accumulate lanes in this memory..
%retptr = alloca <$1 x $2> %retptr = alloca <$1 x $2>
%retptr32 = bitcast <$1 x $2> * %retptr to $2 * %retptr32 = bitcast <$1 x $2> * %retptr to $2 *
br i1 %can_vload, label %load, label %loop br i1 %can_vload_maybe_fast, label %load, label %loop
load: load:
%ptr = bitcast i8 * %0 to <$1 x $2> * %ptr = bitcast i8 * %0 to <$1 x $2> *

View File

@@ -255,6 +255,7 @@ Target::GetTargetMachine() const {
Opt::Opt() { Opt::Opt() {
level = 1; level = 1;
fastMath = false; fastMath = false;
fastMaskedVload = false;
disableBlendedMaskedStores = false; disableBlendedMaskedStores = false;
disableCoherentControlFlow = false; disableCoherentControlFlow = false;
disableUniformControlFlow = false; disableUniformControlFlow = false;

6
ispc.h
View File

@@ -238,6 +238,12 @@ struct Opt {
should be performed. This is false by default. */ should be performed. This is false by default. */
bool fastMath; bool fastMath;
/** Indicates whether a vector load should be issued for masked loads
on platforms that don't have a native masked vector load. (This may
lead to accessing memory up to programCount-1 elements past the end of
arrays, so is unsafe in general.) */
bool fastMaskedVload;
/** On targets that don't have a masked store instruction but do have a /** On targets that don't have a masked store instruction but do have a
blending instruction, by default, we simulate masked stores by blending instruction, by default, we simulate masked stores by
loading the old value, blending, and storing the result. This can loading the old value, blending, and storing the result. This can

View File

@@ -74,6 +74,7 @@ static void usage(int ret) {
printf(" [--emit-llvm]\t\t\tEmit LLVM bitode file as output\n"); printf(" [--emit-llvm]\t\t\tEmit LLVM bitode file as output\n");
printf(" [--emit-obj]\t\t\tGenerate object file file as output (default)\n"); printf(" [--emit-obj]\t\t\tGenerate object file file as output (default)\n");
printf(" [--fast-math]\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n"); printf(" [--fast-math]\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
printf(" [--fast-masked-vload]\tFaster masked vector loads on SSE (may go past end of array)\n");
printf(" [-g]\t\t\t\tGenerate debugging information\n"); printf(" [-g]\t\t\t\tGenerate debugging information\n");
printf(" [--help]\t\t\t\tPrint help\n"); printf(" [--help]\t\t\t\tPrint help\n");
printf(" [-h <name>/--header-outfile=<name>]\tOutput filename for header\n"); printf(" [-h <name>/--header-outfile=<name>]\tOutput filename for header\n");
@@ -199,6 +200,8 @@ int main(int Argc, char *Argv[]) {
cpu = argv[i] + 6; cpu = argv[i] + 6;
else if (!strcmp(argv[i], "--fast-math")) else if (!strcmp(argv[i], "--fast-math"))
g->opt.fastMath = true; g->opt.fastMath = true;
else if (!strcmp(argv[i], "--fast-masked-vload"))
g->opt.fastMaskedVload = true;
else if (!strcmp(argv[i], "--debug")) else if (!strcmp(argv[i], "--debug"))
g->debugPrint = true; g->debugPrint = true;
else if (!strcmp(argv[i], "--instrument")) else if (!strcmp(argv[i], "--instrument"))