Add experimental --fast-masked-vload flag for SSE.
builtins.cpp (23 changed lines)
@@ -389,6 +389,27 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
 }
 
+static void
+lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
+                       SymbolTable *symbolTable) {
+    std::vector<const Type *> args;
+    FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());
+    Symbol *sym = new Symbol(name, SourcePos(), ft);
+    sym->isStatic = true;
+
+    llvm::Function *func = module->getFunction(name);
+    assert(func != NULL); // it should be declared already...
+    func->addFnAttr(llvm::Attribute::AlwaysInline);
+    llvm::BasicBlock *bblock = llvm::BasicBlock::Create(*g->ctx, "entry", func, 0);
+    llvm::ReturnInst::Create(*g->ctx, LLVMInt32(val), bblock);
+
+    sym->function = func;
+    symbolTable->AddVariable(sym);
+}
+
+
 static void
 lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
     Symbol *pidx = new Symbol("programIndex", SourcePos(),
@@ -492,6 +513,8 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
                        symbolTable);
     lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
                        symbolTable);
+    lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
+                           symbolTable);
 
     if (includeStdlibISPC) {
         // If the user wants the standard library to be included, parse the
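In effect, lDefineConstantIntFunc gives the already-declared __fast_masked_vload() a one-instruction, always-inline body that returns the flag's value, so after inlining the stdlib's runtime test constant-folds away. A C++ analogue of what the generated function amounts to (an illustrative sketch with a hypothetical name, not code from the commit):

    // Stand-in for the IR built above: a zero-argument, always-inline
    // function that just returns the flag as an int.
    inline int fast_masked_vload_flag() {
        return 1;   // (int)g->opt.fastMaskedVload; 0 when the flag is off
    }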
builtins.m4 (12 changed lines)
@@ -851,6 +851,8 @@ define internal void @__prefetch_read_nt_$1($2 *) alwaysinline {
 
 define(`stdlib_core', `
 
+declare i32 @__fast_masked_vload()
+
 declare i8* @ISPCMalloc(i64, i32) nounwind
 declare i8* @ISPCFree(i8*) nounwind
 declare void @ISPCLaunch(i8*, i8*) nounwind
@@ -1375,14 +1377,22 @@ define(`load_masked', `
 define <$1 x $2> @__load_masked_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
 entry:
   %mm = call i32 @__movmsk(<$1 x i32> %mask)
 
   ; if the first lane and the last lane are on, then it is safe to do a vector load
   ; of the whole thing--what the lanes in the middle want turns out to not matter...
   %mm_and = and i32 %mm, eval(1 | (1<<($1-1)))
   %can_vload = icmp eq i32 %mm_and, eval(1 | (1<<($1-1)))
 
+  %mm_not_zero = icmp ne i32 %mm, 0
+  %fast32 = call i32 @__fast_masked_vload()
+  %fast_i1 = trunc i32 %fast32 to i1
+  %vload_fast = and i1 %mm_not_zero, %fast_i1
+  %can_vload_maybe_fast = or i1 %vload_fast, %can_vload
+
   ; if we are not able to do a single vload, we will accumulate lanes in this memory..
   %retptr = alloca <$1 x $2>
   %retptr32 = bitcast <$1 x $2> * %retptr to $2 *
-  br i1 %can_vload, label %load, label %loop
+  br i1 %can_vload_maybe_fast, label %load, label %loop
 
 load:
   %ptr = bitcast i8 * %0 to <$1 x $2> *
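To make the mask test concrete: for a hypothetical 4-wide gang ($1 = 4), eval(1 | (1<<($1-1))) expands to 9 (binary 1001), the first and last lane bits. A C++ sketch of the branch condition before and after this change (variable names invented for illustration):

    #include <cstdint>

    // mm is the movmsk result: bit i is set iff lane i is on.
    bool can_vector_load(uint32_t mm, bool fast_masked_vload) {
        const uint32_t edge = 1u | (1u << (4 - 1));  // 9 = 0b1001 for 4 lanes
        bool can_vload = (mm & edge) == edge;   // safe: both edge lanes on, so
                                                // the full vector is in bounds
        bool vload_fast = (mm != 0) && fast_masked_vload;  // new: any lane on
        return can_vload || vload_fast;         // %can_vload_maybe_fast
    }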
ispc.cpp (1 changed line)
@@ -255,6 +255,7 @@ Target::GetTargetMachine() const {
 Opt::Opt() {
     level = 1;
     fastMath = false;
+    fastMaskedVload = false;
     disableBlendedMaskedStores = false;
     disableCoherentControlFlow = false;
     disableUniformControlFlow = false;
ispc.h (6 changed lines)
@@ -238,6 +238,12 @@ struct Opt {
         should be performed. This is false by default. */
     bool fastMath;
 
+    /** Indicates whether a vector load should be issued for masked loads
+        on platforms that don't have a native masked vector load. (This may
+        lead to accessing memory up to programCount-1 elements past the end
+        of arrays, so is unsafe in general.) */
+    bool fastMaskedVload;
+
     /** On targets that don't have a masked store instruction but do have a
         blending instruction, by default, we simulate masked stores by
         loading the old value, blending, and storing the result. This can
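The "up to programCount-1 elements past the end" hazard follows directly from the fast path: it issues one full-width load whenever any lane is active, so if only lane 0 is live and the pointer targets an array's last element, the load still covers the lanes beyond it. A scalar C++ model of the two strategies (a hypothetical sketch, names invented for illustration):

    #include <cstdint>
    #include <cstring>

    enum { WIDTH = 4 };  // stand-in for programCount

    // Safe path: touch only active lanes, never past the array.
    void load_safe(const float *p, uint32_t mask, float out[WIDTH]) {
        for (int i = 0; i < WIDTH; ++i)
            if (mask & (1u << i))
                out[i] = p[i];
    }

    // Fast path: one full-width load for any nonzero mask. With mask == 1
    // and p at the array's final element, this reads WIDTH-1 floats past
    // the end -- which is why the flag is opt-in and unsafe in general.
    void load_fast(const float *p, uint32_t mask, float out[WIDTH]) {
        if (mask != 0)
            std::memcpy(out, p, WIDTH * sizeof(float));  // models the vector load
    }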
main.cpp (3 changed lines)
@@ -74,6 +74,7 @@ static void usage(int ret) {
     printf("    [--emit-llvm]\t\t\tEmit LLVM bitcode file as output\n");
     printf("    [--emit-obj]\t\t\tGenerate object file as output (default)\n");
     printf("    [--fast-math]\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
+    printf("    [--fast-masked-vload]\tFaster masked vector loads on SSE (may go past end of array)\n");
     printf("    [-g]\t\t\t\tGenerate debugging information\n");
     printf("    [--help]\t\t\t\tPrint help\n");
     printf("    [-h <name>/--header-outfile=<name>]\tOutput filename for header\n");
@@ -199,6 +200,8 @@ int main(int Argc, char *Argv[]) {
             cpu = argv[i] + 6;
         else if (!strcmp(argv[i], "--fast-math"))
             g->opt.fastMath = true;
+        else if (!strcmp(argv[i], "--fast-masked-vload"))
+            g->opt.fastMaskedVload = true;
         else if (!strcmp(argv[i], "--debug"))
             g->debugPrint = true;
         else if (!strcmp(argv[i], "--instrument"))
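With the parsing above in place, the flag is a plain opt-in toggle on the command line; a hypothetical invocation (file names invented for illustration):

    ispc --fast-masked-vload prog.ispc -o prog.o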