From 83f22f19394fb6a92567d7fb78c64ab10456514e Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Mon, 12 Sep 2011 12:29:33 -0700
Subject: [PATCH] Add experimental --fast-masked-vload flag for SSE.

---
 builtins.cpp | 23 +++++++++++++++++++++++
 builtins.m4  | 12 +++++++++++-
 ispc.cpp     |  1 +
 ispc.h       |  6 ++++++
 main.cpp     |  3 +++
 5 files changed, 44 insertions(+), 1 deletion(-)
diff --git a/builtins.cpp b/builtins.cpp
index c38001f7..c82cb071 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -389,6 +389,27 @@ lDefineConstantInt(const char *name, int val, llvm::Module *module,
 }
 
 
+/** Gives the already-declared LLVM function `name` a body that returns the constant `val`, and registers a matching zero-argument symbol for it in `symbolTable`. */
+static void
+lDefineConstantIntFunc(const char *name, int val, llvm::Module *module,
+                       SymbolTable *symbolTable) {
+    std::vector<const Type *> args;  // left empty: the function takes no parameters
+    FunctionType *ft = new FunctionType(AtomicType::UniformInt32, args, SourcePos());  // () -> uniform int32
+    Symbol *sym = new Symbol(name, SourcePos(), ft);
+    sym->isStatic = true;
+
+    llvm::Function *func = module->getFunction(name);  // look up the existing declaration by name
+    assert(func != NULL); // it should be declared already...
+    func->addFnAttr(llvm::Attribute::AlwaysInline);  // presumably so callers collapse to the constant once inlined
+    llvm::BasicBlock *bblock = llvm::BasicBlock::Create(*g->ctx, "entry", func, 0);  // the function's only block
+    llvm::ReturnInst::Create(*g->ctx, LLVMInt32(val), bblock);  // whole body is just "return val"
+
+    sym->function = func;  // tie the front-end symbol to the LLVM function
+    symbolTable->AddVariable(sym);  // NOTE(review): function-typed symbol added via AddVariable -- confirm intended
+
+
+
 static void
 lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) {
     Symbol *pidx = new Symbol("programIndex", SourcePos(), 
@@ -492,6 +513,8 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
                        symbolTable);
     lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module,
                        symbolTable);
+    lDefineConstantIntFunc("__fast_masked_vload", (int)g->opt.fastMaskedVload, module,
+                           symbolTable);
 
     if (includeStdlibISPC) {
         // If the user wants the standard library to be included, parse the
diff --git a/builtins.m4 b/builtins.m4
index b3a1da0e..13295b9d 100644
--- a/builtins.m4
+++ b/builtins.m4
@@ -851,6 +851,8 @@ define internal void @__prefetch_read_nt_$1($2 *) alwaysinline {
 
 define(`stdlib_core', `
 
+declare i32 @__fast_masked_vload()
+
 declare i8* @ISPCMalloc(i64, i32) nounwind
 declare i8* @ISPCFree(i8*) nounwind
 declare void @ISPCLaunch(i8*, i8*) nounwind
@@ -1375,14 +1377,22 @@ define(`load_masked', `
 define <$1 x $2> @__load_masked_$3(i8 *, <$1 x i32> %mask) nounwind alwaysinline {
 entry:
   %mm = call i32 @__movmsk(<$1 x i32> %mask)
+  
   ; if the first lane and the last lane are on, then it is safe to do a vector load
   ; of the whole thing--what the lanes in the middle want turns out to not matter...
   %mm_and = and i32 %mm, eval(1 | (1<<($1-1)))
   %can_vload = icmp eq i32 %mm_and, eval(1 | (1<<($1-1)))
+
+  %mm_not_zero = icmp ne i32 %mm, 0
+  %fast32 = call i32 @__fast_masked_vload()
+  %fast_i1 = trunc i32 %fast32 to i1
+  %vload_fast = and i1 %mm_not_zero, %fast_i1
+  %can_vload_maybe_fast = or i1 %vload_fast, %can_vload
+
   ; if we are not able to do a singe vload, we will accumulate lanes in this memory..
   %retptr = alloca <$1 x $2>
   %retptr32 = bitcast <$1 x $2> * %retptr to $2 *
-  br i1 %can_vload, label %load, label %loop
+  br i1 %can_vload_maybe_fast, label %load, label %loop
 
 load: 
   %ptr = bitcast i8 * %0 to <$1 x $2> *
diff --git a/ispc.cpp b/ispc.cpp
index f0810c85..cf9e307f 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -255,6 +255,7 @@ Target::GetTargetMachine() const {
 Opt::Opt() {
     level = 1;
     fastMath = false;
+    fastMaskedVload = false;
     disableBlendedMaskedStores = false;
     disableCoherentControlFlow = false;
     disableUniformControlFlow = false;
diff --git a/ispc.h b/ispc.h
index ae249e70..8536732d 100644
--- a/ispc.h
+++ b/ispc.h
@@ -238,6 +238,12 @@ struct Opt {
         should be performed.  This is false by default. */
     bool fastMath;
 
+    /** Indicates whether a vector load should be issued for masked loads
+        on platforms that don't have a native masked vector load.  (This may
+        lead to accessing memory up to programCount-1 elements past the end of
+        arrays, so is unsafe in general.) */
+    bool fastMaskedVload;
+
     /** On targets that don't have a masked store instruction but do have a
         blending instruction, by default, we simulate masked stores by
         loading the old value, blending, and storing the result.  This can
diff --git a/main.cpp b/main.cpp
index 70fdc0db..9997f5ee 100644
--- a/main.cpp
+++ b/main.cpp
@@ -74,6 +74,7 @@ static void usage(int ret) {
     printf("    [--emit-llvm]\t\t\tEmit LLVM bitode file as output\n");
     printf("    [--emit-obj]\t\t\tGenerate object file file as output (default)\n");
     printf("    [--fast-math]\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
+    printf("    [--fast-masked-vload]\tFaster masked vector loads on SSE (may go past end of array)\n");
     printf("    [-g]\t\t\t\tGenerate debugging information\n");
     printf("    [--help]\t\t\t\tPrint help\n");
     printf("    [-h <name>/--header-outfile=<name>]\tOutput filename for header\n");
@@ -199,6 +200,8 @@ int main(int Argc, char *Argv[]) {
             cpu = argv[i] + 6;
         else if (!strcmp(argv[i], "--fast-math"))
             g->opt.fastMath = true;
+        else if (!strcmp(argv[i], "--fast-masked-vload"))
+            g->opt.fastMaskedVload = true;
         else if (!strcmp(argv[i], "--debug"))
             g->debugPrint = true;
         else if (!strcmp(argv[i], "--instrument"))