From 3bf3ac792216217ff3c0fbe93e54b87927a992ff Mon Sep 17 00:00:00 2001
From: Matt Pharr
Date: Tue, 17 Jan 2012 18:28:55 -0800
Subject: [PATCH] Be more conservative about using blending in place of masked
 store.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

More specifically, we do a proper masked store (rather than a
load-blend-store) unless we can determine that we're accessing a
stack-allocated "varying" variable.

This fixes a number of nefarious bugs where, given code like:

    uniform float a[21];
    foreach (i = 0 ... 21)
        a[i] = 0;

we'd use a blend and in turn read past the end of a[] in the last
iteration.

Also made slight changes to inlining in aobench; these keep compile
time to ~5s, versus ~45s without them (with this change).

Fixes issue #160.
---
 examples/aobench/ao.ispc |  4 ++--
 opt.cpp                  | 34 ++++++++++++++++++++++++----------
 2 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/examples/aobench/ao.ispc b/examples/aobench/ao.ispc
index 589a98b1..ffd85d29 100644
--- a/examples/aobench/ao.ispc
+++ b/examples/aobench/ao.ispc
@@ -82,7 +82,7 @@ static inline void vnormalize(vec &v) {
 }
 
 
-static inline void
+static void
 ray_plane_intersect(Isect &isect, Ray &ray, Plane &plane) {
     float d = -dot(plane.p, plane.n);
     float v = dot(ray.dir, plane.n);
@@ -147,7 +147,7 @@ orthoBasis(vec basis[3], vec n) {
 }
 
 
-static inline float
+static float
 ambient_occlusion(Isect &isect, Plane &plane, Sphere spheres[3],
                   RNGState &rngstate) {
     float eps = 0.0001f;
diff --git a/opt.cpp b/opt.cpp
index ebd6b8e0..b1f5c7a0 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -1674,18 +1674,32 @@ llvm::RegisterPass<LowerMaskedStorePass> lms("masked-store-lower",
     comes from an AllocaInst. */
 static bool
-lIsStackVariablePointer(llvm::Value *lvalue) {
+lIsSafeToBlend(llvm::Value *lvalue) {
     llvm::BitCastInst *bc = llvm::dyn_cast<llvm::BitCastInst>(lvalue);
-    if (bc)
-        return lIsStackVariablePointer(bc->getOperand(0));
+    if (bc != NULL)
+        return lIsSafeToBlend(bc->getOperand(0));
     else {
         llvm::AllocaInst *ai = llvm::dyn_cast<llvm::AllocaInst>(lvalue);
-        if (ai)
-            return true;
+        if (ai) {
+            LLVM_TYPE_CONST llvm::Type *type = ai->getType();
+            LLVM_TYPE_CONST llvm::PointerType *pt =
+                llvm::dyn_cast<LLVM_TYPE_CONST llvm::PointerType>(type);
+            assert(pt != NULL);
+            type = pt->getElementType();
+            LLVM_TYPE_CONST llvm::ArrayType *at;
+            while ((at = llvm::dyn_cast<LLVM_TYPE_CONST llvm::ArrayType>(type))) {
+                type = at->getElementType();
+            }
+            LLVM_TYPE_CONST llvm::VectorType *vt =
+                llvm::dyn_cast<LLVM_TYPE_CONST llvm::VectorType>(type);
+            return (vt != NULL &&
+                    (int)vt->getNumElements() == g->target.vectorWidth);
+        }
         else {
-            llvm::GetElementPtrInst *gep = llvm::dyn_cast<llvm::GetElementPtrInst>(lvalue);
-            if (gep)
-                return lIsStackVariablePointer(gep->getOperand(0));
+            llvm::GetElementPtrInst *gep =
+                llvm::dyn_cast<llvm::GetElementPtrInst>(lvalue);
+            if (gep != NULL)
+                return lIsSafeToBlend(gep->getOperand(0));
             else
                 return false;
         }
@@ -1747,8 +1761,8 @@ LowerMaskedStorePass::runOnBasicBlock(llvm::BasicBlock &bb) {
         // or serializing the masked store. Even on targets with a native
         // masked store instruction, this is preferable since it lets us
         // keep values in registers rather than going out to the stack.
-        bool doBlend = (!g->opt.disableBlendedMaskedStores ||
-                        lIsStackVariablePointer(lvalue));
+        bool doBlend = (!g->opt.disableBlendedMaskedStores &&
+                        lIsSafeToBlend(lvalue));
 
         // Generate the call to the appropriate masked store function and
         // replace the __pseudo_* one with it.
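
Note: to make the fix concrete, here is a minimal C++ sketch of the two
store strategies the pass chooses between. It is an illustrative model,
not ispc's actual codegen; WIDTH, blendStore, and maskedStore are
hypothetical names:

    #include <cstring>

    enum { WIDTH = 4 };  // stand-in for the gang/vector width

    // load-blend-store: always reads and writes WIDTH contiguous
    // elements. With ptr == &a[20] and uniform float a[21], the
    // full-vector load and store touch a[20..23], past the end of a[].
    void blendStore(float *ptr, const float val[WIDTH],
                    const bool mask[WIDTH]) {
        float old[WIDTH];
        std::memcpy(old, ptr, sizeof(old));      // full-vector load
        for (int i = 0; i < WIDTH; ++i)
            old[i] = mask[i] ? val[i] : old[i];  // per-lane blend
        std::memcpy(ptr, old, sizeof(old));      // full-vector store
    }

    // true masked store: touches only the lanes whose mask bit is set,
    // so a partially-active final iteration never reads or writes past
    // the end of the array.
    void maskedStore(float *ptr, const float val[WIDTH],
                     const bool mask[WIDTH]) {
        for (int i = 0; i < WIDTH; ++i)
            if (mask[i])
                ptr[i] = val[i];
    }

Blending remains safe when lIsSafeToBlend() succeeds: the pointer then
provably leads back to an alloca whose innermost element type is a full
gang-width vector, so even a partially-masked full-vector load/store
stays inside that variable's own stack storage.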