From 05a5a42a080e98ec52c94221f2e0c5750cea5abd Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt@pharr.org>
Date: Fri, 14 Sep 2012 12:17:25 -0700
Subject: [PATCH 1/4] Don't force loads/stores from varying types to be
 unaligned.

Values of varying types should always be aligned in memory, so there is
no need to explicitly mark their loads and stores as unaligned (align=1).
---
 ctx.cpp | 23 ++---------------------
 1 file changed, 2 insertions(+), 21 deletions(-)

diff --git a/ctx.cpp b/ctx.cpp
index fec38065..a066679b 100644
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -2397,16 +2397,7 @@ FunctionEmitContext::LoadInst(llvm::Value *ptr, const char *name) {
     if (name == NULL)
         name = LLVMGetName(ptr, "_load");
 
-    // FIXME: it's not clear to me that we generate unaligned vector loads
-    // of varying stuff out of the front-end any more.  (Only by the
-    // optimization passes that lower gathers to vector loads, I think..)
-    // So remove this??
-    int align = 0;
-    if (llvm::isa<llvm::VectorType>(pt->getElementType()))
-        align = 1;
-    llvm::Instruction *inst = new llvm::LoadInst(ptr, name, 
-                                                 false /* not volatile */,
-                                                 align, bblock);
+    llvm::Instruction *inst = new llvm::LoadInst(ptr, name, bblock);
     AddDebugPos(inst);
     return inst;
 }
@@ -2958,17 +2949,7 @@ FunctionEmitContext::StoreInst(llvm::Value *value, llvm::Value *ptr) {
         return;
     }
 
-    llvm::Instruction *inst;
-    if (llvm::isa<llvm::VectorType>(value->getType()))
-        // FIXME: same for load--do we still need/want this??
-        // Specify an unaligned store, since we don't know that the ptr
-        // will in fact be aligned to a vector width here.  (Actually
-        // should be aligned to the alignment of the vector elment type...)
-        inst = new llvm::StoreInst(value, ptr, false /* not volatile */,
-                                   1, bblock);
-    else
-        inst = new llvm::StoreInst(value, ptr, bblock);
-
+    llvm::Instruction *inst = new llvm::StoreInst(value, ptr, bblock);
     AddDebugPos(inst);
 }
 

From 59b0a2b208ffe6201e8c7a81e0b120290ed7becc Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Fri, 14 Sep 2012 13:32:42 -0700
Subject: [PATCH 2/4] Mark __any(), __all(), and __none() as internal after
 they're linked in.

This fixes multiple symbol definition errors when compiling a single binary
for multiple ISA targets.
---
 builtins.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/builtins.cpp b/builtins.cpp
index e75f2107..81a9a64b 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -322,6 +322,8 @@ lSetInternalFunctions(llvm::Module *module) {
         "__add_varying_double",
         "__add_varying_int32",
         "__add_varying_int64",
+        "__all",
+        "__any",
         "__aos_to_soa3_float",
         "__aos_to_soa3_float16",
         "__aos_to_soa3_float4",
@@ -465,6 +467,7 @@ lSetInternalFunctions(llvm::Module *module) {
         "__new_uniform",
         "__new_varying32",
         "__new_varying64",
+        "__none",
         "__num_cores",
         "__packed_load_active",
         "__packed_store_active",

From be2108260ea66d7a0e64a43d83ebf8444a3b18c7 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Fri, 14 Sep 2012 13:49:45 -0700
Subject: [PATCH 3/4] Add --opt=force-aligned-memory option.

This forces all vector loads/stores to be done assuming that the given
pointer is aligned to the vector size, thus allowing the use of sometimes
more-efficient instructions.  (If the memory is not actually aligned, the
program will fail!)
---
 ispc.cpp        |  1 +
 ispc.h          |  6 ++++++
 main.cpp        |  3 +++
 module.cpp      |  2 ++
 opt.cpp         | 17 +++++++++++++----
 test_static.cpp | 14 ++++++++++----
 6 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/ispc.cpp b/ispc.cpp
index 29801359..0aae5e90 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -640,6 +640,7 @@ Opt::Opt() {
     unrollLoops = true;
     disableAsserts = false;
     disableFMA = false;
+    forceAlignedMemory = false;
     disableMaskAllOnOptimizations = false;
     disableHandlePseudoMemoryOps = false;
     disableBlendedMaskedStores = false;
diff --git a/ispc.h b/ispc.h
index e376df46..045916ab 100644
--- a/ispc.h
+++ b/ispc.h
@@ -311,6 +311,12 @@ struct Opt {
         that support them). */
     bool disableFMA;
 
+    /** Always generate aligned vector load/store instructions; this
+        implies a guarantee that every dynamic access through a pointer
+        that becomes a vector load/store refers to a sequence of locations
+        aligned to the vector size. */
+    bool forceAlignedMemory;
+
     /** If enabled, disables the various optimizations that kick in when
         the execution mask can be determined to be "all on" at compile
         time. */
diff --git a/main.cpp b/main.cpp
index 63c4d572..8076456f 100644
--- a/main.cpp
+++ b/main.cpp
@@ -119,6 +119,7 @@ usage(int ret) {
     printf("        disable-loop-unroll\t\tDisable loop unrolling.\n");
     printf("        fast-masked-vload\t\tFaster masked vector loads on SSE (may go past end of array)\n");
     printf("        fast-math\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n");
+    printf("        force-aligned-memory\t\tAlways issue \"aligned\" vector load and store instructions\n");
 #ifndef ISPC_IS_WINDOWS
     printf("    [--pic]\t\t\t\tGenerate position-independent code\n");
 #endif // !ISPC_IS_WINDOWS
@@ -336,6 +337,8 @@ int main(int Argc, char *Argv[]) {
                 g->opt.unrollLoops = false;
             else if (!strcmp(opt, "disable-fma"))
                 g->opt.disableFMA = true;
+            else if (!strcmp(opt, "force-aligned-memory"))
+                g->opt.forceAlignedMemory = true;
 
             // These are only used for performance tests of specific
             // optimizations
diff --git a/module.cpp b/module.cpp
index 365653ce..9ba7cea9 100644
--- a/module.cpp
+++ b/module.cpp
@@ -1783,6 +1783,8 @@ Module::execPreprocessor(const char* infilename, llvm::raw_string_ostream* ostre
         opts.addMacroDef("ISPC_TARGET_HAS_HALF");
     if (g->target.hasTranscendentals)
         opts.addMacroDef("ISPC_TARGET_HAS_TRANSCENDENTALS");
+    if (g->opt.forceAlignedMemory)
+        opts.addMacroDef("ISPC_FORCE_ALIGNED_MEMORY");
 
     opts.addMacroDef("ISPC_MAJOR_VERSION=1");
     opts.addMacroDef("ISPC_MINOR_VERSION=3");
diff --git a/opt.cpp b/opt.cpp
index a623466b..e2f38d8d 100644
--- a/opt.cpp
+++ b/opt.cpp
@@ -791,7 +791,11 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
                                               llvm::PointerType::get(returnType, 0), 
                                               name, callInst);
                     lCopyMetadata(castPtr, callInst);
-                    int align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
+                    int align;
+                    if (g->opt.forceAlignedMemory)
+                        align = 0;
+                    else
+                        align = callInst->getCalledFunction() == avxMaskedLoad32 ? 4 : 8;
                     name = LLVMGetName(callInst->getArgOperand(0), "_load");
                     llvm::Instruction *loadInst = 
                         new llvm::LoadInst(castPtr, name, false /* not volatile */,
@@ -829,7 +833,11 @@ IntrinsicsOpt::runOnBasicBlock(llvm::BasicBlock &bb) {
 
                     llvm::StoreInst *storeInst = 
                         new llvm::StoreInst(rvalue, castPtr, (llvm::Instruction *)NULL);
-                    int align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
+                    int align;
+                    if (g->opt.forceAlignedMemory)
+                        align = 0;
+                    else
+                        align = callInst->getCalledFunction() == avxMaskedStore32 ? 4 : 8;
                     storeInst->setAlignment(align);
                     lCopyMetadata(storeInst, callInst);
                     llvm::ReplaceInstWithInst(callInst, storeInst);
@@ -2553,7 +2561,7 @@ lImproveMaskedStore(llvm::CallInst *callInst) {
         lCopyMetadata(lvalue, callInst);
         llvm::Instruction *store = 
             new llvm::StoreInst(rvalue, lvalue, false /* not volatile */,
-                                info->align);
+                                g->opt.forceAlignedMemory ? 0 : info->align);
         lCopyMetadata(store, callInst);
         llvm::ReplaceInstWithInst(callInst, store);
         return true;
@@ -2616,7 +2624,8 @@ lImproveMaskedLoad(llvm::CallInst *callInst,
                                     callInst);
         llvm::Instruction *load = 
             new llvm::LoadInst(ptr, callInst->getName(), false /* not volatile */,
-                               info->align, (llvm::Instruction *)NULL);
+                               g->opt.forceAlignedMemory ? 0 : info->align,
+                               (llvm::Instruction *)NULL);
         lCopyMetadata(load, callInst);
         llvm::ReplaceInstWithInst(callInst, load);
         return true;
diff --git a/test_static.cpp b/test_static.cpp
index e798f960..ec91960e 100644
--- a/test_static.cpp
+++ b/test_static.cpp
@@ -99,15 +99,21 @@ void *ISPCAlloc(void **handle, int64_t size, int32_t alignment) {
 }
 
 
+#if defined(_WIN32) || defined(_WIN64)
+#define ALIGN
+#else
+#define ALIGN __attribute__((aligned(64)))
+#endif
 
 int main(int argc, char *argv[]) {
     int w = width();
     assert(w <= 64);
 
-    float returned_result[64];
-    float vfloat[64];
-    double vdouble[64];
-    int vint[64], vint2[64];
+    float returned_result[64] ALIGN;
+    float vfloat[64] ALIGN;
+    double vdouble[64] ALIGN;
+    int vint[64] ALIGN;
+    int vint2[64] ALIGN;
 
     for (int i = 0; i < 64; ++i) {
         returned_result[i] = -1e20;

From a13e7f24358be3cd661b7f4f81e85c69730fd2b8 Mon Sep 17 00:00:00 2001
From: Matt Pharr <matt.pharr@intel.com>
Date: Fri, 14 Sep 2012 13:53:12 -0700
Subject: [PATCH 4/4] #define ISPC_FORCE_ALIGNED_MEMORY, if appropriate, in C++
 output.

---
 cbackend.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cbackend.cpp b/cbackend.cpp
index cb6236bc..b49e1d10 100644
--- a/cbackend.cpp
+++ b/cbackend.cpp
@@ -2203,6 +2203,10 @@ bool CWriter::doInitialization(llvm::Module &M) {
       Out << "#undef ISPC_FAST_MATH\n";
   }
 
+  if (g->opt.forceAlignedMemory) {
+      Out << "#define ISPC_FORCE_ALIGNED_MEMORY\n";
+  }
+
   Out << "#include \"" << includeName << "\"\n";
 
   Out << "\n/* Basic Library Function Declarations */\n";