From 95950885cf519ff46dcb0a93531f7e7e8427f89e Mon Sep 17 00:00:00 2001
From: Dmitry Babokin
Date: Fri, 26 Apr 2013 20:33:24 +0400
Subject: [PATCH] Use posix_memalign to allocate 16 byte aligned memory on Linux/MacOS.

---
 Makefile         |  2 +-
 builtins.cpp     |  8 ++++---
 builtins/util.m4 | 59 ++++++++++++++++++++++++++++++++++++++++++++----
 expr.cpp         | 18 +++++++++++----
 ispc.cpp         |  3 +++
 main.cpp         |  1 +
 6 files changed, 77 insertions(+), 14 deletions(-)

diff --git a/Makefile b/Makefile
index 34055496..69468576 100644
--- a/Makefile
+++ b/Makefile
@@ -182,7 +182,7 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
 
 objs/builtins-%.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll)
 	@echo Creating C++ source from builtins definition file $<
-	@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) $< | python bitcode2cpp.py $< > $@
+	@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX $< | python bitcode2cpp.py $< > $@
 
 objs/builtins-c-32.cpp: builtins/builtins.c
 	@echo Creating C++ source from builtins definition file $<
diff --git a/builtins.cpp b/builtins.cpp
index 53cab1f6..e5745372 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -477,9 +477,11 @@ lSetInternalFunctions(llvm::Module *module) {
         "__min_varying_uint32",
         "__min_varying_uint64",
         "__movmsk",
-        "__new_uniform",
-        "__new_varying32",
-        "__new_varying64",
+        "__new_uniform_32rt",
+        "__new_uniform_64rt",
+        "__new_varying32_32rt",
+        "__new_varying32_64rt",
+        "__new_varying64_64rt",
         "__none",
         "__num_cores",
         "__packed_load_active",
diff --git a/builtins/util.m4 b/builtins/util.m4
index 87a1fd68..ac9ba2d0 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -2536,15 +2536,59 @@ ok:
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; new/delete
 
-declare noalias i8 * @malloc(i64)
-declare void @free(i8 *)
+;; Set of functions for 32 bit runtime
 
-define noalias i8 * @__new_uniform(i64 %size) {
+ifelse(BUILD_OS, `UNIX',
+`
+
+;; posix_memalign is for 32 bit runtime
+declare i32 @posix_memalign(i8**, i32, i32)
+
+define noalias i8 * @__new_uniform_32rt(i64 %size) {
+  %ptr = alloca i8*
+  %conv = trunc i64 %size to i32
+  %call1 = call i32 @posix_memalign(i8** %ptr, i32 16, i32 %conv)
+  %ptr_val = load i8** %ptr
+  ret i8* %ptr_val
+}
+
+define <WIDTH x i64> @__new_varying32_32rt(<WIDTH x i32> %size, <WIDTH x MASK> %mask) {
+  %ret = alloca <WIDTH x i64>
+  store <WIDTH x i64> zeroinitializer, <WIDTH x i64> * %ret
+  %ret64 = bitcast <WIDTH x i64> * %ret to i64 *
+
+  per_lane(WIDTH, <WIDTH x MASK> %mask, `
+  %sz_LANE_ID = extractelement <WIDTH x i32> %size, i32 LANE
+  %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE
+  %ptr_LANE_ID = bitcast i64* %store_LANE_ID to i8**
+  %call_LANE_ID = call i32 @posix_memalign(i8** %ptr_LANE_ID, i32 16, i32 %sz_LANE_ID)')
+
+  %r = load <WIDTH x i64> * %ret
+  ret <WIDTH x i64> %r
+}
+
+',
+BUILD_OS, `WINDOWS',
+`
+;; Windows version TBD
+',
+`
+errprint(`BUILD_OS should be defined to either UNIX or WINDOWS
+')
+m4exit(`1')
+')
+
+;; Set of functions for 64 bit runtime
+
+;; malloc is for 64 bit runtime
+declare noalias i8 * @malloc(i64)
+
+define noalias i8 * @__new_uniform_64rt(i64 %size) {
   %a = call noalias i8 * @malloc(i64 %size)
   ret i8 * %a
 }
 
-define <WIDTH x i64> @__new_varying32(<WIDTH x i32> %size, <WIDTH x MASK> %mask) {
+define <WIDTH x i64> @__new_varying32_64rt(<WIDTH x i32> %size, <WIDTH x MASK> %mask) {
   %ret = alloca <WIDTH x i64>
   store <WIDTH x i64> zeroinitializer, <WIDTH x i64> * %ret
   %ret64 = bitcast <WIDTH x i64> * %ret to i64 *
@@ -2561,7 +2605,7 @@ define <WIDTH x i64> @__new_varying32(<WIDTH x i32> %size, <WIDTH x MASK> %mask)
   ret <WIDTH x i64> %r
 }
 
-define <WIDTH x i64> @__new_varying64(<WIDTH x i64> %size, <WIDTH x MASK> %mask) {
+define <WIDTH x i64> @__new_varying64_64rt(<WIDTH x i64> %size, <WIDTH x MASK> %mask) {
   %ret = alloca <WIDTH x i64>
   store <WIDTH x i64> zeroinitializer, <WIDTH x i64> * %ret
   %ret64 = bitcast <WIDTH x i64> * %ret to i64 *
@@ -2577,6 +2621,11 @@ define <WIDTH x i64> @__new_varying64(<WIDTH x i64> %size, <WIDTH x MASK> %mask)
   ret <WIDTH x i64> %r
 }
 
+;; Functions for both 32 and 64 bit runtimes.
+
+;; free works fine with both 32 and 64 bit runtimes
+declare void @free(i8 *)
+
 define void @__delete_uniform(i8 * %ptr) {
   call void @free(i8 * %ptr)
   ret void
diff --git a/expr.cpp b/expr.cpp
index 7808d2af..27f45299 100644
--- a/expr.cpp
+++ b/expr.cpp
@@ -8214,16 +8214,24 @@ NewExpr::GetValue(FunctionEmitContext *ctx) const {
     // varying, and taking 32-bit or 64-bit allocation counts.
     llvm::Function *func;
    if (isVarying) {
-        if (do32Bit)
-            func = m->module->getFunction("__new_varying32");
-        else
-            func = m->module->getFunction("__new_varying64");
+        if (g->target->is32Bit()) {
+            func = m->module->getFunction("__new_varying32_32rt");
+        } else if (g->opt.force32BitAddressing) {
+            func = m->module->getFunction("__new_varying32_64rt");
+        } else {
+            func = m->module->getFunction("__new_varying64_64rt");
+        }
     }
     else {
+        // FIXME: __new_uniform_32rt should take i32
         if (allocSize->getType() != LLVMTypes::Int64Type)
             allocSize = ctx->SExtInst(allocSize, LLVMTypes::Int64Type,
                                       "alloc_size64");
-        func = m->module->getFunction("__new_uniform");
+        if (g->target->is32Bit()) {
+            func = m->module->getFunction("__new_uniform_32rt");
+        } else {
+            func = m->module->getFunction("__new_uniform_64rt");
+        }
     }
 
     AssertPos(pos, func != NULL);
diff --git a/ispc.cpp b/ispc.cpp
index 63b66c9c..daa3f5a8 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -477,6 +477,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
 #endif
 
     // Set is32Bit
+    // This indicates if we are compiling for a 32 bit platform
+    // and can assume a 32 bit runtime.
+    // FIXME: all generic targets are handled as 64 bit, which is incorrect.
     this->m_is32Bit = (getDataLayout()->getPointerSize() == 4);
 
 #if !defined(LLVM_3_1) && !defined(LLVM_3_2)
diff --git a/main.cpp b/main.cpp
index 4a970ac8..45d5f1ee 100644
--- a/main.cpp
+++ b/main.cpp
@@ -272,6 +272,7 @@ int main(int Argc, char *Argv[]) {
             g->cppArgs.push_back(argv[i]);
        else if (!strncmp(argv[i], "--addressing=", 13)) {
             if (atoi(argv[i] + 13) == 64)
+                // FIXME: this doesn't make sense on 32 bit platforms.
                 g->opt.force32BitAddressing = false;
             else if (atoi(argv[i] + 13) == 32)
                 g->opt.force32BitAddressing = true;
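
For reference, below is a minimal, self-contained C++ sketch (not part of the patch; the helper name aligned_new is purely illustrative) of the allocation pattern the new __new_uniform_32rt built-in lowers to on Linux/MacOS: request a 16 byte aligned block through posix_memalign, then release it later with plain free, just as the unchanged __delete_* built-ins do. It assumes a POSIX system where posix_memalign is available.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>   /* posix_memalign and free (POSIX systems) */

/* Illustrative stand-in for what __new_uniform_32rt does in IR:
   ask posix_memalign for a 16 byte aligned block and return the raw
   pointer, or NULL if the allocation fails. */
static void *aligned_new(size_t size) {
    void *ptr = NULL;
    /* The alignment argument must be a power of two and a multiple of
       sizeof(void *); 16 satisfies both on 32 and 64 bit targets. */
    if (posix_memalign(&ptr, 16, size) != 0)
        return NULL;
    return ptr;
}

int main() {
    void *p = aligned_new(1024);
    if (p != NULL) {
        printf("16 byte aligned: %s\n",
               ((uintptr_t)p % 16 == 0) ? "yes" : "no");
        free(p);   /* posix_memalign'd memory is released with ordinary free() */
    }
    return 0;
}

Note that free() reclaims memory obtained from either malloc or posix_memalign, which is why the patch keeps a single @free declaration shared by the 32 and 64 bit runtimes.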