From f433aa3ad573560c573cf2b0a5165f4265af6e14 Mon Sep 17 00:00:00 2001
From: Evghenii
Date: Wed, 13 Nov 2013 10:43:52 +0100
Subject: [PATCH] CDP (CUDA Dynamic Parallelism) works now

---
 builtins/target-nvptx64.ll      | 82 +++++++++++++++++++------------
 builtins/util.m4                |  4 --
 ctx.cpp                         | 86 +++++++++++++++++++++++++++++++--
 examples_cuda/aobench/ao_cu.cpp |  2 +-
 4 files changed, 136 insertions(+), 38 deletions(-)

diff --git a/builtins/target-nvptx64.ll b/builtins/target-nvptx64.ll
index 7479a174..2f2f004a 100644
--- a/builtins/target-nvptx64.ll
+++ b/builtins/target-nvptx64.ll
@@ -100,6 +100,34 @@ define i32 @__lanemask_lt() nounwind readnone alwaysinline
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; tasking
+define i8* @ISPCAlloc(i8**, i64, i32) nounwind alwaysinline
+{
+  ret i8* null
+}
+
+;; this call allocates the parameter buffer for a kernel launch
+declare i64 @cudaGetParameterBuffer(i64, i64) nounwind
+define i8* @ISPCGetParamBuffer(i8**, i64 %align, i64 %size) nounwind alwaysinline
+{
+entry:
+  %call = tail call i32 @__tid_x()
+  %call1 = tail call i32 @__warpsize()
+  %sub = add nsw i32 %call1, -1
+  %and = and i32 %sub, %call
+  %cmp = icmp eq i32 %and, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %ptri64tmp = call i64 @cudaGetParameterBuffer(i64 %align, i64 %size);
+  br label %if.end
+
+if.end:
+  %ptri64 = phi i64 [ %ptri64tmp, %if.then ], [ 0, %entry ]
+  %ptr = inttoptr i64 %ptri64 to i8*
+  ret i8* %ptr
+}
+
+;; this actually launches a kernel
 module asm "
   .extern .func (.param .b32 func_retval0) cudaLaunchDevice
   (
@@ -111,22 +139,17 @@ module asm "
     .param .b64 cudaLaunchDevice_param_5
   );
 "
-define i8* @ISPCAlloc(i8**, i64, i32) nounwind alwaysinline
-{
-  ret i8* null
-}
-declare i64 @cudaGetParameterBuffer(i64, i64) nounwind
-define void @ISPCLaunch(i8**, i8* %func_ptr, i8** %func_args, i32 %nargs, i32 %ntx, i32 %nty, i32 %ntz) nounwind alwaysinline
+define void @ISPCLaunch(i8**, i8* %func_ptr, i8* %func_args, i32 %ntx, i32 %nty, i32 %ntz) nounwind alwaysinline
 {
 entry:
-  %func_i64 = ptrtoint i8* %func_ptr to i64
-  %args_i64 = ptrtoint i8** %func_args to i64
+;; only 1 lane must launch the kernel !!!
+  %func_i64 = ptrtoint i8* %func_ptr to i64
+  %args_i64 = ptrtoint i8* %func_args to i64
+
+  ;; nbx = (%ntx-1)/(blocksize/warpsize) + 1 for blocksize=128 & warpsize=32
   %ntxm1 = add nsw i32 %ntx, -1
   %ntxm1d4 = sdiv i32 %ntxm1, 4
   %nbx = add nsw i32 %ntxm1d4, 1
-
-;; only 1 lane must launch the kernel !!!
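+;; For reference, in CUDA C terms the device-side launch below corresponds roughly to
+;;   if ((threadIdx.x & (warpSize - 1)) == 0)   // only lane 0 of the warp launches
+;;     cudaLaunchDevice(func, args, dim3(nbx, nty, ntz), dim3(128, 1, 1), 0, 0);
+;; with nbx = (ntx-1)/4 + 1 as computed above; the actual call is made through the
+;; inline PTX asm in the if.then block, so this is only a sketch of the intent.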
   %call = tail call i32 @__tid_x()
   %call1 = tail call i32 @__warpsize()
   %sub = add nsw i32 %call1, -1
@@ -135,31 +158,25 @@ entry:
   %cmp = icmp eq i32 %and, 0
   br i1 %cmp, label %if.then, label %if.end
 
-if.then:                                          ; preds = %entry
+if.then:
-  %param = call i64 @cudaGetParameterBuffer(i64 8, i64 24);
-  %ptr = inttoptr i64 %param to i8*;
-
-
-  %res_tmp = call i32 asm sideeffect "{
-      .reg .s32 %r<8>;
-      .reg .s64 %rd<3>;
+  %res_tmp = call i32 asm sideeffect "{
       .param .b64 param0;
-      st.param.b64 [param0+0], $1; //%rd0;
+      st.param.b64 [param0+0], $1;
       .param .b64 param1;
-      st.param.b64 [param1+0], $2; //%rd1;
+      st.param.b64 [param1+0], $2;
       .param .align 4 .b8 param2[12];
-      st.param.b32 [param2+0], $3; //%r0;
-      st.param.b32 [param2+4], $4; //%r1;
-      st.param.b32 [param2+8], $5; //%r2;
+      st.param.b32 [param2+0], $3;
+      st.param.b32 [param2+4], $4;
+      st.param.b32 [param2+8], $5;
      .param .align 4 .b8 param3[12];
-      st.param.b32 [param3+0], $6; //%r3;
-      st.param.b32 [param3+4], $7; //%r4;
-      st.param.b32 [param3+8], $8; //%r5;
+      st.param.b32 [param3+0], $6;
+      st.param.b32 [param3+4], $7;
+      st.param.b32 [param3+8], $8;
       .param .b32 param4;
-      st.param.b32 [param4+0], $9; //%r6;
+      st.param.b32 [param4+0], $9;
       .param .b64 param5;
-      st.param.b64 [param5+0], $10; //%rd2;
+      st.param.b64 [param5+0], $10;
       .param .b32 retval0;
       call.uni (retval0),
@@ -175,14 +192,19 @@ if.then:                                          ; preds = %entry
       ld.param.b32 $0, [retval0+0];
   }
   ",
-"=r, l,l, r,r,r, r,r,r, r,l"(i64 %func_i64,i64 %args_i64, i32 %nbx,i32 %nty,i32 %ntz, i32 128,i32 1,i32 1, i32 0,i64 0);
+"=r, l,l, r,r,r, r,r,r, r,l"(
+      i64 %func_i64,i64 %args_i64,
+      i32 %nbx,i32 %nty,i32 %ntz,
+      i32 128,i32 1,i32 1, i32 0,i64 0);
   br label %if.end
 
 if.end:                                           ; preds = %if.then, %entry
-  %res = phi i32 [ %res_tmp, %if.then ], [ undef, %entry ]
+;;  %res = phi i32 [ %res_tmp, %if.then ], [ undef, %entry ]
   ret void
 }
 
+
+;; this synchronizes with the launched kernels
 declare i32 @cudaDeviceSynchronize() nounwind
 define void @ISPCSync(i8*) nounwind alwaysinline
 {
diff --git a/builtins/util.m4 b/builtins/util.m4
index acf45a38..77d18719 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -1835,10 +1835,6 @@ declare void @ISPCLaunch(i8**, i8*, i8*, i32,i32,i32) nounwind
 declare void @ISPCSync(i8*) nounwind
 declare void @ISPCInstrument(i8*, i8*, i32, i64) nounwind
 
-declare i8* @CUDAAlloc(i8**, i64, i32) nounwind
-declare void @CUDALaunch(i8**, i8*,i8*,i8*, i8**, i32, i32, i32) nounwind
-declare void @CUDASync(i8*) nounwind
-
 declare i1 @__is_compile_time_constant_mask( %mask)
 declare i1 @__is_compile_time_constant_uniform_int32(i32)
 declare i1 @__is_compile_time_constant_varying_int32()
diff --git a/ctx.cpp b/ctx.cpp
index 55adb95b..15b4743a 100644
--- a/ctx.cpp
+++ b/ctx.cpp
@@ -3593,9 +3593,87 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
     args.push_back(launchCount[2]);
     return CallInst(flaunch, NULL, args, "");
   }
-  else /* isPTX == true */
+  else if(1) /* isPTX == true */
   {
-    //assert(0);  /* must only be called in export */
+    if (callee == NULL) {
+      AssertPos(currentPos, m->errorCount > 0);
+      return NULL;
+    }
+    launchedTasks = true;
+    AssertPos(currentPos, llvm::isa<llvm::Function>(callee));
+
+    std::vector<llvm::Type *> argTypes;
+    for (unsigned int i = 0; i < argVals.size(); i++)
+      argTypes.push_back(argVals[i]->getType());
+    llvm::Type *st = llvm::StructType::get(*g->ctx, argTypes);
+    llvm::StructType *argStructType = static_cast<llvm::StructType *>(st);
+    llvm::Value *structSize = g->target->SizeOf(argStructType, bblock);
+    if (structSize->getType() != LLVMTypes::Int64Type)
+      structSize = ZExtInst(structSize, LLVMTypes::Int64Type,
+                            "struct_size_to_64");
+#if 0
+    {
+      std::string str; llvm::raw_string_ostream rso(str); llvm::formatted_raw_ostream fos(rso);
+      structSize->print(fos);
+      fos.flush(); fprintf(stderr, ">>> %s\n", str.c_str());
+    }
+#endif
+    int align = 8;
+    llvm::Function *falloc = m->module->getFunction("ISPCGetParamBuffer");
+    AssertPos(currentPos, falloc != NULL);
+    std::vector<llvm::Value *> allocArgs;
+    allocArgs.push_back(launchGroupHandlePtr);
+    allocArgs.push_back(LLVMInt64(align));
+    allocArgs.push_back(structSize);
+    llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr");
+    llvm::Value *voidi64 = PtrToIntInst(voidmem, "args_i64");
+    llvm::BasicBlock* if_true  = CreateBasicBlock("if_true");
+    llvm::BasicBlock* if_false = CreateBasicBlock("if_false");
+//  llvm::BasicBlock* bblock_bak = bblock;
+
+    /* Check that the pointer returned by ISPCGetParamBuffer is not NULL.
+     * --------------
+     * This stands in for an explicit laneIdx check: ISPCGetParamBuffer returns a
+     * NULL pointer for every lane except laneIdx = 0. Note that if the underlying
+     * cudaGetParameterBuffer call fails, the pointer is NULL for laneIdx = 0 as
+     * well, so the arguments are simply not stored in that case.
+     * TODO: rewrite this code to make it less opaque.
+     */
+    llvm::Value* cmp1 = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, voidi64, LLVMInt64(0), "cmp1");
+    BranchInst(if_true, if_false, cmp1);
+
+    bblock = if_true;
+
+    // label_if_then block:
+    llvm::Type *pt = llvm::PointerType::getUnqual(st);
+    llvm::Value *argmem = BitCastInst(voidmem, pt);
+    for (unsigned int i = 0; i < argVals.size(); ++i)
+    {
+      llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg");
+      // don't need to do masked store here, I think
+      StoreInst(argVals[i], ptr);
+    }
+    BranchInst(if_false);
+
+    bblock = if_false;
+
+    llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
+    llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
+    AssertPos(currentPos, flaunch != NULL);
+    std::vector<llvm::Value *> args;
+    args.push_back(launchGroupHandlePtr);
+    args.push_back(fptr);
+    args.push_back(voidmem);
+    args.push_back(launchCount[0]);
+    args.push_back(launchCount[1]);
+    args.push_back(launchCount[2]);
+    return CallInst(flaunch, NULL, args, "");
+
+
+  }
+  else
+  {
+    assert(0); /* must only be called in export */
 //  assert(g->target->getISA() != Target::NVPTX64);
 
     if (callee == NULL) {
@@ -3643,7 +3721,7 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
     // the argument block
 
     /* allocate structure of pointer */
-    llvm::ArrayType* ArrayTy_6 = llvm::ArrayType::get(LLVMTypes::VoidPointerType, argVals.size());
+    llvm::ArrayType* ArrayTy_6 = llvm::ArrayType::get(LLVMTypes::VoidPointerType, argVals.size()*2);
     llvm::Value* ptrParam = AllocaInst(ArrayTy_6, "arrayStructPtr");
 
     /* constructed array of pointers to arguments
@@ -3786,6 +3864,8 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
     args.push_back(ptr_arraydecay);  /* const void ** params */
   }
 
+  llvm::ConstantInt* const_int64_10 = llvm::ConstantInt::get(*g->ctx, llvm::APInt(32, argVals.size()*2));
+  args.push_back(const_int64_10);
   args.push_back(launchCount[0]);
   args.push_back(launchCount[1]);
   args.push_back(launchCount[2]);
diff --git a/examples_cuda/aobench/ao_cu.cpp b/examples_cuda/aobench/ao_cu.cpp
index 85eb4cb5..75599078 100755
--- a/examples_cuda/aobench/ao_cu.cpp
+++ b/examples_cuda/aobench/ao_cu.cpp
@@ -163,7 +163,7 @@ CUmodule loadModule(const char * module)
   CUjit_option options[nOptions];
   void* optionVals[nOptions];
   float walltime;
-  const unsigned int logSize = 8192;
+  const unsigned int logSize = 32768;
   char error_log[logSize], info_log[logSize];
   void *cuOut;
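
For reference, the NVPTX launch path added above behaves roughly like the CUDA C
sketch below. It is only a sketch: TaskArgs and ispc_launch_sketch are illustrative
names (the real argument struct is built per call site from argVals, and the calls
go through ISPCGetParamBuffer/ISPCLaunch in the generated IR), and it assumes the
CUDA device runtime (cudaGetParameterBuffer/cudaLaunchDevice, sm_35 or newer,
compiled with -rdc=true and linked against cudadevrt).

    #include <cuda_runtime.h>

    struct TaskArgs { int x0, x1; float data; };  // placeholder: one field per task argument

    __device__ void ispc_launch_sketch(void *func, const TaskArgs &src,
                                       int ntx, int nty, int ntz)
    {
        // ISPCGetParamBuffer: only lane 0 of the warp requests a parameter buffer;
        // all other lanes receive NULL.
        void *buf = NULL;
        if ((threadIdx.x & (warpSize - 1)) == 0)
            buf = cudaGetParameterBuffer(8, sizeof(TaskArgs));

        // LaunchInst(): store the task arguments into the buffer only if one was
        // obtained (the NULL check added in ctx.cpp).
        if (buf != NULL)
            *static_cast<TaskArgs *>(buf) = src;

        // ISPCLaunch: again only lane 0 launches; grid.x = (ntx - 1)/4 + 1 blocks of
        // 128 threads (4 warps per block), matching the inline PTX call to
        // cudaLaunchDevice in target-nvptx64.ll.
        if ((threadIdx.x & (warpSize - 1)) == 0)
            cudaLaunchDevice(func, buf,
                             dim3((ntx - 1) / 4 + 1, nty, ntz), dim3(128, 1, 1),
                             /*sharedMemSize=*/0, /*stream=*/0);
    }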