From 1d91a626f267bc12656a64cac861128cfde7a5a0 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Tue, 12 Nov 2013 17:02:31 +0100 Subject: [PATCH] ISPC sync is not added --- builtins/target-nvptx64.ll | 30 ++++++++++++++++++++++++------ examples_cuda/aobench/ao1.ispc | 9 +++++++-- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/builtins/target-nvptx64.ll b/builtins/target-nvptx64.ll index e60c8f05..cdde3532 100644 --- a/builtins/target-nvptx64.ll +++ b/builtins/target-nvptx64.ll @@ -24,8 +24,9 @@ define i32 @__tid_x() nounwind readnone alwaysinline } define i32 @__warpsize() nounwind readnone alwaysinline { - %tid = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() - ret i32 %tid +;; %tid = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +;; ret i32 %tid + ret i32 32 } @@ -116,14 +117,26 @@ define i8* @ISPCAlloc(i8**, i64, i32) nounwind alwaysinline } define void @ISPCLaunch(i8**, i8* %func_ptr, i8** %func_args, i32 %ntx, i32 %nty, i32 %ntz) nounwind alwaysinline { +entry: %func_i64 = ptrtoint i8* %func_ptr to i64 %args_i64 = ptrtoint i8** %func_args to i64 ;; nbx = (%ntx-1)/(blocksize/warpsize) + 1 for blocksize=128 & warpsize=32 - %sub = add nsw i32 %ntx, -1 - %div = sdiv i32 %sub, 4 - %nbx = add nsw i32 %div, 1 + %ntxm1 = add nsw i32 %ntx, -1 + %ntxm1d4 = sdiv i32 %ntxm1, 4 + %nbx = add nsw i32 %ntxm1d4, 1 - %res = call i32 asm sideeffect "{ +;; only 1 lane must launch the kernel !!! + %call = tail call i32 @__tid_x() + %call1 = tail call i32 @__warpsize() + %sub = add nsw i32 %call1, -1 + %and = and i32 %sub, %call +;; if (laneIdx == 0) + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + + %res_tmp = call i32 asm sideeffect "{ .reg .s32 %r<8>; .reg .s64 %rd<3>; .param .b64 param0; @@ -158,6 +171,11 @@ define void @ISPCLaunch(i8**, i8* %func_ptr, i8** %func_args, i32 %ntx, i32 %nty } ", "=r, l,l, r,r,r, r,r,r, r,l"(i64 %func_i64,i64 %args_i64, i32 %nbx,i32 %nty,i32 %ntz, i32 128,i32 1,i32 1, i32 0,i64 0); + br label %if.end + +if.end: ; preds = %if.then, %entry + %res = phi i32 [ %res_tmp, %if.then ], [ undef, %entry ] + ret void } declare i32 @cudaDeviceSynchronize() nounwind diff --git a/examples_cuda/aobench/ao1.ispc b/examples_cuda/aobench/ao1.ispc index bd46de06..a6ed8c85 100644 --- a/examples_cuda/aobench/ao1.ispc +++ b/examples_cuda/aobench/ao1.ispc @@ -286,7 +286,11 @@ static inline void ao_tile( #define TILEX 64 #define TILEY 4 -/* task will generate " __global__ " only */ +/* unless task/export is specified all functions + * are generated as mangled "__device__" functions + */ + +/* task will generate mangled "__global__" function only */ void task ao_task(uniform int width, uniform int height, uniform int nsubsamples, uniform float image[]) { @@ -302,11 +306,12 @@ void task ao_task(uniform int width, uniform int height, } -/* export will generate "extern "C" __global__" an "__device__" */ +/* export will generate unmangled "extern "C" __global__" and mangled "__device__" */ export void ao_ispc_tasks(uniform int w, uniform int h, uniform int nsubsamples, uniform float image[]) { const uniform int ntilex = (w+TILEX-1)/TILEX; const uniform int ntiley = (h+TILEY-1)/TILEY; launch[ntilex,ntiley] ao_task(w, h, nsubsamples, image); + sync; }