diff --git a/builtins/target-nvptx64.ll b/builtins/target-nvptx64.ll index fa719ba9..d43e5d4a 100644 --- a/builtins/target-nvptx64.ll +++ b/builtins/target-nvptx64.ll @@ -61,32 +61,32 @@ define i32 @__nctaid_z() nounwind readnone alwaysinline %nb = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() ret i32 %nb } -define i32 @__shfl_i32(i32, i32) nounwind readnone alwaysinline +define i32 @__shfl_i32_nvptx(i32, i32) nounwind readnone alwaysinline { %shfl = tail call i32 asm sideeffect "shfl.idx.b32 $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1) nounwind readnone alwaysinline ret i32 %shfl } -define float @__shfl_xor_float(float, i32) nounwind readnone alwaysinline +define float @__shfl_xor_float_nvptx(float, i32) nounwind readnone alwaysinline { %shfl = tail call float asm sideeffect "shfl.bfly.b32 $0, $1, $2, 0x1f;", "=f,f,r"(float %0, i32 %1) nounwind readnone alwaysinline ret float %shfl } -define i32 @__shfl_xor_i32(i32, i32) nounwind readnone alwaysinline +define i32 @__shfl_xor_i32_nvptx(i32, i32) nounwind readnone alwaysinline { %shfl = tail call i32 asm sideeffect "shfl.bfly.b32 $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1) nounwind readnone alwaysinline ret i32 %shfl } -define float @__fminf(float,float) nounwind readnone alwaysinline +define float @__fminf_nvptx(float,float) nounwind readnone alwaysinline { %min = tail call float asm sideeffect "min.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1) nounwind readnone alwaysinline ret float %min } -define float @__fmaxf(float,float) nounwind readnone alwaysinline +define float @__fmaxf_nvptx(float,float) nounwind readnone alwaysinline { %max = tail call float asm sideeffect "max.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1) nounwind readnone alwaysinline ret float %max } -define i32 @__ballot(i1) nounwind readnone alwaysinline +define i32 @__ballot_nvptx(i1) nounwind readnone alwaysinline { %conv = zext i1 %0 to i32 %res = tail call i32 asm sideeffect @@ -96,7 +96,7 @@ define i32 @__ballot(i1) nounwind readnone alwaysinline }", "=r,r"(i32 %conv) nounwind readnone alwaysinline ret i32 %res } -define i32 @__lanemask_lt() nounwind readnone alwaysinline +define i32 @__lanemask_lt_nvptx() nounwind readnone alwaysinline { %mask = tail call i32 asm sideeffect "mov.u32 $0, %lanemask_lt;", "=r"() nounwind readnone alwaysinline ret i32 %mask @@ -576,9 +576,9 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline { define i64 @__warpBinExclusiveScan(i1 %p) nounwind readonly alwaysinline { entry: - %call = call i32 @__ballot(i1 zeroext %p) + %call = call i32 @__ballot_nvptx(i1 zeroext %p) %call1 = call i32 @__popcnt_int32(i32 %call) - %call2 = call i32 @__lanemask_lt() + %call2 = call i32 @__lanemask_lt_nvptx() %and = and i32 %call2, %call %call3 = call i32 @__popcnt_int32(i32 %and) %retval.sroa.1.4.insert.ext.i = zext i32 %call3 to i64 @@ -617,21 +617,21 @@ define i64 @__movmsk(<1 x i1>) nounwind readnone alwaysinline { define i1 @__any(<1 x i1>) nounwind readnone alwaysinline { %v = extractelement <1 x i1> %0, i32 0 - %res = call i32 @__ballot(i1 %v) + %res = call i32 @__ballot_nvptx(i1 %v) %cmp = icmp ne i32 %res, 0 ret i1 %cmp } define i1 @__all(<1 x i1>) nounwind readnone alwaysinline { %v = extractelement <1 x i1> %0, i32 0 - %res = call i32 @__ballot(i1 %v) + %res = call i32 @__ballot_nvptx(i1 %v) %cmp = icmp eq i32 %res, 31 ret i1 %cmp } define i1 @__none(<1 x i1>) nounwind readnone alwaysinline { %v = extractelement <1 x i1> %0, i32 0 - %res = call i32 @__ballot(i1 %v) + %res = call i32 @__ballot_nvptx(i1 %v) %cmp = icmp eq i32 %res, 0 ret i1 %cmp } @@ -646,47 +646,47 @@ define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline define float @__reduce_min_float(<1 x float>) nounwind readnone { %value = extractelement <1 x float> %0, i32 0 - %call = tail call float @__shfl_xor_float(float %value, i32 16) - %call1 = tail call float @__fminf(float %value, float %call) - %call.1 = tail call float @__shfl_xor_float(float %call1, i32 8) - %call1.1 = tail call float @__fminf(float %call1, float %call.1) - %call.2 = tail call float @__shfl_xor_float(float %call1.1, i32 4) - %call1.2 = tail call float @__fminf(float %call1.1, float %call.2) - %call.3 = tail call float @__shfl_xor_float(float %call1.2, i32 2) - %call1.3 = tail call float @__fminf(float %call1.2, float %call.3) - %call.4 = tail call float @__shfl_xor_float(float %call1.3, i32 1) - %call1.4 = tail call float @__fminf(float %call1.3, float %call.4) + %call = tail call float @__shfl_xor_float_nvptx(float %value, i32 16) + %call1 = tail call float @__fminf_nvptx(float %value, float %call) + %call.1 = tail call float @__shfl_xor_float_nvptx(float %call1, i32 8) + %call1.1 = tail call float @__fminf_nvptx(float %call1, float %call.1) + %call.2 = tail call float @__shfl_xor_float_nvptx(float %call1.1, i32 4) + %call1.2 = tail call float @__fminf_nvptx(float %call1.1, float %call.2) + %call.3 = tail call float @__shfl_xor_float_nvptx(float %call1.2, i32 2) + %call1.3 = tail call float @__fminf_nvptx(float %call1.2, float %call.3) + %call.4 = tail call float @__shfl_xor_float_nvptx(float %call1.3, i32 1) + %call1.4 = tail call float @__fminf_nvptx(float %call1.3, float %call.4) ret float %call1.4 } define float @__reduce_max_float(<1 x float>) nounwind readnone { %value = extractelement <1 x float> %0, i32 0 - %call = tail call float @__shfl_xor_float(float %value, i32 16) - %call1 = tail call float @__fmaxf(float %value, float %call) - %call.1 = tail call float @__shfl_xor_float(float %call1, i32 8) - %call1.1 = tail call float @__fmaxf(float %call1, float %call.1) - %call.2 = tail call float @__shfl_xor_float(float %call1.1, i32 4) - %call1.2 = tail call float @__fmaxf(float %call1.1, float %call.2) - %call.3 = tail call float @__shfl_xor_float(float %call1.2, i32 2) - %call1.3 = tail call float @__fmaxf(float %call1.2, float %call.3) - %call.4 = tail call float @__shfl_xor_float(float %call1.3, i32 1) - %call1.4 = tail call float @__fmaxf(float %call1.3, float %call.4) + %call = tail call float @__shfl_xor_float_nvptx(float %value, i32 16) + %call1 = tail call float @__fmaxf_nvptx(float %value, float %call) + %call.1 = tail call float @__shfl_xor_float_nvptx(float %call1, i32 8) + %call1.1 = tail call float @__fmaxf_nvptx(float %call1, float %call.1) + %call.2 = tail call float @__shfl_xor_float_nvptx(float %call1.1, i32 4) + %call1.2 = tail call float @__fmaxf_nvptx(float %call1.1, float %call.2) + %call.3 = tail call float @__shfl_xor_float_nvptx(float %call1.2, i32 2) + %call1.3 = tail call float @__fmaxf_nvptx(float %call1.2, float %call.3) + %call.4 = tail call float @__shfl_xor_float_nvptx(float %call1.3, i32 1) + %call1.4 = tail call float @__fmaxf_nvptx(float %call1.3, float %call.4) ret float %call1.4 } define i32 @__reduce_add_int32(<1 x i32>) nounwind readnone { %value = extractelement <1 x i32> %0, i32 0 - %call = tail call i32 @__shfl_xor_i32(i32 %value, i32 16) + %call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16) %call1 = add i32 %call, %value - %call.1 = tail call i32 @__shfl_xor_i32(i32 %call1, i32 8) + %call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8) %call1.1 =add i32 %call1, %call.1 - %call.2 = tail call i32 @__shfl_xor_i32(i32 %call1.1, i32 4) + %call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4) %call1.2 = add i32 %call1.1, %call.2 - %call.3 = tail call i32 @__shfl_xor_i32(i32 %call1.2, i32 2) + %call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2) %call1.3 = add i32 %call1.2, %call.3 - %call.4 = tail call i32 @__shfl_xor_i32(i32 %call1.3, i32 1) + %call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.3, i32 1) %call1.4 = add i32 %call1.3, %call.4 ret i32 %call1.4 } diff --git a/builtins/util_ptx.m4 b/builtins/util_ptx.m4 index eddf5deb..c948cfc3 100644 --- a/builtins/util_ptx.m4 +++ b/builtins/util_ptx.m4 @@ -2924,11 +2924,11 @@ if.then: ; preds = %entry if.end: ; preds = %if.then, %entry %ptr.0 = phi i64 [ %phitmp, %if.then ], [ undef, %entry ] %val.sroa.0.0.extract.trunc = trunc i64 %ptr.0 to i32 - %call3 = tail call i32 @__shfl_i32(i32 %val.sroa.0.0.extract.trunc, i32 0) + %call3 = tail call i32 @__shfl_i32_nvptx(i32 %val.sroa.0.0.extract.trunc, i32 0) %val.sroa.0.0.insert.ext = zext i32 %call3 to i64 %val.sroa.0.4.extract.shift = lshr i64 %ptr.0, 32 %val.sroa.0.4.extract.trunc = trunc i64 %val.sroa.0.4.extract.shift to i32 - %call8 = tail call i32 @__shfl_i32(i32 %val.sroa.0.4.extract.trunc, i32 0) + %call8 = tail call i32 @__shfl_i32_nvptx(i32 %val.sroa.0.4.extract.trunc, i32 0) %val.sroa.0.4.insert.ext = zext i32 %call8 to i64 %val.sroa.0.4.insert.shift = shl nuw i64 %val.sroa.0.4.insert.ext, 32 %val.sroa.0.4.insert.insert = or i64 %val.sroa.0.4.insert.shift, %val.sroa.0.0.insert.ext diff --git a/examples/mandelbrot_tasks/Makefile b/examples/mandelbrot_tasks/Makefile index 51866b32..cfbad4c1 100644 --- a/examples/mandelbrot_tasks/Makefile +++ b/examples/mandelbrot_tasks/Makefile @@ -2,7 +2,7 @@ EXAMPLE=mandelbrot_tasks CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp ISPC_SRC=mandelbrot_tasks.ispc -ISPC_IA_TARGETS=sse2-i32x4,sse4-i32x8,avx1-i32x16,avx2-i32x16 +ISPC_IA_TARGETS=avx1-i32x16 ISPC_ARM_TARGETS=neon include ../common.mk diff --git a/examples_cuda/mandelbrot_tasks3d/_cuobj/Makefile b/examples_cuda/mandelbrot_tasks3d/_cuobj/Makefile index 645577a5..164a33e6 100644 --- a/examples_cuda/mandelbrot_tasks3d/_cuobj/Makefile +++ b/examples_cuda/mandelbrot_tasks3d/_cuobj/Makefile @@ -5,7 +5,7 @@ all: $(LIB) $(LIB) : $(FILE).cu - nvcc -dc $(FILE).cu -arch=sm_35 -dryrun 2>&1 | sed 's/\#\$$//g'|awk '{ if ($$1 == "cicc") print "cp ../__kernels.ptx " $$NF; else print $0 }' > run.sh + nvcc -dc $(FILE).cu -arch=sm_35 -Xptxas=-v -dryrun 2>&1 | sed 's/\#\$$//g'|awk '{ if ($$1 == "cicc") print "cp ../__kernels.ptx " $$NF; else print $0 }' > run.sh sh run.sh nvcc -dlink -o $(FILE)_dlink.o $(FILE).o -lcudadevrt -arch=sm_35 nvcc $(FILE).o $(FILE)_dlink.o --lib -o lib$(FILE)_cudart.a diff --git a/examples_cuda/mandelbrot_tasks3d/kernel_cu.ptx b/examples_cuda/mandelbrot_tasks3d/kernel_cu.ptx deleted file mode 100644 index 356bb887..00000000 --- a/examples_cuda/mandelbrot_tasks3d/kernel_cu.ptx +++ /dev/null @@ -1,715 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// Compiler built on Tue Nov 5 22:34:47 2013 (1383687287) -// Cuda compilation tools, release 6.0, V6.0.1 -// - -.version 4.0 -.target sm_35 -.address_size 64 - - -.extern .func (.param .b32 func_retval0) cudaLaunchDevice -( - .param .b64 cudaLaunchDevice_param_0, - .param .b64 cudaLaunchDevice_param_1, - .param .align 4 .b8 cudaLaunchDevice_param_2[12], - .param .align 4 .b8 cudaLaunchDevice_param_3[12], - .param .b32 cudaLaunchDevice_param_4, - .param .b64 cudaLaunchDevice_param_5 -); - - -.extern .func (.param .b64 func_retval0) cudaGetParameterBuffer -( - .param .b64 cudaGetParameterBuffer_param_0, - .param .b64 cudaGetParameterBuffer_param_1 -) -; -.extern .func (.param .b32 func_retval0) cudaDeviceSynchronize -( - -) -; -.global .align 1 .b8 constDeltaForeach[32]; -.global .align 1 .b8 constDeltaForeach3[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; - -.visible .func PTXmandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E_( - .param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_0, - .param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_1, - .param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_2, - .param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_3, - .param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_4, - .param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_5, - .param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_6, - .param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_7, - .param .b32 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_8, - .param .b64 mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_9 -) -{ - .reg .pred %p<110>; - .reg .f32 %f<77>; - .reg .s32 %r<104>; - .reg .s64 %rd<13>; - - - ld.param.f32 %f34, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_0]; - ld.param.f32 %f35, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_1]; - ld.param.f32 %f36, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_2]; - ld.param.f32 %f37, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_3]; - ld.param.u32 %r37, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_4]; - ld.param.u32 %r38, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_5]; - ld.param.u32 %r41, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_6]; - ld.param.u32 %r39, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_7]; - ld.param.u32 %r40, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_8]; - ld.param.u64 %rd3, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_9]; - mov.u32 %r42, %ctaid.x; - shl.b32 %r43, %r42, 2; - mov.u32 %r44, %tid.x; - shr.s32 %r45, %r44, 5; - add.s32 %r46, %r45, %r43; - mul.lo.s32 %r1, %r46, %r41; - add.s32 %r47, %r1, %r41; - min.s32 %r2, %r47, %r37; - mov.u32 %r3, %ctaid.y; - mul.lo.s32 %r96, %r3, %r39; - add.s32 %r48, %r96, %r39; - min.s32 %r49, %r48, %r38; - sub.s32 %r50, %r2, %r1; - shr.s32 %r51, %r50, 31; - shr.u32 %r52, %r51, 27; - add.s32 %r53, %r50, %r52; - and.b32 %r54, %r53, -32; - sub.s32 %r55, %r50, %r54; - sub.s32 %r5, %r2, %r55; - and.b32 %r56, %r44, 31; - cvt.u64.u32 %rd1, %r56; - mov.u64 %rd4, constDeltaForeach; - add.s64 %rd5, %rd4, %rd1; - ld.global.u8 %r6, [%rd5]; - add.s32 %r94, %r96, %r6; - setp.ge.s32 %p29, %r96, %r49; - @%p29 bra BB0_26; - - setp.lt.s32 %p30, %r1, %r5; - setp.gt.s32 %p1, %r40, 0; - add.s32 %r57, %r3, 1; - mul.lo.s32 %r58, %r57, %r39; - not.b32 %r59, %r58; - not.b32 %r60, %r38; - max.s32 %r61, %r60, %r59; - not.b32 %r10, %r61; - mov.u64 %rd6, constDeltaForeach3; - add.s64 %rd2, %rd6, %rd1; - @%p30 bra BB0_11; - - mov.u32 %r95, %r94; - -BB0_3: - setp.ge.s32 %p31, %r1, %r2; - @%p31 bra BB0_10; - - setp.gt.s32 %p34, %r40, 0; - mov.u32 %r91, 0; - ld.global.u8 %r66, [%rd2]; - add.s32 %r14, %r1, %r66; - setp.lt.s32 %p35, %r14, %r2; - cvt.rn.f32.s32 %f38, %r14; - fma.rn.ftz.f32 %f5, %f35, %f38, %f34; - cvt.rn.f32.s32 %f39, %r95; - fma.rn.ftz.f32 %f6, %f37, %f39, %f36; - and.pred %p36, %p34, %p35; - selp.u32 %r63, 1, 0, %p36; - // inline asm - { .reg .pred %p1; - setp.ne.u32 %p1, %r63, 0; - vote.ballot.b32 %r62, %p1; - } - // inline asm - setp.eq.s32 %p37, %r62, 0; - mov.u32 %r92, %r91; - mov.pred %p33, 0; - mov.pred %p86, -1; - mov.pred %p88, %p33; - mov.f32 %f54, %f5; - mov.f32 %f58, %f6; - mov.pred %p103, %p1; - mov.pred %p104, %p1; - @%p37 bra BB0_8; - -BB0_5: - mov.pred %p2, %p104; - mov.f32 %f56, %f58; - mov.f32 %f59, %f56; - mov.f32 %f52, %f54; - mov.f32 %f55, %f52; - mov.pred %p5, %p88; - mul.ftz.f32 %f9, %f59, %f59; - mul.ftz.f32 %f10, %f55, %f55; - add.ftz.f32 %f40, %f9, %f10; - setp.gtu.ftz.f32 %p39, %f40, 0f40800000; - and.pred %p40, %p2, %p39; - or.pred %p6, %p40, %p5; - setp.ge.s32 %p41, %r14, %r2; - xor.pred %p42, %p6, %p2; - not.pred %p43, %p42; - or.pred %p44, %p43, %p41; - mov.pred %p87, %p33; - @%p44 bra BB0_7; - - and.pred %p45, %p86, %p103; - not.pred %p46, %p6; - add.ftz.f32 %f41, %f55, %f55; - sub.ftz.f32 %f42, %f10, %f9; - fma.rn.ftz.f32 %f59, %f59, %f41, %f6; - add.ftz.f32 %f55, %f5, %f42; - and.pred %p87, %p45, %p46; - -BB0_7: - mov.f32 %f13, %f59; - mov.f32 %f14, %f55; - mov.pred %p86, %p87; - add.s32 %r69, %r92, 1; - selp.b32 %r91, %r69, %r92, %p86; - setp.lt.s32 %p103, %r91, %r40; - and.pred %p10, %p86, %p103; - and.pred %p48, %p10, %p35; - selp.u32 %r68, 1, 0, %p48; - // inline asm - { .reg .pred %p1; - setp.ne.u32 %p1, %r68, 0; - vote.ballot.b32 %r67, %p1; - } - // inline asm - setp.ne.s32 %p49, %r67, 0; - mov.pred %p88, %p6; - mov.f32 %f54, %f14; - mov.f32 %f58, %f13; - mov.u32 %r92, %r91; - mov.pred %p104, %p10; - @%p49 bra BB0_5; - -BB0_8: - setp.ge.s32 %p50, %r14, %r2; - @%p50 bra BB0_10; - - mad.lo.s32 %r70, %r95, %r37, %r14; - shl.b32 %r71, %r70, 2; - cvt.s64.s32 %rd7, %r71; - add.s64 %rd8, %rd7, %rd3; - st.u32 [%rd8], %r91; - -BB0_10: - add.s32 %r96, %r96, 1; - add.s32 %r95, %r96, %r6; - setp.eq.s32 %p51, %r96, %r10; - @%p51 bra BB0_26; - bra.uni BB0_3; - -BB0_11: - selp.u32 %r73, 1, 0, %p1; - // inline asm - { .reg .pred %p1; - setp.ne.u32 %p1, %r73, 0; - vote.ballot.b32 %r72, %p1; - } - // inline asm - ld.global.u8 %r20, [%rd2]; - -BB0_12: - mov.u32 %r22, %r94; - cvt.rn.f32.s32 %f43, %r22; - mul.lo.s32 %r24, %r22, %r37; - fma.rn.ftz.f32 %f15, %f37, %f43, %f36; - mov.u32 %r97, %r1; - -BB0_13: - mov.u32 %r25, %r97; - add.s32 %r26, %r25, %r20; - cvt.rn.f32.s32 %f44, %r26; - fma.rn.ftz.f32 %f16, %f35, %f44, %f34; - setp.eq.s32 %p54, %r72, 0; - mov.u32 %r99, 0; - mov.u32 %r100, %r99; - mov.pred %p53, 0; - mov.pred %p91, -1; - mov.pred %p93, %p53; - mov.f32 %f62, %f16; - mov.pred %p101, %p1; - mov.pred %p102, %p1; - mov.f32 %f75, %f15; - @%p54 bra BB0_17; - -BB0_14: - mov.f32 %f71, %f75; - mov.f32 %f76, %f71; - mov.pred %p11, %p102; - mov.f32 %f60, %f62; - mov.f32 %f63, %f60; - mov.pred %p14, %p93; - mul.ftz.f32 %f19, %f76, %f76; - mul.ftz.f32 %f20, %f63, %f63; - add.ftz.f32 %f45, %f19, %f20; - setp.gtu.ftz.f32 %p56, %f45, 0f40800000; - and.pred %p57, %p11, %p56; - or.pred %p15, %p57, %p14; - xor.pred %p58, %p15, %p11; - mov.pred %p92, %p53; - @!%p58 bra BB0_16; - bra.uni BB0_15; - -BB0_15: - and.pred %p59, %p91, %p101; - not.pred %p60, %p15; - add.ftz.f32 %f46, %f63, %f63; - sub.ftz.f32 %f47, %f20, %f19; - fma.rn.ftz.f32 %f76, %f76, %f46, %f15; - add.ftz.f32 %f63, %f16, %f47; - and.pred %p92, %p59, %p60; - -BB0_16: - mov.f32 %f23, %f76; - mov.f32 %f24, %f63; - mov.pred %p91, %p92; - add.s32 %r78, %r100, 1; - selp.b32 %r99, %r78, %r100, %p91; - setp.lt.s32 %p101, %r99, %r40; - and.pred %p102, %p91, %p101; - selp.u32 %r77, 1, 0, %p102; - // inline asm - { .reg .pred %p1; - setp.ne.u32 %p1, %r77, 0; - vote.ballot.b32 %r76, %p1; - } - // inline asm - setp.ne.s32 %p61, %r76, 0; - mov.pred %p93, %p15; - mov.f32 %f62, %f24; - mov.u32 %r100, %r99; - mov.f32 %f75, %f23; - @%p61 bra BB0_14; - -BB0_17: - add.s32 %r79, %r26, %r24; - shl.b32 %r80, %r79, 2; - cvt.s64.s32 %rd9, %r80; - add.s64 %rd10, %rd9, %rd3; - st.u32 [%rd10], %r99; - add.s32 %r30, %r25, 32; - setp.lt.s32 %p62, %r30, %r5; - mov.u32 %r97, %r30; - @%p62 bra BB0_13; - - setp.ge.s32 %p63, %r30, %r2; - @%p63 bra BB0_25; - - setp.gt.s32 %p66, %r40, 0; - mov.u32 %r102, 0; - add.s32 %r31, %r30, %r20; - setp.lt.s32 %p67, %r31, %r2; - cvt.rn.f32.s32 %f48, %r31; - fma.rn.ftz.f32 %f25, %f35, %f48, %f34; - and.pred %p68, %p66, %p67; - selp.u32 %r82, 1, 0, %p68; - // inline asm - { .reg .pred %p1; - setp.ne.u32 %p1, %r82, 0; - vote.ballot.b32 %r81, %p1; - } - // inline asm - setp.eq.s32 %p69, %r81, 0; - mov.u32 %r103, %r102; - mov.pred %p65, 0; - mov.pred %p107, -1; - mov.pred %p99, %p1; - mov.pred %p100, %p1; - mov.pred %p109, %p65; - mov.f32 %f66, %f25; - mov.f32 %f73, %f15; - @%p69 bra BB0_23; - -BB0_20: - mov.f32 %f69, %f73; - mov.f32 %f74, %f69; - mov.f32 %f64, %f66; - mov.f32 %f67, %f64; - mov.pred %p23, %p109; - mov.pred %p20, %p100; - mul.ftz.f32 %f28, %f74, %f74; - mul.ftz.f32 %f29, %f67, %f67; - add.ftz.f32 %f49, %f28, %f29; - setp.gtu.ftz.f32 %p71, %f49, 0f40800000; - and.pred %p72, %p20, %p71; - or.pred %p24, %p72, %p23; - setp.ge.s32 %p73, %r31, %r2; - xor.pred %p74, %p24, %p20; - not.pred %p75, %p74; - or.pred %p76, %p75, %p73; - mov.pred %p108, %p65; - @%p76 bra BB0_22; - - and.pred %p77, %p107, %p99; - not.pred %p78, %p24; - add.ftz.f32 %f50, %f67, %f67; - sub.ftz.f32 %f51, %f29, %f28; - fma.rn.ftz.f32 %f74, %f74, %f50, %f15; - add.ftz.f32 %f67, %f25, %f51; - and.pred %p108, %p77, %p78; - -BB0_22: - mov.f32 %f73, %f74; - mov.f32 %f33, %f67; - mov.pred %p107, %p108; - add.s32 %r87, %r103, 1; - selp.b32 %r102, %r87, %r103, %p107; - setp.lt.s32 %p99, %r102, %r40; - and.pred %p100, %p107, %p99; - and.pred %p80, %p100, %p67; - selp.u32 %r86, 1, 0, %p80; - // inline asm - { .reg .pred %p1; - setp.ne.u32 %p1, %r86, 0; - vote.ballot.b32 %r85, %p1; - } - // inline asm - setp.ne.s32 %p81, %r85, 0; - mov.pred %p109, %p24; - mov.f32 %f66, %f33; - mov.u32 %r103, %r102; - @%p81 bra BB0_20; - -BB0_23: - setp.ge.s32 %p82, %r31, %r2; - @%p82 bra BB0_25; - - add.s32 %r88, %r31, %r24; - shl.b32 %r89, %r88, 2; - cvt.s64.s32 %rd11, %r89; - add.s64 %rd12, %rd11, %rd3; - st.u32 [%rd12], %r102; - -BB0_25: - add.s32 %r96, %r96, 1; - add.s32 %r94, %r96, %r6; - setp.ne.s32 %p83, %r96, %r10; - @%p83 bra BB0_12; - -BB0_26: - ret; -} - -.visible .func PTXmandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_( - .param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_0, - .param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_1, - .param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_2, - .param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_3, - .param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_4, - .param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_5, - .param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_6, - .param .b64 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_7, - .param .b32 mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_8 -) -{ - .reg .pred %p<4>; - .reg .f32 %f<13>; - .reg .s32 %r<27>; - .reg .s64 %rd<13>; - - - ld.param.f32 %f1, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_0]; - ld.param.f32 %f2, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_1]; - ld.param.f32 %f3, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_2]; - ld.param.f32 %f4, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_3]; - ld.param.u32 %r2, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_4]; - ld.param.u32 %r3, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_5]; - ld.param.u32 %r4, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_6]; - ld.param.u64 %rd5, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_7]; - mov.u32 %r5, %tid.x; - and.b32 %r1, %r5, 31; - setp.ne.s32 %p1, %r1, 0; - mov.u64 %rd12, 0; - @%p1 bra BB1_3; - - mov.u64 %rd7, 8; - mov.u64 %rd8, 48; - // Callseq Start 0 - { - .reg .b32 temp_param_reg; - .param .b64 param0; - st.param.b64 [param0+0], %rd7; - .param .b64 param1; - st.param.b64 [param1+0], %rd8; - .param .b64 retval0; - call.uni (retval0), - cudaGetParameterBuffer, - ( - param0, - param1 - ); - ld.param.b64 %rd1, [retval0+0]; - } - // Callseq End 0 - setp.eq.s64 %p2, %rd1, 0; - mov.u64 %rd12, %rd1; - @%p2 bra BB1_3; - - cvt.rn.f32.s32 %f5, %r2; - rcp.approx.ftz.f32 %f6, %f5; - cvt.rn.f32.s32 %f7, %r3; - rcp.approx.ftz.f32 %f8, %f7; - sub.ftz.f32 %f9, %f4, %f2; - mul.ftz.f32 %f10, %f9, %f8; - sub.ftz.f32 %f11, %f3, %f1; - mul.ftz.f32 %f12, %f11, %f6; - st.f32 [%rd1], %f1; - st.f32 [%rd1+4], %f12; - st.f32 [%rd1+8], %f2; - st.f32 [%rd1+12], %f10; - st.u32 [%rd1+16], %r2; - st.u32 [%rd1+20], %r3; - mov.u32 %r6, 64; - st.u32 [%rd1+24], %r6; - mov.u32 %r7, 8; - st.u32 [%rd1+28], %r7; - st.u32 [%rd1+32], %r4; - st.u64 [%rd1+40], %rd5; - mov.u64 %rd12, %rd1; - -BB1_3: - @%p1 bra BB1_5; - - shr.s32 %r16, %r2, 31; - shr.u32 %r17, %r16, 26; - add.s32 %r18, %r2, %r17; - shr.s32 %r19, %r18, 6; - shr.s32 %r20, %r3, 31; - shr.u32 %r21, %r20, 29; - add.s32 %r22, %r3, %r21; - shr.s32 %r10, %r22, 3; - add.s32 %r23, %r19, -1; - shr.s32 %r24, %r23, 2; - add.s32 %r9, %r24, 1; - mov.u32 %r14, 1; - mov.u32 %r12, 128; - mov.u32 %r15, 0; - mov.u64 %rd11, 0; - mov.u64 %rd9, PTXmandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd9; - .param .b64 param1; - st.param.b64 [param1+0], %rd12; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r9; - st.param.b32 [param2+4], %r10; - st.param.b32 [param2+8], %r14; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r12; - st.param.b32 [param3+4], %r14; - st.param.b32 [param3+8], %r14; - .param .b32 param4; - st.param.b32 [param4+0], %r15; - .param .b64 param5; - st.param.b64 [param5+0], %rd11; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r8, [retval0+0]; - } - - // inline asm - -BB1_5: - // Callseq Start 1 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r25, [retval0+0]; - } - // Callseq End 1 - // Callseq Start 2 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r26, [retval0+0]; - } - // Callseq End 2 - ret; -} - -.visible .func PTXmandelbrot_ispc( - .param .b32 mandelbrot_ispc_param_0, - .param .b32 mandelbrot_ispc_param_1, - .param .b32 mandelbrot_ispc_param_2, - .param .b32 mandelbrot_ispc_param_3, - .param .b32 mandelbrot_ispc_param_4, - .param .b32 mandelbrot_ispc_param_5, - .param .b32 mandelbrot_ispc_param_6, - .param .b64 mandelbrot_ispc_param_7 -) -{ - .reg .pred %p<4>; - .reg .f32 %f<13>; - .reg .s32 %r<27>; - .reg .s64 %rd<13>; - - - ld.param.f32 %f1, [mandelbrot_ispc_param_0]; - ld.param.f32 %f2, [mandelbrot_ispc_param_1]; - ld.param.f32 %f3, [mandelbrot_ispc_param_2]; - ld.param.f32 %f4, [mandelbrot_ispc_param_3]; - ld.param.u32 %r2, [mandelbrot_ispc_param_4]; - ld.param.u32 %r3, [mandelbrot_ispc_param_5]; - ld.param.u32 %r4, [mandelbrot_ispc_param_6]; - ld.param.u64 %rd5, [mandelbrot_ispc_param_7]; - mov.u32 %r5, %tid.x; - and.b32 %r1, %r5, 31; - setp.ne.s32 %p1, %r1, 0; - mov.u64 %rd12, 0; - @%p1 bra BB2_3; - - mov.u64 %rd7, 8; - mov.u64 %rd8, 48; - // Callseq Start 3 - { - .reg .b32 temp_param_reg; - .param .b64 param0; - st.param.b64 [param0+0], %rd7; - .param .b64 param1; - st.param.b64 [param1+0], %rd8; - .param .b64 retval0; - call.uni (retval0), - cudaGetParameterBuffer, - ( - param0, - param1 - ); - ld.param.b64 %rd1, [retval0+0]; - } - // Callseq End 3 - setp.eq.s64 %p2, %rd1, 0; - mov.u64 %rd12, %rd1; - @%p2 bra BB2_3; - - cvt.rn.f32.s32 %f5, %r2; - rcp.approx.ftz.f32 %f6, %f5; - cvt.rn.f32.s32 %f7, %r3; - rcp.approx.ftz.f32 %f8, %f7; - sub.ftz.f32 %f9, %f4, %f2; - mul.ftz.f32 %f10, %f9, %f8; - sub.ftz.f32 %f11, %f3, %f1; - mul.ftz.f32 %f12, %f11, %f6; - st.f32 [%rd1], %f1; - st.f32 [%rd1+4], %f12; - st.f32 [%rd1+8], %f2; - st.f32 [%rd1+12], %f10; - st.u32 [%rd1+16], %r2; - st.u32 [%rd1+20], %r3; - mov.u32 %r6, 64; - st.u32 [%rd1+24], %r6; - mov.u32 %r7, 8; - st.u32 [%rd1+28], %r7; - st.u32 [%rd1+32], %r4; - st.u64 [%rd1+40], %rd5; - mov.u64 %rd12, %rd1; - -BB2_3: - @%p1 bra BB2_5; - - shr.s32 %r16, %r2, 31; - shr.u32 %r17, %r16, 26; - add.s32 %r18, %r2, %r17; - shr.s32 %r19, %r18, 6; - shr.s32 %r20, %r3, 31; - shr.u32 %r21, %r20, 29; - add.s32 %r22, %r3, %r21; - shr.s32 %r10, %r22, 3; - add.s32 %r23, %r19, -1; - shr.s32 %r24, %r23, 2; - add.s32 %r9, %r24, 1; - mov.u32 %r14, 1; - mov.u32 %r12, 128; - mov.u32 %r15, 0; - mov.u64 %rd11, 0; - mov.u64 %rd9, PTXmandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E_; - // inline asm - { - .param .b64 param0; - st.param.b64 [param0+0], %rd9; - .param .b64 param1; - st.param.b64 [param1+0], %rd12; - .param .align 4 .b8 param2[12]; - st.param.b32 [param2+0], %r9; - st.param.b32 [param2+4], %r10; - st.param.b32 [param2+8], %r14; - .param .align 4 .b8 param3[12]; - st.param.b32 [param3+0], %r12; - st.param.b32 [param3+4], %r14; - st.param.b32 [param3+8], %r14; - .param .b32 param4; - st.param.b32 [param4+0], %r15; - .param .b64 param5; - st.param.b64 [param5+0], %rd11; - - .param .b32 retval0; - call.uni (retval0), - cudaLaunchDevice, - ( - param0, - param1, - param2, - param3, - param4, - param5 - ); - ld.param.b32 %r8, [retval0+0]; - } - - // inline asm - -BB2_5: - // Callseq Start 4 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r25, [retval0+0]; - } - // Callseq End 4 - // Callseq Start 5 - { - .reg .b32 temp_param_reg; - .param .b32 retval0; - call.uni (retval0), - cudaDeviceSynchronize, - ( - ); - ld.param.b32 %r26, [retval0+0]; - } - // Callseq End 5 - ret; -} - - - diff --git a/examples_cuda/mandelbrot_tasks3d/mandel_cu.cpp b/examples_cuda/mandelbrot_tasks3d/mandel_cu.cpp index 73d3aa0f..ac130604 100644 --- a/examples_cuda/mandelbrot_tasks3d/mandel_cu.cpp +++ b/examples_cuda/mandelbrot_tasks3d/mandel_cu.cpp @@ -137,7 +137,7 @@ int main(int argc, char *argv[]) { mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, (int*)d_buf); double dt = 1e3*(rtc() - t0); //get_elapsed_mcycles(); #else - const char * func_name = "mandelbrot_ispc"; + const char * func_name = "mandelbrot_ispc__export"; void *func_args[] = {&x0, &y0, &x1, &y1, &width, &height, &maxIterations, &d_buf}; const double dt = 1e3*CUDALaunch(NULL, func_name, func_args); #endif diff --git a/examples_cuda/mandelbrot_tasks3d/test/__header.ptx b/examples_cuda/mandelbrot_tasks3d/test/__header.ptx deleted file mode 100644 index 8b3c4acb..00000000 --- a/examples_cuda/mandelbrot_tasks3d/test/__header.ptx +++ /dev/null @@ -1,36 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// Compiler built on Tue Nov 5 22:34:47 2013 (1383687287) -// Cuda compilation tools, release 6.0, V6.0.1 -// - -.version 4.0 -.target sm_35 -.address_size 64 - - -.extern .func (.param .b32 func_retval0) cudaLaunchDevice -( - .param .b64 cudaLaunchDevice_param_0, - .param .b64 cudaLaunchDevice_param_1, - .param .align 4 .b8 cudaLaunchDevice_param_2[12], - .param .align 4 .b8 cudaLaunchDevice_param_3[12], - .param .b32 cudaLaunchDevice_param_4, - .param .b64 cudaLaunchDevice_param_5 -); - - -.extern .func (.param .b64 func_retval0) cudaGetParameterBuffer -( - .param .b64 cudaGetParameterBuffer_param_0, - .param .b64 cudaGetParameterBuffer_param_1 -) -; -.extern .func (.param .b32 func_retval0) cudaDeviceSynchronize -( - -) -; -.global .align 1 .b8 constDeltaForeach[32]; -.global .align 1 .b8 constDeltaForeach3[32] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; - diff --git a/examples_cuda/mandelbrot_tasks3d/test/compile.sh b/examples_cuda/mandelbrot_tasks3d/test/compile.sh deleted file mode 100755 index 12f00cbc..00000000 --- a/examples_cuda/mandelbrot_tasks3d/test/compile.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh -nvcc -arch=sm_35 -dc kernel_ptx.cu -dryrun -Xptxas=-v 2>&1 | \ - sed 's/\#\$//g'| \ - awk '{if ($1=="cicc") {print $0; print "grep -ve \"\\.version\" -e \"\\.target\" -e \"\\.address_size\" ", $NF, " > __body.ptx"; print "cat __header.ptx __body.ptx >", $NF} else print $0}' > run1.sh diff --git a/examples_cuda/mandelbrot_tasks3d/test/kernel_ptx.cu b/examples_cuda/mandelbrot_tasks3d/test/kernel_ptx.cu deleted file mode 100644 index 625b6330..00000000 --- a/examples_cuda/mandelbrot_tasks3d/test/kernel_ptx.cu +++ /dev/null @@ -1,717 +0,0 @@ -#define __b8 char -#define __f32 float -#define __u32 unsigned int -#define __b32 unsigned int -#define __u64 unsigned long long -#define __b64 unsigned long long -#define __global __device__ -#define __visible -#define __entry __global__ void -#define __func __device__ -#define __align(x) - -extern "C" __visible __entry mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E_( - __f32 param_0, - __f32 param_1, - __f32 param_2, - __f32 param_3, - __u32 param_4, - __u32 param_5, - __u32 param_6, - __u32 param_7, - __u32 param_8, - __u64 param_9 -) -{ - asm( -" .reg .pred %p<110>; \n\t" -" .reg .f32 %f<77>; \n\t" -" .reg .s32 %r<104>; \n\t" -" .reg .s64 %rd<13>; \n\t" -" \n\t" -" \n\t" -" ld.param.f32 %f34, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_0]; \n\t" -" ld.param.f32 %f35, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_1]; \n\t" -" ld.param.f32 %f36, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_2]; \n\t" -" ld.param.f32 %f37, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_3]; \n\t" -" ld.param.u32 %r37, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_4]; \n\t" -" ld.param.u32 %r38, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_5]; \n\t" -" ld.param.u32 %r41, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_6]; \n\t" -" ld.param.u32 %r39, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_7]; \n\t" -" ld.param.u32 %r40, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_8]; \n\t" -" ld.param.u64 %rd3, [mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E__param_9]; \n\t" -" mov.u32 %r42, %ctaid.x; \n\t" -" shl.b32 %r43, %r42, 2; \n\t" -" mov.u32 %r44, %tid.x; \n\t" -" shr.s32 %r45, %r44, 5; \n\t" -" add.s32 %r46, %r45, %r43; \n\t" -" mul.lo.s32 %r1, %r46, %r41; \n\t" -" add.s32 %r47, %r1, %r41; \n\t" -" min.s32 %r2, %r47, %r37; \n\t" -" mov.u32 %r3, %ctaid.y; \n\t" -" mul.lo.s32 %r96, %r3, %r39; \n\t" -" add.s32 %r48, %r96, %r39; \n\t" -" min.s32 %r49, %r48, %r38; \n\t" -" sub.s32 %r50, %r2, %r1; \n\t" -" shr.s32 %r51, %r50, 31; \n\t" -" shr.u32 %r52, %r51, 27; \n\t" -" add.s32 %r53, %r50, %r52; \n\t" -" and.b32 %r54, %r53, -32; \n\t" -" sub.s32 %r55, %r50, %r54; \n\t" -" sub.s32 %r5, %r2, %r55; \n\t" -" and.b32 %r56, %r44, 31; \n\t" -" cvt.u64.u32 %rd1, %r56; \n\t" -" mov.u64 %rd4, constDeltaForeach; \n\t" -" add.s64 %rd5, %rd4, %rd1; \n\t" -" ld.global.u8 %r6, [%rd5]; \n\t" -" add.s32 %r94, %r96, %r6; \n\t" -" setp.ge.s32 %p29, %r96, %r49; \n\t" -" @%p29 bra BB0_26; \n\t" -" \n\t" -" setp.lt.s32 %p30, %r1, %r5; \n\t" -" setp.gt.s32 %p1, %r40, 0; \n\t" -" add.s32 %r57, %r3, 1; \n\t" -" mul.lo.s32 %r58, %r57, %r39; \n\t" -" not.b32 %r59, %r58; \n\t" -" not.b32 %r60, %r38; \n\t" -" max.s32 %r61, %r60, %r59; \n\t" -" not.b32 %r10, %r61; \n\t" -" mov.u64 %rd6, constDeltaForeach3; \n\t" -" add.s64 %rd2, %rd6, %rd1; \n\t" -" @%p30 bra BB0_11; \n\t" -" \n\t" -" mov.u32 %r95, %r94; \n\t" -" \n\t" -" BB0_3: \n\t" -" setp.ge.s32 %p31, %r1, %r2; \n\t" -" @%p31 bra BB0_10; \n\t" -" \n\t" -" setp.gt.s32 %p34, %r40, 0; \n\t" -" mov.u32 %r91, 0; \n\t" -" ld.global.u8 %r66, [%rd2]; \n\t" -" add.s32 %r14, %r1, %r66; \n\t" -" setp.lt.s32 %p35, %r14, %r2; \n\t" -" cvt.rn.f32.s32 %f38, %r14; \n\t" -" fma.rn.f32 %f5, %f35, %f38, %f34; \n\t" -" cvt.rn.f32.s32 %f39, %r95; \n\t" -" fma.rn.f32 %f6, %f37, %f39, %f36; \n\t" -" and.pred %p36, %p34, %p35; \n\t" -" selp.u32 %r63, 1, 0, %p36; \n\t" -" // inline asm \n\t" -" { .reg .pred %p1; \n\t" -" setp.ne.u32 %p1, %r63, 0; \n\t" -" vote.ballot.b32 %r62, %p1; \n\t" -" } \n\t" -" // inline asm \n\t" -" setp.eq.s32 %p37, %r62, 0; \n\t" -" mov.u32 %r92, %r91; \n\t" -" mov.pred %p33, 0; \n\t" -" mov.pred %p86, -1; \n\t" -" mov.pred %p88, %p33; \n\t" -" mov.f32 %f54, %f5; \n\t" -" mov.f32 %f58, %f6; \n\t" -" mov.pred %p103, %p1; \n\t" -" mov.pred %p104, %p1; \n\t" -" @%p37 bra BB0_8; \n\t" -" \n\t" -" BB0_5: \n\t" -" mov.pred %p2, %p104; \n\t" -" mov.f32 %f56, %f58; \n\t" -" mov.f32 %f59, %f56; \n\t" -" mov.f32 %f52, %f54; \n\t" -" mov.f32 %f55, %f52; \n\t" -" mov.pred %p5, %p88; \n\t" -" mul.f32 %f9, %f59, %f59; \n\t" -" mul.f32 %f10, %f55, %f55; \n\t" -" add.f32 %f40, %f9, %f10; \n\t" -" setp.gtu.f32 %p39, %f40, 0f40800000; \n\t" -" and.pred %p40, %p2, %p39; \n\t" -" or.pred %p6, %p40, %p5; \n\t" -" setp.ge.s32 %p41, %r14, %r2; \n\t" -" xor.pred %p42, %p6, %p2; \n\t" -" not.pred %p43, %p42; \n\t" -" or.pred %p44, %p43, %p41; \n\t" -" mov.pred %p87, %p33; \n\t" -" @%p44 bra BB0_7; \n\t" -" \n\t" -" and.pred %p45, %p86, %p103; \n\t" -" not.pred %p46, %p6; \n\t" -" add.f32 %f41, %f55, %f55; \n\t" -" sub.f32 %f42, %f10, %f9; \n\t" -" fma.rn.f32 %f59, %f59, %f41, %f6; \n\t" -" add.f32 %f55, %f5, %f42; \n\t" -" and.pred %p87, %p45, %p46; \n\t" -" \n\t" -" BB0_7: \n\t" -" mov.f32 %f13, %f59; \n\t" -" mov.f32 %f14, %f55; \n\t" -" mov.pred %p86, %p87; \n\t" -" add.s32 %r69, %r92, 1; \n\t" -" selp.b32 %r91, %r69, %r92, %p86; \n\t" -" setp.lt.s32 %p103, %r91, %r40; \n\t" -" and.pred %p10, %p86, %p103; \n\t" -" and.pred %p48, %p10, %p35; \n\t" -" selp.u32 %r68, 1, 0, %p48; \n\t" -" // inline asm \n\t" -" { .reg .pred %p1; \n\t" -" setp.ne.u32 %p1, %r68, 0; \n\t" -" vote.ballot.b32 %r67, %p1; \n\t" -" } \n\t" -" // inline asm \n\t" -" setp.ne.s32 %p49, %r67, 0; \n\t" -" mov.pred %p88, %p6; \n\t" -" mov.f32 %f54, %f14; \n\t" -" mov.f32 %f58, %f13; \n\t" -" mov.u32 %r92, %r91; \n\t" -" mov.pred %p104, %p10; \n\t" -" @%p49 bra BB0_5; \n\t" -" \n\t" -" BB0_8: \n\t" -" setp.ge.s32 %p50, %r14, %r2; \n\t" -" @%p50 bra BB0_10; \n\t" -" \n\t" -" mad.lo.s32 %r70, %r95, %r37, %r14; \n\t" -" shl.b32 %r71, %r70, 2; \n\t" -" cvt.s64.s32 %rd7, %r71; \n\t" -" add.s64 %rd8, %rd7, %rd3; \n\t" -" st.u32 [%rd8], %r91; \n\t" -" \n\t" -" BB0_10: \n\t" -" add.s32 %r96, %r96, 1; \n\t" -" add.s32 %r95, %r96, %r6; \n\t" -" setp.eq.s32 %p51, %r96, %r10; \n\t" -" @%p51 bra BB0_26; \n\t" -" bra.uni BB0_3; \n\t" -" \n\t" -" BB0_11: \n\t" -" selp.u32 %r73, 1, 0, %p1; \n\t" -" // inline asm \n\t" -" { .reg .pred %p1; \n\t" -" setp.ne.u32 %p1, %r73, 0; \n\t" -" vote.ballot.b32 %r72, %p1; \n\t" -" } \n\t" -" // inline asm \n\t" -" ld.global.u8 %r20, [%rd2]; \n\t" -" \n\t" -" BB0_12: \n\t" -" mov.u32 %r22, %r94; \n\t" -" cvt.rn.f32.s32 %f43, %r22; \n\t" -" mul.lo.s32 %r24, %r22, %r37; \n\t" -" fma.rn.f32 %f15, %f37, %f43, %f36; \n\t" -" mov.u32 %r97, %r1; \n\t" -" \n\t" -" BB0_13: \n\t" -" mov.u32 %r25, %r97; \n\t" -" add.s32 %r26, %r25, %r20; \n\t" -" cvt.rn.f32.s32 %f44, %r26; \n\t" -" fma.rn.f32 %f16, %f35, %f44, %f34; \n\t" -" setp.eq.s32 %p54, %r72, 0; \n\t" -" mov.u32 %r99, 0; \n\t" -" mov.u32 %r100, %r99; \n\t" -" mov.pred %p53, 0; \n\t" -" mov.pred %p91, -1; \n\t" -" mov.pred %p93, %p53; \n\t" -" mov.f32 %f62, %f16; \n\t" -" mov.pred %p101, %p1; \n\t" -" mov.pred %p102, %p1; \n\t" -" mov.f32 %f75, %f15; \n\t" -" @%p54 bra BB0_17; \n\t" -" \n\t" -" BB0_14: \n\t" -" mov.f32 %f71, %f75; \n\t" -" mov.f32 %f76, %f71; \n\t" -" mov.pred %p11, %p102; \n\t" -" mov.f32 %f60, %f62; \n\t" -" mov.f32 %f63, %f60; \n\t" -" mov.pred %p14, %p93; \n\t" -" mul.f32 %f19, %f76, %f76; \n\t" -" mul.f32 %f20, %f63, %f63; \n\t" -" add.f32 %f45, %f19, %f20; \n\t" -" setp.gtu.f32 %p56, %f45, 0f40800000; \n\t" -" and.pred %p57, %p11, %p56; \n\t" -" or.pred %p15, %p57, %p14; \n\t" -" xor.pred %p58, %p15, %p11; \n\t" -" mov.pred %p92, %p53; \n\t" -" @!%p58 bra BB0_16; \n\t" -" bra.uni BB0_15; \n\t" -" \n\t" -" BB0_15: \n\t" -" and.pred %p59, %p91, %p101; \n\t" -" not.pred %p60, %p15; \n\t" -" add.f32 %f46, %f63, %f63; \n\t" -" sub.f32 %f47, %f20, %f19; \n\t" -" fma.rn.f32 %f76, %f76, %f46, %f15; \n\t" -" add.f32 %f63, %f16, %f47; \n\t" -" and.pred %p92, %p59, %p60; \n\t" -" \n\t" -" BB0_16: \n\t" -" mov.f32 %f23, %f76; \n\t" -" mov.f32 %f24, %f63; \n\t" -" mov.pred %p91, %p92; \n\t" -" add.s32 %r78, %r100, 1; \n\t" -" selp.b32 %r99, %r78, %r100, %p91; \n\t" -" setp.lt.s32 %p101, %r99, %r40; \n\t" -" and.pred %p102, %p91, %p101; \n\t" -" selp.u32 %r77, 1, 0, %p102; \n\t" -" // inline asm \n\t" -" { .reg .pred %p1; \n\t" -" setp.ne.u32 %p1, %r77, 0; \n\t" -" vote.ballot.b32 %r76, %p1; \n\t" -" } \n\t" -" // inline asm \n\t" -" setp.ne.s32 %p61, %r76, 0; \n\t" -" mov.pred %p93, %p15; \n\t" -" mov.f32 %f62, %f24; \n\t" -" mov.u32 %r100, %r99; \n\t" -" mov.f32 %f75, %f23; \n\t" -" @%p61 bra BB0_14; \n\t" -" \n\t" -" BB0_17: \n\t" -" add.s32 %r79, %r26, %r24; \n\t" -" shl.b32 %r80, %r79, 2; \n\t" -" cvt.s64.s32 %rd9, %r80; \n\t" -" add.s64 %rd10, %rd9, %rd3; \n\t" -" st.u32 [%rd10], %r99; \n\t" -" add.s32 %r30, %r25, 32; \n\t" -" setp.lt.s32 %p62, %r30, %r5; \n\t" -" mov.u32 %r97, %r30; \n\t" -" @%p62 bra BB0_13; \n\t" -" \n\t" -" setp.ge.s32 %p63, %r30, %r2; \n\t" -" @%p63 bra BB0_25; \n\t" -" \n\t" -" setp.gt.s32 %p66, %r40, 0; \n\t" -" mov.u32 %r102, 0; \n\t" -" add.s32 %r31, %r30, %r20; \n\t" -" setp.lt.s32 %p67, %r31, %r2; \n\t" -" cvt.rn.f32.s32 %f48, %r31; \n\t" -" fma.rn.f32 %f25, %f35, %f48, %f34; \n\t" -" and.pred %p68, %p66, %p67; \n\t" -" selp.u32 %r82, 1, 0, %p68; \n\t" -" // inline asm \n\t" -" { .reg .pred %p1; \n\t" -" setp.ne.u32 %p1, %r82, 0; \n\t" -" vote.ballot.b32 %r81, %p1; \n\t" -" } \n\t" -" // inline asm \n\t" -" setp.eq.s32 %p69, %r81, 0; \n\t" -" mov.u32 %r103, %r102; \n\t" -" mov.pred %p65, 0; \n\t" -" mov.pred %p107, -1; \n\t" -" mov.pred %p99, %p1; \n\t" -" mov.pred %p100, %p1; \n\t" -" mov.pred %p109, %p65; \n\t" -" mov.f32 %f66, %f25; \n\t" -" mov.f32 %f73, %f15; \n\t" -" @%p69 bra BB0_23; \n\t" -" \n\t" -" BB0_20: \n\t" -" mov.f32 %f69, %f73; \n\t" -" mov.f32 %f74, %f69; \n\t" -" mov.f32 %f64, %f66; \n\t" -" mov.f32 %f67, %f64; \n\t" -" mov.pred %p23, %p109; \n\t" -" mov.pred %p20, %p100; \n\t" -" mul.f32 %f28, %f74, %f74; \n\t" -" mul.f32 %f29, %f67, %f67; \n\t" -" add.f32 %f49, %f28, %f29; \n\t" -" setp.gtu.f32 %p71, %f49, 0f40800000; \n\t" -" and.pred %p72, %p20, %p71; \n\t" -" or.pred %p24, %p72, %p23; \n\t" -" setp.ge.s32 %p73, %r31, %r2; \n\t" -" xor.pred %p74, %p24, %p20; \n\t" -" not.pred %p75, %p74; \n\t" -" or.pred %p76, %p75, %p73; \n\t" -" mov.pred %p108, %p65; \n\t" -" @%p76 bra BB0_22; \n\t" -" \n\t" -" and.pred %p77, %p107, %p99; \n\t" -" not.pred %p78, %p24; \n\t" -" add.f32 %f50, %f67, %f67; \n\t" -" sub.f32 %f51, %f29, %f28; \n\t" -" fma.rn.f32 %f74, %f74, %f50, %f15; \n\t" -" add.f32 %f67, %f25, %f51; \n\t" -" and.pred %p108, %p77, %p78; \n\t" -" \n\t" -" BB0_22: \n\t" -" mov.f32 %f73, %f74; \n\t" -" mov.f32 %f33, %f67; \n\t" -" mov.pred %p107, %p108; \n\t" -" add.s32 %r87, %r103, 1; \n\t" -" selp.b32 %r102, %r87, %r103, %p107; \n\t" -" setp.lt.s32 %p99, %r102, %r40; \n\t" -" and.pred %p100, %p107, %p99; \n\t" -" and.pred %p80, %p100, %p67; \n\t" -" selp.u32 %r86, 1, 0, %p80; \n\t" -" // inline asm \n\t" -" { .reg .pred %p1; \n\t" -" setp.ne.u32 %p1, %r86, 0; \n\t" -" vote.ballot.b32 %r85, %p1; \n\t" -" } \n\t" -" // inline asm \n\t" -" setp.ne.s32 %p81, %r85, 0; \n\t" -" mov.pred %p109, %p24; \n\t" -" mov.f32 %f66, %f33; \n\t" -" mov.u32 %r103, %r102; \n\t" -" @%p81 bra BB0_20; \n\t" -" \n\t" -" BB0_23: \n\t" -" setp.ge.s32 %p82, %r31, %r2; \n\t" -" @%p82 bra BB0_25; \n\t" -" \n\t" -" add.s32 %r88, %r31, %r24; \n\t" -" shl.b32 %r89, %r88, 2; \n\t" -" cvt.s64.s32 %rd11, %r89; \n\t" -" add.s64 %rd12, %rd11, %rd3; \n\t" -" st.u32 [%rd12], %r102; \n\t" -" \n\t" -" BB0_25: \n\t" -" add.s32 %r96, %r96, 1; \n\t" -" add.s32 %r94, %r96, %r6; \n\t" -" setp.ne.s32 %p83, %r96, %r10; \n\t" -" @%p83 bra BB0_12; \n\t" -" \n\t" -" BB0_26: \n\t" -" ret; \n\t" -); -} - -extern "C" __visible __entry mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E_( - __f32 param_0, - __f32 param_1, - __f32 param_2, - __f32 param_3, - __u32 param_4, - __u32 param_5, - __u32 param_6, - __u64 param_7, - __b8 param_8 - ) -{ - asm( -" .reg .pred %p<4>; \n\t" -" .reg .f32 %f<13>; \n\t" -" .reg .s32 %r<27>; \n\t" -" .reg .s64 %rd<13>; \n\t" -" \n\t" -" \n\t" -" ld.param.f32 %f1, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_0]; \n\t" -" ld.param.f32 %f2, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_1]; \n\t" -" ld.param.f32 %f3, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_2]; \n\t" -" ld.param.f32 %f4, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_3]; \n\t" -" ld.param.u32 %r2, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_4]; \n\t" -" ld.param.u32 %r3, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_5]; \n\t" -" ld.param.u32 %r4, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_6]; \n\t" -" ld.param.u64 %rd5, [mandelbrot_ispc___unfunfunfunfuniuniuniun_3C_uni_3E__param_7]; \n\t" -" mov.u32 %r5, %tid.x; \n\t" -" and.b32 %r1, %r5, 31; \n\t" -" setp.ne.s32 %p1, %r1, 0; \n\t" -" mov.u64 %rd12, 0; \n\t" -" @%p1 bra BB1_3; \n\t" -" \n\t" -" mov.u64 %rd7, 8; \n\t" -" mov.u64 %rd8, 48; \n\t" -" // Callseq Start 0 \n\t" -" { \n\t" -" .reg .b32 temp_param_reg; \n\t" -" .param .b64 param0; \n\t" -" st.param.b64 [param0+0], %rd7; \n\t" -" .param .b64 param1; \n\t" -" st.param.b64 [param1+0], %rd8; \n\t" -" .param .b64 retval0; \n\t" -" call.uni (retval0), \n\t" -" cudaGetParameterBuffer, \n\t" -" ( \n\t" -" param0, \n\t" -" param1 \n\t" -" ); \n\t" -" ld.param.b64 %rd1, [retval0+0]; \n\t" -" } \n\t" -" // Callseq End 0 \n\t" -" setp.eq.s64 %p2, %rd1, 0; \n\t" -" mov.u64 %rd12, %rd1; \n\t" -" @%p2 bra BB1_3; \n\t" -" \n\t" -" cvt.rn.f32.s32 %f5, %r2; \n\t" -" rcp.rn.f32 %f6, %f5; \n\t" -" cvt.rn.f32.s32 %f7, %r3; \n\t" -" rcp.rn.f32 %f8, %f7; \n\t" -" sub.f32 %f9, %f4, %f2; \n\t" -" mul.f32 %f10, %f9, %f8; \n\t" -" sub.f32 %f11, %f3, %f1; \n\t" -" mul.f32 %f12, %f11, %f6; \n\t" -" st.f32 [%rd1], %f1; \n\t" -" st.f32 [%rd1+4], %f12; \n\t" -" st.f32 [%rd1+8], %f2; \n\t" -" st.f32 [%rd1+12], %f10; \n\t" -" st.u32 [%rd1+16], %r2; \n\t" -" st.u32 [%rd1+20], %r3; \n\t" -" mov.u32 %r6, 64; \n\t" -" st.u32 [%rd1+24], %r6; \n\t" -" mov.u32 %r7, 8; \n\t" -" st.u32 [%rd1+28], %r7; \n\t" -" st.u32 [%rd1+32], %r4; \n\t" -" st.u64 [%rd1+40], %rd5; \n\t" -" mov.u64 %rd12, %rd1; \n\t" -" \n\t" -" BB1_3: \n\t" -" @%p1 bra BB1_5; \n\t" -" \n\t" -" shr.s32 %r16, %r2, 31; \n\t" -" shr.u32 %r17, %r16, 26; \n\t" -" add.s32 %r18, %r2, %r17; \n\t" -" shr.s32 %r19, %r18, 6; \n\t" -" shr.s32 %r20, %r3, 31; \n\t" -" shr.u32 %r21, %r20, 29; \n\t" -" add.s32 %r22, %r3, %r21; \n\t" -" shr.s32 %r10, %r22, 3; \n\t" -" add.s32 %r23, %r19, -1; \n\t" -" shr.s32 %r24, %r23, 2; \n\t" -" add.s32 %r9, %r24, 1; \n\t" -" mov.u32 %r14, 1; \n\t" -" mov.u32 %r12, 128; \n\t" -" mov.u32 %r15, 0; \n\t" -" mov.u64 %rd11, 0; \n\t" -" mov.u64 %rd9, mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E_; \n\t" -" // inline asm \n\t" -" { \n\t" -" .param .b64 param0; \n\t" -" st.param.b64 [param0+0], %rd9; \n\t" -" .param .b64 param1; \n\t" -" st.param.b64 [param1+0], %rd12; \n\t" -" .param .align 4 .b8 param2[12]; \n\t" -" st.param.b32 [param2+0], %r9; \n\t" -" st.param.b32 [param2+4], %r10; \n\t" -" st.param.b32 [param2+8], %r14; \n\t" -" .param .align 4 .b8 param3[12]; \n\t" -" st.param.b32 [param3+0], %r12; \n\t" -" st.param.b32 [param3+4], %r14; \n\t" -" st.param.b32 [param3+8], %r14; \n\t" -" .param .b32 param4; \n\t" -" st.param.b32 [param4+0], %r15; \n\t" -" .param .b64 param5; \n\t" -" st.param.b64 [param5+0], %rd11; \n\t" -" \n\t" -" .param .b32 retval0; \n\t" -" call.uni (retval0), \n\t" -" cudaLaunchDevice, \n\t" -" ( \n\t" -" param0, \n\t" -" param1, \n\t" -" param2, \n\t" -" param3, \n\t" -" param4, \n\t" -" param5 \n\t" -" ); \n\t" -" ld.param.b32 %r8, [retval0+0]; \n\t" -" } \n\t" -" \n\t" -" // inline asm \n\t" -" \n\t" -" BB1_5: \n\t" -" // Callseq Start 1 \n\t" -" { \n\t" -" .reg .b32 temp_param_reg; \n\t" -" .param .b32 retval0; \n\t" -" call.uni (retval0), \n\t" -" cudaDeviceSynchronize, \n\t" -" ( \n\t" -" ); \n\t" -" ld.param.b32 %r25, [retval0+0]; \n\t" -" } \n\t" -" // Callseq End 1 \n\t" -" // Callseq Start 2 \n\t" -" { \n\t" -" .reg .b32 temp_param_reg; \n\t" -" .param .b32 retval0; \n\t" -" call.uni (retval0), \n\t" -" cudaDeviceSynchronize, \n\t" -" ( \n\t" -" ); \n\t" -" ld.param.b32 %r26, [retval0+0]; \n\t" -" } \n\t" -" // Callseq End 2 \n\t" -" ret; \n\t" -); -} - -extern "C" __visible __entry mandelbrot_ispc__kernel( - __f32 param_0, - __f32 param_1, - __f32 param_2, - __f32 param_3, - __u32 param_4, - __u32 param_5, - __u32 param_6, - __u64 param_7 - ) -{ - asm( -" .reg .pred %p<4>; \n\t" -" .reg .f32 %f<13>; \n\t" -" .reg .s32 %r<27>; \n\t" -" .reg .s64 %rd<13>; \n\t" -" \n\t" -" \n\t" -" ld.param.f32 %f1, [mandelbrot_ispc__kernel_param_0]; \n\t" -" ld.param.f32 %f2, [mandelbrot_ispc__kernel_param_1]; \n\t" -" ld.param.f32 %f3, [mandelbrot_ispc__kernel_param_2]; \n\t" -" ld.param.f32 %f4, [mandelbrot_ispc__kernel_param_3]; \n\t" -" ld.param.u32 %r2, [mandelbrot_ispc__kernel_param_4]; \n\t" -" ld.param.u32 %r3, [mandelbrot_ispc__kernel_param_5]; \n\t" -" ld.param.u32 %r4, [mandelbrot_ispc__kernel_param_6]; \n\t" -" ld.param.u64 %rd5, [mandelbrot_ispc__kernel_param_7]; \n\t" -" mov.u32 %r5, %tid.x; \n\t" -" and.b32 %r1, %r5, 31; \n\t" -" setp.ne.s32 %p1, %r1, 0; \n\t" -" mov.u64 %rd12, 0; \n\t" -" @%p1 bra BB2_3; \n\t" -" \n\t" -" mov.u64 %rd7, 8; \n\t" -" mov.u64 %rd8, 48; \n\t" -" // Callseq Start 3 \n\t" -" { \n\t" -" .reg .b32 temp_param_reg; \n\t" -" .param .b64 param0; \n\t" -" st.param.b64 [param0+0], %rd7; \n\t" -" .param .b64 param1; \n\t" -" st.param.b64 [param1+0], %rd8; \n\t" -" .param .b64 retval0; \n\t" -" call.uni (retval0), \n\t" -" cudaGetParameterBuffer, \n\t" -" ( \n\t" -" param0, \n\t" -" param1 \n\t" -" ); \n\t" -" ld.param.b64 %rd1, [retval0+0]; \n\t" -" } \n\t" -" // Callseq End 3 \n\t" -" setp.eq.s64 %p2, %rd1, 0; \n\t" -" mov.u64 %rd12, %rd1; \n\t" -" @%p2 bra BB2_3; \n\t" -" \n\t" -" cvt.rn.f32.s32 %f5, %r2; \n\t" -" rcp.rn.f32 %f6, %f5; \n\t" -" cvt.rn.f32.s32 %f7, %r3; \n\t" -" rcp.rn.f32 %f8, %f7; \n\t" -" sub.f32 %f9, %f4, %f2; \n\t" -" mul.f32 %f10, %f9, %f8; \n\t" -" sub.f32 %f11, %f3, %f1; \n\t" -" mul.f32 %f12, %f11, %f6; \n\t" -" st.f32 [%rd1], %f1; \n\t" -" st.f32 [%rd1+4], %f12; \n\t" -" st.f32 [%rd1+8], %f2; \n\t" -" st.f32 [%rd1+12], %f10; \n\t" -" st.u32 [%rd1+16], %r2; \n\t" -" st.u32 [%rd1+20], %r3; \n\t" -" mov.u32 %r6, 64; \n\t" -" st.u32 [%rd1+24], %r6; \n\t" -" mov.u32 %r7, 8; \n\t" -" st.u32 [%rd1+28], %r7; \n\t" -" st.u32 [%rd1+32], %r4; \n\t" -" st.u64 [%rd1+40], %rd5; \n\t" -" mov.u64 %rd12, %rd1; \n\t" -" \n\t" -" BB2_3: \n\t" -" @%p1 bra BB2_5; \n\t" -" \n\t" -" shr.s32 %r16, %r2, 31; \n\t" -" shr.u32 %r17, %r16, 26; \n\t" -" add.s32 %r18, %r2, %r17; \n\t" -" shr.s32 %r19, %r18, 6; \n\t" -" shr.s32 %r20, %r3, 31; \n\t" -" shr.u32 %r21, %r20, 29; \n\t" -" add.s32 %r22, %r3, %r21; \n\t" -" shr.s32 %r10, %r22, 3; \n\t" -" add.s32 %r23, %r19, -1; \n\t" -" shr.s32 %r24, %r23, 2; \n\t" -" add.s32 %r9, %r24, 1; \n\t" -" mov.u32 %r14, 1; \n\t" -" mov.u32 %r12, 128; \n\t" -" mov.u32 %r15, 0; \n\t" -" mov.u64 %rd11, 0; \n\t" -" mov.u64 %rd9, mandelbrot_scanline___UM_unfunfunfunfuniuniuniuniuniun_3C_uni_3E_; \n\t" -" // inline asm \n\t" -" { \n\t" -" .param .b64 param0; \n\t" -" st.param.b64 [param0+0], %rd9; \n\t" -" .param .b64 param1; \n\t" -" st.param.b64 [param1+0], %rd12; \n\t" -" .param .align 4 .b8 param2[12]; \n\t" -" st.param.b32 [param2+0], %r9; \n\t" -" st.param.b32 [param2+4], %r10; \n\t" -" st.param.b32 [param2+8], %r14; \n\t" -" .param .align 4 .b8 param3[12]; \n\t" -" st.param.b32 [param3+0], %r12; \n\t" -" st.param.b32 [param3+4], %r14; \n\t" -" st.param.b32 [param3+8], %r14; \n\t" -" .param .b32 param4; \n\t" -" st.param.b32 [param4+0], %r15; \n\t" -" .param .b64 param5; \n\t" -" st.param.b64 [param5+0], %rd11; \n\t" -" \n\t" -" .param .b32 retval0; \n\t" -" call.uni (retval0), \n\t" -" cudaLaunchDevice, \n\t" -" ( \n\t" -" param0, \n\t" -" param1, \n\t" -" param2, \n\t" -" param3, \n\t" -" param4, \n\t" -" param5 \n\t" -" ); \n\t" -" ld.param.b32 %r8, [retval0+0]; \n\t" -" } \n\t" -" \n\t" -" // inline asm \n\t" -" \n\t" -" BB2_5: \n\t" -" // Callseq Start 4 \n\t" -" { \n\t" -" .reg .b32 temp_param_reg; \n\t" -" .param .b32 retval0; \n\t" -" call.uni (retval0), \n\t" -" cudaDeviceSynchronize, \n\t" -" ( \n\t" -" ); \n\t" -" ld.param.b32 %r25, [retval0+0]; \n\t" -" } \n\t" -" // Callseq End 4 \n\t" -" // Callseq Start 5 \n\t" -" { \n\t" -" .reg .b32 temp_param_reg; \n\t" -" .param .b32 retval0; \n\t" -" call.uni (retval0), \n\t" -" cudaDeviceSynchronize, \n\t" -" ( \n\t" -" ); \n\t" -" ld.param.b32 %r26, [retval0+0]; \n\t" -" } \n\t" -" // Callseq End 5 \n\t" -" ret; \n\t" -); -} - -extern "C" -void mandelbrot_ispc( - __f32 param_0, - __f32 param_1, - __f32 param_2, - __f32 param_3, - __u32 param_4, - __u32 param_5, - __u32 param_6, - __u64 param_7) -{ - mandelbrot_ispc__kernel<<<1,32>>>( - param_0, - param_1, - param_2, - param_3, - param_4, - param_5, - param_6, - param_7); - cudaDeviceSynchronize(); -} diff --git a/examples_ptx/common_gpu.mk b/examples_ptx/common_gpu.mk new file mode 100644 index 00000000..46aa5190 --- /dev/null +++ b/examples_ptx/common_gpu.mk @@ -0,0 +1,68 @@ +NVCC_SRC=../nvcc_helpers.cu +NVCC_OBJS=objs_gpu/nvcc_helpers_nvcc.o +# +CXX=g++ +CXXFLAGS=-O3 -I$(CUDATK)/include -Iobjs_gpu/ -D_CUDA_ +# +NVCC=nvcc +NVCC_FLAGS=-O3 -arch=sm_35 -D_CUDA_ +# +LD=nvcc +LDFLAGS=-lcudart -lcudadevrt -arch=sm_35 +# +PTXCC=ptxcc +PTXCC_FLAGS = -maxrregcount=32 -Xptxas=-v +# +ISPC=ispc +ISPC_FLAGS=-O3 --math-lib=default --target=nvptx64 --opt=fast-math +# +# +# +ISPC_OBJS=$(ISPC_SRC:%.ispc=objs_gpu/%_ispc.o) +ISPC_BCS=$(ISPC_SRC:%.ispc=objs_gpu/%_ispc.bc) +ISPC_HEADERS=$(ISPC_SRC:%.ispc=objs_gpu/%_ispc.h) +CXX_OBJS=$(CXX_SRC:%.cpp=objs_gpu/%_gcc.o) +#NVCC_OBJS=$(NVCC_SRC:%.cu=objs_gpu/%_nvcc.o) + +# PTXGEN = $(HOME)/ptxgen +# PTXGEN += -opt=3 +# PTXGEN += -ftz=1 -prec-div=0 -prec-sqrt=0 -fma=1 + +# .SUFFIXES: .bc .o .cu + +OBJS=$(ISPC_OBJS) $(CXX_OBJS) $(NVCC_OBJS) + +all: dirs $(PROG) $(ISPC_BCS) + +dirs: + /bin/mkdir -p objs_gpu/ + +objs_gpu/%.cpp objs_gpu/%.o objs_gpu/%.h: dirs + +clean: + echo $(CXX_OBJS) + /bin/rm -rf $(PROG) objs_gpu + +$(PROG): $(OBJS) + $(LD) -o $@ $^ $(LDFLAGS) + +objs_gpu/%_gcc.o: %.cpp $(ISPC_HEADERS) + $(CXX) $(CXXFLAGS) -o $@ -c $< +objs_gpu/%_gcc.o: ../%.cpp + $(CXX) $(CXXFLAGS) -o $@ -c $< + +objs_gpu/%_nvcc.o: ../%.cu + $(NVCC) $(NVCC_FLAGS) -o $@ -c $< +objs_gpu/%_nvcc.o: %.cu + $(NVCC) $(NVCC_FLAGS) -o $@ -c $< + +objs_gpu/%_ispc.h objs_gpu/%_ispc.bc: %.ispc + $(ISPC) $(ISPC_FLAGS) --emit-llvm -h objs_gpu/$*_ispc.h -o objs_gpu/$*_ispc.bc $< + +objs_gpu/%_ispc.o: objs_gpu/%_ispc.bc + $(PTXCC) $< $(PTXCC_FLAGS) -o $@ + + + + + diff --git a/examples_ptx/mandelbrot_tasks/Makefile b/examples_ptx/mandelbrot_tasks/Makefile_cpu similarity index 100% rename from examples_ptx/mandelbrot_tasks/Makefile rename to examples_ptx/mandelbrot_tasks/Makefile_cpu diff --git a/examples_ptx/mandelbrot_tasks/Makefile_gpu b/examples_ptx/mandelbrot_tasks/Makefile_gpu index b03fe1b5..a9522e9a 100644 --- a/examples_ptx/mandelbrot_tasks/Makefile_gpu +++ b/examples_ptx/mandelbrot_tasks/Makefile_gpu @@ -1,69 +1,8 @@ PROG=mandelbrot_tasks_gpu ISPC_SRC=mandelbrot_tasks.ispc CXX_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp -NVCC_SRC=../nvcc_helpers.cu -NVCC_OBJS=objs_gpu/nvcc_helpers_nvcc.o -CXX=g++ -CXXFLAGS=-O3 -I$(CUDATK)/include -Iobjs_gpu/ -D_CUDA_ - -NVCC=nvcc -NVCC_FLAGS=-O3 -arch=sm_35 -D_CUDA_ - -LD=nvcc -LDFLAGS=-lcudart -lcudadevrt -arch=sm_35 - -PTXCC=ptxcc -PTXCC_FLAGS = -maxrregcount=32 -Xptxas=-v - -ISPC=ispc -ISPC_FLAGS=-O3 --math-lib=default --target=nvptx64 --opt=fast-math - -# PTXGEN = $(HOME)/ptxgen -# PTXGEN += -opt=3 -# PTXGEN += -ftz=1 -prec-div=0 -prec-sqrt=0 -fma=1 - - .SUFFIXES: .bc .o .cu +include ../common_gpu.mk -ISPC_OBJS=$(ISPC_SRC:%.ispc=objs_gpu/%_ispc.o) -ISPC_BCS=$(ISPC_SRC:%.ispc=objs_gpu/%_ispc.bc) -ISPC_HEADERS=$(ISPC_SRC:%.ispc=objs_gpu/%_ispc.h) -CXX_OBJS=$(CXX_SRC:%.cpp=objs_gpu/%_gcc.o) -#NVCC_OBJS=$(NVCC_SRC:%.cu=objs_gpu/%_nvcc.o) - -OBJS=$(ISPC_OBJS) $(CXX_OBJS) $(NVCC_OBJS) - -all: dirs $(PROG) $(ISPC_BCS) - -dirs: - /bin/mkdir -p objs_gpu/ - -objs_gpu/%.cpp objs_gpu/%.o objs_gpu/%.h: dirs - -clean: - echo $(CXX_OBJS) - /bin/rm -rf $(PROG) objs_gpu - -$(PROG): $(OBJS) - $(LD) -o $@ $^ $(LDFLAGS) - -objs_gpu/%_gcc.o: %.cpp $(ISPC_HEADERS) - $(CXX) $(CXXFLAGS) -o $@ -c $< -objs_gpu/%_gcc.o: ../%.cpp - $(CXX) $(CXXFLAGS) -o $@ -c $< - -objs_gpu/%_nvcc.o: ../%.cu - $(NVCC) $(NVCC_FLAGS) -o $@ -c $< -objs_gpu/%_nvcc.o: %.cu - $(NVCC) $(NVCC_FLAGS) -o $@ -c $< - -objs_gpu/%_ispc.h objs_gpu/%_ispc.bc: %.ispc - $(ISPC) $(ISPC_FLAGS) --emit-llvm -h objs_gpu/$*_ispc.h -o objs_gpu/$*_ispc.bc $< - -objs_gpu/%_ispc.o: objs_gpu/%_ispc.bc - $(PTXCC) $< $(PTXCC_FLAGS) -o $@ - - - diff --git a/examples_ptx/mandelbrot_tasks/mandelbrot_tasks.ispc b/examples_ptx/mandelbrot_tasks/mandelbrot_tasks.ispc index 4446cfee..17b7b7bd 100644 --- a/examples_ptx/mandelbrot_tasks/mandelbrot_tasks.ispc +++ b/examples_ptx/mandelbrot_tasks/mandelbrot_tasks.ispc @@ -85,7 +85,7 @@ mandelbrot_ispc(uniform float x0, uniform float y0, uniform float dx = (x1 - x0) / width; uniform float dy = (y1 - y0) / height; const uniform int xspan = max(32, programCount*2); /* make sure it is big enough to avoid false-sharing */ - const uniform int yspan = 16; + const uniform int yspan = 16; #if 1 diff --git a/func.cpp b/func.cpp index 331d552e..0782d724 100644 --- a/func.cpp +++ b/func.cpp @@ -585,7 +585,8 @@ Function::GenerateIR() { if (g->mangleFunctionsWithTarget) functionName += std::string("_") + g->target->GetISAString(); - functionName += std::string("___export"); + if (g->target->getISA() == Target::NVPTX64) + functionName += std::string("___export"); llvm::Function *appFunction = llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module); #if defined(LLVM_3_1)