diff --git a/builtins.cpp b/builtins.cpp index fee322e7..fbc0d5a0 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -338,11 +338,13 @@ lSetInternalFunctions(llvm::Module *module) { "__all", "__any", "__aos_to_soa3_float", + "__aos_to_soa3_float1", "__aos_to_soa3_float16", "__aos_to_soa3_float4", "__aos_to_soa3_float8", "__aos_to_soa3_int32", "__aos_to_soa4_float", + "__aos_to_soa4_float1", "__aos_to_soa4_float16", "__aos_to_soa4_float4", "__aos_to_soa4_float8", @@ -351,10 +353,14 @@ lSetInternalFunctions(llvm::Module *module) { "__atomic_add_int64_global", "__atomic_add_uniform_int32_global", "__atomic_add_uniform_int64_global", + "__atomic_add_varying_int32_global", + "__atomic_add_varying_int64_global", "__atomic_and_int32_global", "__atomic_and_int64_global", "__atomic_and_uniform_int32_global", "__atomic_and_uniform_int64_global", + "__atomic_and_varying_int32_global", + "__atomic_and_varying_int64_global", "__atomic_compare_exchange_double_global", "__atomic_compare_exchange_float_global", "__atomic_compare_exchange_int32_global", @@ -363,18 +369,30 @@ lSetInternalFunctions(llvm::Module *module) { "__atomic_compare_exchange_uniform_float_global", "__atomic_compare_exchange_uniform_int32_global", "__atomic_compare_exchange_uniform_int64_global", + "__atomic_compare_exchange_varying_double_global", + "__atomic_compare_exchange_varying_float_global", + "__atomic_compare_exchange_varying_int32_global", + "__atomic_compare_exchange_varying_int64_global", "__atomic_max_uniform_int32_global", "__atomic_max_uniform_int64_global", "__atomic_min_uniform_int32_global", "__atomic_min_uniform_int64_global", + "__atomic_max_varying_int32_global", + "__atomic_max_varying_int64_global", + "__atomic_min_varying_int32_global", + "__atomic_min_varying_int64_global", "__atomic_or_int32_global", "__atomic_or_int64_global", "__atomic_or_uniform_int32_global", "__atomic_or_uniform_int64_global", + "__atomic_or_varying_int32_global", + "__atomic_or_varying_int64_global", "__atomic_sub_int32_global", "__atomic_sub_int64_global", "__atomic_sub_uniform_int32_global", "__atomic_sub_uniform_int64_global", + "__atomic_sub_varying_int32_global", + "__atomic_sub_varying_int64_global", "__atomic_swap_double_global", "__atomic_swap_float_global", "__atomic_swap_int32_global", @@ -383,14 +401,28 @@ lSetInternalFunctions(llvm::Module *module) { "__atomic_swap_uniform_float_global", "__atomic_swap_uniform_int32_global", "__atomic_swap_uniform_int64_global", + "__atomic_swap_varying_double_global", + "__atomic_swap_varying_float_global", + "__atomic_swap_varying_int32_global", + "__atomic_swap_varying_int64_global", "__atomic_umax_uniform_uint32_global", "__atomic_umax_uniform_uint64_global", "__atomic_umin_uniform_uint32_global", "__atomic_umin_uniform_uint64_global", + "__atomic_umax_varying_uint32_global", + "__atomic_umax_varying_uint64_global", + "__atomic_umin_varying_uint32_global", + "__atomic_umin_varying_uint64_global", "__atomic_xor_int32_global", "__atomic_xor_int64_global", "__atomic_xor_uniform_int32_global", "__atomic_xor_uniform_int64_global", + "__atomic_xor_uniform_int32_global", + "__atomic_xor_uniform_int64_global", + "__atomic_xor_varying_int32_global", + "__atomic_xor_varying_int64_global", + "__atomic_xor_varying_int32_global", + "__atomic_xor_varying_int64_global", "__broadcast_double", "__broadcast_float", "__broadcast_i16", @@ -413,6 +445,7 @@ lSetInternalFunctions(llvm::Module *module) { "__do_assert_uniform", "__do_assert_varying", "__do_print", + "__do_print_nvptx", "__doublebits_uniform_int64", 
"__doublebits_varying_int64", "__exclusive_scan_add_double", @@ -427,6 +460,8 @@ lSetInternalFunctions(llvm::Module *module) { "__extract_int32", "__extract_int64", "__extract_int8", + "__extract_float", + "__extract_double", "__fastmath", "__float_to_half_uniform", "__float_to_half_varying", @@ -443,6 +478,8 @@ lSetInternalFunctions(llvm::Module *module) { "__insert_int32", "__insert_int64", "__insert_int8", + "__insert_float", + "__insert_double", "__intbits_uniform_double", "__intbits_uniform_float", "__intbits_varying_double", @@ -479,6 +516,7 @@ lSetInternalFunctions(llvm::Module *module) { "__min_varying_uint32", "__min_varying_uint64", "__movmsk", + "__movmsk_ptx", "__new_uniform_32rt", "__new_uniform_64rt", "__new_varying32_32rt", @@ -560,11 +598,13 @@ lSetInternalFunctions(llvm::Module *module) { "__shuffle_i64", "__shuffle_i8", "__soa_to_aos3_float", + "__soa_to_aos3_float1", "__soa_to_aos3_float16", "__soa_to_aos3_float4", "__soa_to_aos3_float8", "__soa_to_aos3_int32", "__soa_to_aos4_float", + "__soa_to_aos4_float1", "__soa_to_aos4_float16", "__soa_to_aos4_float4", "__soa_to_aos4_float8", @@ -622,6 +662,24 @@ lSetInternalFunctions(llvm::Module *module) { "__vec4_add_int32", "__vselect_float", "__vselect_i32", + "__program_index", + "__program_count", + "__warp_index", + "__task_index0", + "__task_index1", + "__task_index2", + "__task_index", + "__task_count0", + "__task_count1", + "__task_count2", + "__task_count", + "__cvt_loc2gen", + "__cvt_loc2gen_var", + "__cvt_const2gen", + "__puts_nvptx", + "ISPCAlloc", + "ISPCLaunch", + "ISPCSync", }; int count = sizeof(names) / sizeof(names[0]); @@ -694,6 +752,7 @@ AddBitcodeToModule(const unsigned char *bitcode, int length, g->target->getISA() != Target::NEON16 && g->target->getISA() != Target::NEON8) #endif // !__arm__ + if (g->target->getISA() != Target::NVPTX) { Assert(bcTriple.getArch() == llvm::Triple::UnknownArch || mTriple.getArch() == bcTriple.getArch()); @@ -855,7 +914,17 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // Next, add the target's custom implementations of the various needed // builtin functions (e.g. __masked_store_32(), etc). switch (g->target->getISA()) { - + case Target::NVPTX: + { + if (runtime32) { + fprintf(stderr, "Unforetunatly 32bit targets are supported at the moment .. 
\n"); + assert(0); + } + else { + EXPORT_MODULE(builtins_bitcode_nvptx_64bit); + } + break; + }; #ifdef ISPC_ARM_ENABLED case Target::NEON8: { if (runtime32) { @@ -1125,7 +1194,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod } // define the 'programCount' builtin variable - lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable); + if (g->target->getISA() != Target::NVPTX) + { + lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable); + } + else + { + lDefineConstantInt("programCount", 32, module, symbolTable); + } // define the 'programIndex' builtin lDefineProgramIndex(module, symbolTable); @@ -1155,6 +1231,9 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod lDefineConstantInt("__have_native_rcpd", g->target->hasRcpd(), module, symbolTable); + lDefineConstantInt("__is_nvptx_target", (int)(g->target->getISA() == Target::NVPTX), + module, symbolTable); + if (g->forceAlignment != -1) { llvm::GlobalVariable *alignment = module->getGlobalVariable("memory_alignment", true); alignment->setInitializer(LLVMInt32(g->forceAlignment)); diff --git a/builtins/__do_print_nvptx.cu b/builtins/__do_print_nvptx.cu new file mode 100644 index 00000000..dc1bbcce --- /dev/null +++ b/builtins/__do_print_nvptx.cu @@ -0,0 +1,130 @@ +#include + +#define PRINT_BUF_SIZE 4096 +#define uint64_t unsigned long long + +static __device__ size_t d_strlen(const char *str) +{ + const char *s; + + for (s = str; *s; ++s) + ; + return (s - str); +} + +static __device__ char* d_strncat(char *dest, const char *src, size_t n) +{ + size_t dest_len = d_strlen(dest); + size_t i; + + for (i = 0 ; i < n && src[i] != '\0' ; i++) + dest[dest_len + i] = src[i]; + dest[dest_len + i] = '\0'; + + return dest; +} + +#define APPEND(str) \ + do { \ + int offset = bufp - &printString[0]; \ + *bufp = '\0'; \ + d_strncat(bufp, str, PRINT_BUF_SIZE-offset); \ + bufp += d_strlen(str); \ + if (bufp >= &printString[PRINT_BUF_SIZE]) \ + goto done; \ + } while (0) /* eat semicolon */ + + +#define PRINT_SCALAR(fmt, type) \ + sprintf(tmpBuf, fmt, *((type *)ptr)); \ + APPEND(tmpBuf); \ + break + +#define PRINT_VECTOR(fmt, type) \ + *bufp++ = '['; \ + if (bufp == &printString[PRINT_BUF_SIZE]) break; \ + for (int i = 0; i < width; ++i) { \ + /* only print the value if the current lane is executing */ \ + type val0 = *((type*)ptr); \ + type val = val0; \ + if (mask & (1ull< + %in0 = extractelement <2 x i32> %in, i32 0 + %in1 = extractelement <2 x i32> %in, i32 1 + %out0 = tail call i32 @$1_i32_nvptx(i32 %in0, i32 %1) + %out1 = tail call i32 @$1_i32_nvptx(i32 %in1, i32 %1) + %out2 = insertelement <2 x i32> undef, i32 %out0, i32 0 + %out = insertelement <2 x i32> %out2, i32 %out1, i32 1 + %ret = bitcast <2 x i32> %out to $2 + ret $2 %ret +} +') +shfl64(__shfl, i64) +shfl64(__shfl_xor, i64) +shfl64(__shfl, double) +shfl64(__shfl_xor, double) + +;;;;;;;;;;;;; +define internal i32 @__ballot_nvptx(i1) nounwind readnone alwaysinline +{ + %conv = zext i1 %0 to i32 + %res = tail call i32 asm sideeffect + "{ .reg .pred %p1; + setp.ne.u32 %p1, $1, 0; + vote.ballot.b32 $0, %p1; + }", "=r,r"(i32 %conv) nounwind readnone alwaysinline + ret i32 %res +} +define internal i32 @__lanemask_lt_nvptx() nounwind readnone alwaysinline +{ + %mask = tail call i32 asm sideeffect "mov.u32 $0, %lanemask_lt;", "=r"() nounwind readnone alwaysinline + ret i32 %mask +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; tasking + +;; this call allocate parameter buffer for 
kernel launch +declare i64 @cudaGetParameterBuffer(i64, i64) nounwind +define i8* @ISPCAlloc(i8**, i64 %size, i32 %align32) nounwind alwaysinline +{ +entry: + %and = call i32 @__program_index() + %cmp = icmp eq i32 %and, 0 + %align = zext i32 %align32 to i64 + br i1 %cmp, label %if.then, label %if.end + +if.then: + %ptri64tmp = call i64 @cudaGetParameterBuffer(i64 %align, i64 %size); + br label %if.end + +if.end: + %ptri64 = phi i64 [ %ptri64tmp, %if.then ], [ 0, %entry ] + %ptr = inttoptr i64 %ptri64 to i8* + ret i8* %ptr +} + +;; this actually launches kernel a kernel +module asm " +.extern .func (.param .b32 func_retval0) cudaLaunchDevice +( + .param .b64 cudaLaunchDevice_param_0, + .param .b64 cudaLaunchDevice_param_1, + .param .align 4 .b8 cudaLaunchDevice_param_2[12], + .param .align 4 .b8 cudaLaunchDevice_param_3[12], + .param .b32 cudaLaunchDevice_param_4, + .param .b64 cudaLaunchDevice_param_5 +); +" +define void @ISPCLaunch(i8**, i8* %func_ptr, i8* %func_args, i32 %ntx, i32 %nty, i32 %ntz) nounwind alwaysinline +{ +entry: +;; only 1 lane must launch the kernel !!! + %func_i64 = ptrtoint i8* %func_ptr to i64 + %args_i64 = ptrtoint i8* %func_args to i64 + +;; nbx = (%ntx-1)/(blocksize/warpsize) + 1 for blocksize=128 & warpsize=32 + %ntxm1 = add nsw i32 %ntx, -1 +;; %ntxm1d4 = sdiv i32 %ntxm1, 4 + %ntxm1d4 = ashr i32 %ntxm1, 2 + %nbx = add nsw i32 %ntxm1d4, 1 + %and = call i32 @__program_index() +;; if (laneIdx == 0) + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: + + %res_tmp = call i32 asm sideeffect "{ + .param .b64 param0; + st.param.b64 [param0+0], $1; + .param .b64 param1; + st.param.b64 [param1+0], $2; + .param .align 4 .b8 param2[12]; + st.param.b32 [param2+0], $3; + st.param.b32 [param2+4], $4; + st.param.b32 [param2+8], $5; + .param .align 4 .b8 param3[12]; + st.param.b32 [param3+0], $6; + st.param.b32 [param3+4], $7; + st.param.b32 [param3+8], $8; + .param .b32 param4; + st.param.b32 [param4+0], $9; + .param .b64 param5; + st.param.b64 [param5+0], $10; + + .param .b32 retval0; + call.uni (retval0), + cudaLaunchDevice, + ( + param0, + param1, + param2, + param3, + param4, + param5 + ); + ld.param.b32 $0, [retval0+0]; + } + ", +"=r, l,l, r,r,r, r,r,r, r,l"( + i64 %func_i64,i64 %args_i64, + i32 %nbx,i32 %nty,i32 %ntz, + i32 128,i32 1,i32 1, i32 0,i64 0); + br label %if.end + +if.end: ; preds = %if.then, %entry +;; %res = phi i32 [ %res_tmp, %if.then ], [ undef, %entry ] + + ret void +} + +;; this synchronizes kernel +declare i32 @cudaDeviceSynchronize() nounwind +define void @ISPCSync(i8*) nounwind alwaysinline +{ + call i32 @cudaDeviceSynchronize() + ret void; +} + + +;;;;;;;;;;;;;; + + + +include(`util-nvptx.m4') + +stdlib_core() +packed_load_and_store() +int64minmax() +rdrand_decls() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; broadcast/rotate/shuffle + +define_shuffles() + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; aos/soa + +aossoa() + +;; dummy 1 wide vector ops +declare void +@__aos_to_soa4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, + <1 x float> %v3, <1 x float> * noalias %out0, + <1 x float> * noalias %out1, <1 x float> * noalias %out2, + <1 x float> * noalias %out3) nounwind alwaysinline ; + +declare void +@__soa_to_aos4_float1(<1 x float> %v0, <1 x float> %v1, <1 x float> %v2, + <1 x float> %v3, <1 x float> * noalias %out0, + <1 x float> * noalias %out1, <1 x float> * noalias %out2, + <1 x float> * noalias %out3) nounwind alwaysinline 
; + +declare void +@__aos_to_soa3_float1(<1 x float> %v0, <1 x float> %v1, + <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1, + <1 x float> * %out2); + +declare void +@__soa_to_aos3_float1(<1 x float> %v0, <1 x float> %v1, + <1 x float> %v2, <1 x float> * %out0, <1 x float> * %out1, + <1 x float> * %out2); + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare float @llvm.convert.from.fp16(i16) nounwind readnone +declare i16 @llvm.convert.to.fp16(float) nounwind readnone +define float @__half_to_float_uniform(i16 %v) nounwind readnone alwaysinline +{ + ;; %res = call float @llvm.convert.from.fp16(i16 %v) + %res = tail call float asm sideeffect + "{ .reg .f16 tmp; + mov.b16 tmp, $1; + cvt.f32.f16 $0, tmp; + }", "=f,h"(i16 %v) nounwind readnone alwaysinline + ret float %res +} +define i16 @__float_to_half_uniform(float %v) nounwind readnone alwaysinline +{ + ;; this will break the compiler, use inline asm similarly to above case + ;; %half = call i16 @llvm.convert.to.fp16(float %v) + %half = tail call i16 asm sideeffect + "{ .reg .f16 tmp; + cvt.rn.f16.f32 tmp, $1; + mov.b16 $0, tmp; + }", "=h,f"(float %v) nounwind readnone alwaysinline + ret i16 %half +} +define @__half_to_float_varying( %v) nounwind readnone alwaysinline +{ + %el = extractelement <1 x i16> %v, i32 0 + %sf = call float @__half_to_float_uniform(i16 %el) + %vf = insertelement <1 x float> undef, float %sf, i32 0 + ret <1 x float> %vf; +} +define @__float_to_half_varying( %v) nounwind readnone alwaysinline +{ + %el = extractelement <1 x float> %v, i32 0 + %sh = call i16 @__float_to_half_uniform(float %el) + %vh = insertelement <1 x i16> undef, i16 %sh, i32 0 + ret <1 x i16> %vh; +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +declare void @__fastmath() nounwind + +;; round/floor/ceil + +define internal float @__round_uniform_float_ptx(float) nounwind readnone alwaysinline +{ + %2 = tail call float asm sideeffect + "{ .reg .pred p<3>; .reg .s32 r<4>; .reg .f32 f<10>; + mov.f32 f4, $1; + abs.f32 f5, f4; + mov.b32 r1, f4; + and.b32 r2, r1, -2147483648; + or.b32 r3, r2, 1056964608; + mov.b32 f6, r3; + add.f32 f7, f6, f4; + cvt.rzi.f32.f32 f8, f7; + setp.gt.f32 p1, f5, 0f4B000000; + selp.f32 f9, f4, f8, p1; + setp.geu.f32 p2, f5, 0f3F000000; + @p2 bra BB2_2; + cvt.rzi.f32.f32 f9, f4; +BB2_2: + mov.f32 $0, f9; + }", "=f,f"(float %0) nounwind readnone alwaysinline + ret float %2 +} +define float @__round_uniform_float(float) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast float %0 to <1 x i32> + %bitop.i.i = and <1 x i32> %float_to_int_bitcast.i.i.i.i, + %bitop.i = xor <1 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <1 x i32> %bitop.i to <1 x float> + %binop.i = fadd <1 x float> %int_to_float_bitcast.i.i40.i, + %binop21.i = fadd <1 x float> %binop.i, + %float_to_int_bitcast.i.i.i = bitcast <1 x float> %binop21.i to <1 x i32> + %bitop31.i = xor <1 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop31.i to float + ret float %int_to_float_bitcast.i.i.i +} +define float @__floor_uniform_float(float) nounwind readnone alwaysinline +{ + %2 = tail call float asm sideeffect "cvt.rmi.f32.f32 $0, $1;", "=f,f"(float %0) nounwind alwaysinline readnone + ret float %2 +} +define float @__ceil_uniform_float(float) nounwind readnone alwaysinline +{ + %2 = tail call float asm sideeffect "cvt.rpi.f32.f32 $0, 
$1;", "=f,f"(float %0) nounwind alwaysinline readnone + ret float %2 +} + +define double @__round_uniform_double(double) nounwind readnone alwaysinline +{ + %2 = tail call double asm sideeffect + "{ + .reg .pred p<3>; + .reg .s32 r<6>; + .reg .f64 fd<9>; + + mov.f64 fd8, $1 + abs.f64 fd1, fd8; + setp.ge.f64 p1, fd1, 0d4330000000000000; + @p1 bra BB5_2; + + add.f64 fd5, fd1, 0d3FE0000000000000; + cvt.rzi.f64.f64 fd6, fd5; + setp.lt.f64 p2, fd1, 0d3FE0000000000000; + selp.f64 fd7, 0d0000000000000000, fd6, p2; + { + .reg .b32 temp; + mov.b64 {r1, temp}, fd7; + } + { + .reg .b32 temp; + mov.b64 {temp, r2}, fd7; + } + { + .reg .b32 temp; + mov.b64 {temp, r3}, fd8; + } + and.b32 r4, r3, -2147483648; + or.b32 r5, r2, r4; + mov.b64 fd8, {r1, r5}; + +BB5_2: + mov.f64 $0, fd8; + }", "=d,d"(double %0) nounwind readnone alwaysinline + ret double %2 +} +define double @__floor_uniform_double(double) nounwind readnone alwaysinline +{ + %2 = tail call double asm sideeffect "cvt.rmi.f64.f64 $0, $1;", "=f,f"(double %0) nounwind alwaysinline readnone + ret double %2 +} +define double @__ceil_uniform_double(double) nounwind readnone alwaysinline +{ + %2 = tail call double asm sideeffect "cvt.rpi.f64.f64 $0, $1;", "=f,f"(double %0) nounwind alwaysinline readnone + ret double %2 +} + +define internal <1 x float> @__floor_varying_floatX(<1 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <1 x float> @__round_varying_float(<1 x float> %0) nounwind + %bincmp.i = fcmp ogt <1 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <1 x i1> %bincmp.i to <1 x i32> + %bitop.i = and <1 x i32> %val_to_boolvec32.i, + %int_to_float_bitcast.i.i.i = bitcast <1 x i32> %bitop.i to <1 x float> + %binop.i = fadd <1 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <1 x float> %binop.i +} + +define(`rfc_varying',` +define <1 x $2> @__$1_varying_$2(<1 x $2>) nounwind readonly alwaysinline +{ + %val = extractelement <1 x $2> %0, i32 0 + %res = call $2 @__$1_uniform_$2($2 %val) + %ret = insertelement <1 x $2> undef, $2 %res, i32 0 + ret <1 x $2> %ret +} +') +rfc_varying(round, float) +rfc_varying(floor, float) +rfc_varying(ceil, float) +rfc_varying(round, double) +rfc_varying(floor, double) +rfc_varying(ceil, double) + +;; min/max uniform + +;; declare float @__max_uniform_float(float, float) nounwind readnone +;; declare float @__min_uniform_float(float, float) nounwind readnone +define float @__max_uniform_float(float, float) nounwind readonly alwaysinline { + %d = fcmp ogt float %0, %1 + %r = select i1 %d, float %0, float %1 + ret float %r + +} +define float @__min_uniform_float(float, float) nounwind readonly alwaysinline { + %d = fcmp olt float %0, %1 + %r = select i1 %d, float %0, float %1 + ret float %r + +} + +;; declare i32 @__min_uniform_int32(i32, i32) nounwind readnone +;; declare i32 @__max_uniform_int32(i32, i32) nounwind readnone +define i32 @__min_uniform_int32(i32, i32) nounwind readonly alwaysinline { + %c = icmp slt i32 %0, %1 + %r = select i1 %c, i32 %0, i32 %1 + ret i32 %r +} +define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline { + %c = icmp sgt i32 %0, %1 + %r = select i1 %c, i32 %0, i32 %1 + ret i32 %r +} + +;; declare i32 @__min_uniform_uint32(i32, i32) nounwind readnone +;; declare i32 @__max_uniform_uint32(i32, i32) nounwind readnone +define i32 @__min_uniform_uint32(i32, i32) nounwind readonly alwaysinline { + %c = icmp ult i32 %0, %1 + %r = select i1 %c, i32 %0, i32 %1 + ret i32 %r +} +define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline { + 
%c = icmp ugt i32 %0, %1 + %r = select i1 %c, i32 %0, i32 %1 + ret i32 %r +} + +;; declare i64 @__min_uniform_int64(i64, i64) nounwind readnone +;; declare i64 @__max_uniform_int64(i64, i64) nounwind readnone +define internal i64 @__min_uniform_int64X(i64, i64) nounwind readonly alwaysinline { + %c = icmp slt i64 %0, %1 + %r = select i1 %c, i64 %0, i64 %1 + ret i64 %r +} +define internal i64 @__max_uniform_int64X(i64, i64) nounwind readonly alwaysinline { + %c = icmp sgt i64 %0, %1 + %r = select i1 %c, i64 %0, i64 %1 + ret i64 %r +} + +;; declare i64 @__min_uniform_uint64(i64, i64) nounwind readnone +;; declare i64 @__max_uniform_uint64(i64, i64) nounwind readnone +define internal i64 @__min_uniform_uint64X(i64, i64) nounwind readonly alwaysinline { + %c = icmp ult i64 %0, %1 + %r = select i1 %c, i64 %0, i64 %1 + ret i64 %r +} +define internal i64 @__max_uniform_uint64X(i64, i64) nounwind readonly alwaysinline { + %c = icmp ugt i64 %0, %1 + %r = select i1 %c, i64 %0, i64 %1 + ret i64 %r +} + +define double @__max_uniform_double(double, double) nounwind readonly alwaysinline { + %d = fcmp ogt double %0, %1 + %r = select i1 %d, double %0, double %1 + ret double %r +} +define double @__min_uniform_double(double, double) nounwind readonly alwaysinline { + %d = fcmp olt double %0, %1 + %r = select i1 %d, double %0, double %1 + ret double %r +} + +;; min/max uniform + + +define(`minmax_vy',` +define <1 x $2> @__$1_varying_$3(<1 x $2>, <1 x $2>) nounwind readnone alwaysinline +{ + %v0 = extractelement <1 x $2> %0, i32 0 + %v1 = extractelement <1 x $2> %1, i32 0 + %r = call $2 @__$1_uniform_$3($2 %v0, $2 %v1) + %ret = insertelement <1 x $2> undef, $2 %r, i32 0 + ret <1 x $2> %ret; +} +') +minmax_vy(min, i32, int32) +minmax_vy(max, i32, int32) +minmax_vy(min, i32, uint32) +minmax_vy(max, i32, uint32) +minmax_vy(min, float, float) +minmax_vy(max, float, float) +minmax_vy(min, double, double) +minmax_vy(max, double, double) + +;; sqrt/rsqrt/rcp + +declare float @llvm.nvvm.rsqrt.approx.f(float %f) nounwind readonly alwaysinline +declare float @llvm.nvvm.sqrt.f(float %f) nounwind readonly alwaysinline +declare double @llvm.nvvm.rsqrt.approx.d(double %f) nounwind readonly alwaysinline +declare double @llvm.sqrt.f64(double %f) nounwind readonly alwaysinline + +;; declare float @__rcp_uniform_float(float) nounwind readnone +define float @__rcp_uniform_float(float) nounwind readonly alwaysinline { +; uniform float iv = extract(__rcp_u(v), 0); +; return iv * (2. 
- v * iv); + %ret = fdiv float 1.,%0 +; %ret = tail call float asm sideeffect "rcp.approx.ftz.f32 $0, $1;", "=f,f"(float %0) nounwind readnone alwaysinline + ret float %ret +} +;; declare float @__sqrt_uniform_float(float) nounwind readnone +define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline { + %ret = call float @llvm.nvvm.sqrt.f(float %0) +; %ret = tail call float asm sideeffect "sqrt.approx.ftz.f32 $0, $1;", "=f,f"(float %0) nounwind readnone alwaysinline + ret float %ret +} +;; declare float @__rsqrt_uniform_float(float) nounwind readnone +define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline +{ + %ret = call float @llvm.nvvm.rsqrt.approx.f(float %0) +; %ret = tail call float asm sideeffect "rsqrt.approx.ftz.f32 $0, $1;", "=f,f"(float %0) nounwind readnone alwaysinline + ret float %ret +} + +define @__rcp_varying_float() nounwind readnone alwaysinline +{ + %v = extractelement <1 x float> %0, i32 0 + %r = call float @__rcp_uniform_float(float %v) + %rv = insertelement <1 x float> undef, float %r, i32 0 + ret %rv +} +define @__rsqrt_varying_float() nounwind readnone alwaysinline +{ + %v = extractelement <1 x float> %0, i32 0 + %r = call float @__rsqrt_uniform_float(float %v) + %rv = insertelement <1 x float> undef, float %r, i32 0 + ret %rv +} +define @__sqrt_varying_float() nounwind readnone alwaysinline +{ + %v = extractelement <1 x float> %0, i32 0 + %r = call float @__sqrt_uniform_float(float %v) + %rv = insertelement <1 x float> undef, float %r, i32 0 + ret %rv +} + +;; declare double @__sqrt_uniform_double(double) nounwind readnone +define double @__sqrt_uniform_double(double) nounwind readonly alwaysinline { + %ret = call double @llvm.sqrt.f64(double %0) + ret double %ret +} +define @__sqrt_varying_double() nounwind readnone alwaysinline +{ + %v = extractelement <1 x double> %0, i32 0 + %r = call double @__sqrt_uniform_double(double %v) + %rv = insertelement <1 x double> undef, double %r, i32 0 + ret %rv +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; population count + +declare i32 @llvm.ctpop.i32(i32) nounwind readnone +define i32 @__popcnt_int32(i32) nounwind readonly alwaysinline { + %call = call i32 @llvm.ctpop.i32(i32 %0) + ret i32 %call +;; %res = tail call i32 asm sideeffect "popc.b32 $0, $1;", "=r,r"(i32 %0) nounwind readnone alwaysinline + ;; ret i32 %res +} + +declare i64 @llvm.ctpop.i64(i64) nounwind readnone +define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline { + %call = call i64 @llvm.ctpop.i64(i64 %0) + ret i64 %call +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; binary prefix sum + +define internal i64 @__warpBinExclusiveScan(i1 %p) nounwind readonly alwaysinline +{ +entry: + %call = call i32 @__ballot_nvptx(i1 zeroext %p) + %call1 = call i32 @__popcnt_int32(i32 %call) + %call2 = call i32 @__lanemask_lt_nvptx() + %and = and i32 %call2, %call + %call3 = call i32 @__popcnt_int32(i32 %and) + %retval.sroa.1.4.insert.ext.i = zext i32 %call3 to i64 + %retval.sroa.1.4.insert.shift.i = shl nuw i64 %retval.sroa.1.4.insert.ext.i, 32 + %retval.sroa.0.0.insert.ext.i = zext i32 %call1 to i64 + %retval.sroa.0.0.insert.insert.i = or i64 %retval.sroa.1.4.insert.shift.i, %retval.sroa.0.0.insert.ext.i + ret i64 %retval.sroa.0.0.insert.insert.i +} + +ctlztz() + +; FIXME: need either to wire these up to the 8-wide SVML entrypoints, +; or, use the macro to call the 4-wide ones twice with our 8-wide +; vectors... 
+ +;; svml is not support in PTX, will generate linking error + +include(`svml.m4') +svml_stubs(float,f,WIDTH) +svml_stubs(double,d,WIDTH) + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reductions + +define i64 @__movmsk(<1 x i1>) nounwind readnone alwaysinline { + %v = extractelement <1 x i1> %0, i32 0 + %v64 = zext i1 %v to i64 + ret i64 %v64 +} +define i64 @__movmsk_ptx(<1 x i1>) nounwind readnone alwaysinline { + %v = extractelement <1 x i1> %0, i32 0 + %v0 = call i32 @__ballot_nvptx(i1 %v) + %v64 = zext i32 %v0 to i64 + ret i64 %v64 +} + +define i1 @__any(<1 x i1>) nounwind readnone alwaysinline { + %v = extractelement <1 x i1> %0, i32 0 + %res = call i32 @__ballot_nvptx(i1 %v) + %cmp = icmp ne i32 %res, 0 + ret i1 %cmp +} + +define i1 @__all(<1 x i1>) nounwind readnone alwaysinline { + %v = extractelement <1 x i1> %0, i32 0 + %res0 = call i32 @__ballot_nvptx(i1 %v) + %cmp = icmp eq i32 %res0, -1 + ret i1 %cmp +} + +define i1 @__none(<1 x i1>) nounwind readnone alwaysinline { + %v = extractelement <1 x i1> %0, i32 0 + %res = call i32 @__ballot_nvptx(i1 %v) + %cmp = icmp eq i32 %res, 0 + ret i1 %cmp +} + +;;;;;;;;; reductions i8 +define i16 @__reduce_add_int8(<1 x i8> %v) nounwind readnone alwaysinline { + %value8 = extractelement <1 x i8> %v, i32 0 + %value = zext i8 %value8 to i16 + %call = tail call i16 @__shfl_xor_i16_nvptx(i16 %value, i32 16) + %call1 = add i16 %call, %value + %call.1 = tail call i16 @__shfl_xor_i16_nvptx(i16 %call1, i32 8) + %call1.1 = add i16 %call1, %call.1 + %call.2 = tail call i16 @__shfl_xor_i16_nvptx(i16 %call1.1, i32 4) + %call1.2 = add i16 %call1.1, %call.2 + %call.3 = tail call i16 @__shfl_xor_i16_nvptx(i16 %call1.2, i32 2) + %call1.3 = add i16 %call1.2, %call.3 + %call.4 = tail call i16 @__shfl_xor_i16_nvptx(i16 %call1.3, i32 1) + %call1.4 = add i16 %call1.3, %call.4 + ret i16 %call1.4 +} +;;;;;;;;; reductions i16 +define i32 @__reduce_add_int16(<1 x i16> %v) nounwind readnone alwaysinline { + %value16 = extractelement <1 x i16> %v, i32 0 + %value = zext i16 %value16 to i32 + %call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16) + %call1 = add i32 %call, %value + %call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8) + %call1.1 = add i32 %call1, %call.1 + %call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4) + %call1.2 = add i32 %call1.1, %call.2 + %call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2) + %call1.3 = add i32 %call1.2, %call.3 + %call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.3, i32 1) + %call1.4 = add i32 %call1.3, %call.4 + ret i32 %call1.4 +} + +;;;;;;;;; reductions float +define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline { + %value = extractelement <1 x float> %v, i32 0 + %call = tail call float @__shfl_xor_float_nvptx(float %value, i32 16) + %call1 = fadd float %call, %value + %call.1 = tail call float @__shfl_xor_float_nvptx(float %call1, i32 8) + %call1.1 = fadd float %call1, %call.1 + %call.2 = tail call float @__shfl_xor_float_nvptx(float %call1.1, i32 4) + %call1.2 = fadd float %call1.1, %call.2 + %call.3 = tail call float @__shfl_xor_float_nvptx(float %call1.2, i32 2) + %call1.3 = fadd float %call1.2, %call.3 + %call.4 = tail call float @__shfl_xor_float_nvptx(float %call1.3, i32 1) + %call1.4 = fadd float %call1.3, %call.4 + ret float %call1.4 +} +define float @__reduce_min_float(<1 x float>) nounwind readnone alwaysinline { + %value = extractelement <1 x float> %0, i32 0 + %call = tail call float 
@__shfl_xor_float_nvptx(float %value, i32 16) + %call1 = tail call float @__fminf_nvptx(float %value, float %call) + %call.1 = tail call float @__shfl_xor_float_nvptx(float %call1, i32 8) + %call1.1 = tail call float @__fminf_nvptx(float %call1, float %call.1) + %call.2 = tail call float @__shfl_xor_float_nvptx(float %call1.1, i32 4) + %call1.2 = tail call float @__fminf_nvptx(float %call1.1, float %call.2) + %call.3 = tail call float @__shfl_xor_float_nvptx(float %call1.2, i32 2) + %call1.3 = tail call float @__fminf_nvptx(float %call1.2, float %call.3) + %call.4 = tail call float @__shfl_xor_float_nvptx(float %call1.3, i32 1) + %call1.4 = tail call float @__fminf_nvptx(float %call1.3, float %call.4) + ret float %call1.4 +} +define float @__reduce_max_float(<1 x float>) nounwind readnone alwaysinline { + %value = extractelement <1 x float> %0, i32 0 + %call = tail call float @__shfl_xor_float_nvptx(float %value, i32 16) + %call1 = tail call float @__fmaxf_nvptx(float %value, float %call) + %call.1 = tail call float @__shfl_xor_float_nvptx(float %call1, i32 8) + %call1.1 = tail call float @__fmaxf_nvptx(float %call1, float %call.1) + %call.2 = tail call float @__shfl_xor_float_nvptx(float %call1.1, i32 4) + %call1.2 = tail call float @__fmaxf_nvptx(float %call1.1, float %call.2) + %call.3 = tail call float @__shfl_xor_float_nvptx(float %call1.2, i32 2) + %call1.3 = tail call float @__fmaxf_nvptx(float %call1.2, float %call.3) + %call.4 = tail call float @__shfl_xor_float_nvptx(float %call1.3, i32 1) + %call1.4 = tail call float @__fmaxf_nvptx(float %call1.3, float %call.4) + ret float %call1.4 +} + +;;;;;;;;; reductions int32 +define i32 @__reduce_add_int32(<1 x i32>) nounwind readnone alwaysinline { + %value = extractelement <1 x i32> %0, i32 0 + %call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16) + %call1 = add i32 %call, %value + %call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8) + %call1.1 =add i32 %call1, %call.1 + %call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4) + %call1.2 = add i32 %call1.1, %call.2 + %call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2) + %call1.3 = add i32 %call1.2, %call.3 + %call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.3, i32 1) + %call1.4 = add i32 %call1.3, %call.4 + ret i32 %call1.4 +} +define i32 @__reduce_min_int32(<1 x i32>) nounwind readnone alwaysinline { + %value = extractelement <1 x i32> %0, i32 0 + %call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16) + %call1 = tail call i32 @__min_i32_signed(i32 %value, i32 %call) + %call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8) + %call1.1 = tail call i32 @__min_i32_signed(i32 %call1, i32 %call.1) + %call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4) + %call1.2 = tail call i32 @__min_i32_signed(i32 %call1.1, i32 %call.2) + %call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2) + %call1.3 = tail call i32 @__min_i32_signed(i32 %call1.2, i32 %call.3) + %call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.3, i32 1) + %call1.4 = tail call i32 @__min_i32_signed(i32 %call1.3, i32 %call.4) + ret i32 %call1.4 +} +define i32 @__reduce_max_int32(<1 x i32>) nounwind readnone alwaysinline { + %value = extractelement <1 x i32> %0, i32 0 + %call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16) + %call1 = tail call i32 @__max_i32_signed(i32 %value, i32 %call) + %call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8) + %call1.1 = tail call i32 @__max_i32_signed(i32 %call1, i32 %call.1) + 
%call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4) + %call1.2 = tail call i32 @__max_i32_signed(i32 %call1.1, i32 %call.2) + %call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2) + %call1.3 = tail call i32 @__max_i32_signed(i32 %call1.2, i32 %call.3) + %call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.3, i32 1) + %call1.4 = tail call i32 @__max_i32_signed(i32 %call1.3, i32 %call.4) + ret i32 %call1.4 +} + +;;;;;;;;; reductions uint32 +define i32 @__reduce_min_uint32(<1 x i32>) nounwind readnone alwaysinline { + %value = extractelement <1 x i32> %0, i32 0 + %call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16) + %call1 = tail call i32 @__min_i32_unsigned(i32 %value, i32 %call) + %call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8) + %call1.1 = tail call i32 @__min_i32_unsigned(i32 %call1, i32 %call.1) + %call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4) + %call1.2 = tail call i32 @__min_i32_unsigned(i32 %call1.1, i32 %call.2) + %call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2) + %call1.3 = tail call i32 @__min_i32_unsigned(i32 %call1.2, i32 %call.3) + %call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.3, i32 1) + %call1.4 = tail call i32 @__min_i32_unsigned(i32 %call1.3, i32 %call.4) + ret i32 %call1.4 +} +define i32 @__reduce_max_uint32(<1 x i32>) nounwind readnone alwaysinline { + %value = extractelement <1 x i32> %0, i32 0 + %call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16) + %call1 = tail call i32 @__max_i32_unsigned(i32 %value, i32 %call) + %call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8) + %call1.1 = tail call i32 @__max_i32_unsigned(i32 %call1, i32 %call.1) + %call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4) + %call1.2 = tail call i32 @__max_i32_unsigned(i32 %call1.1, i32 %call.2) + %call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2) + %call1.3 = tail call i32 @__max_i32_unsigned(i32 %call1.2, i32 %call.3) + %call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.3, i32 1) + %call1.4 = tail call i32 @__max_i32_unsigned(i32 %call1.3, i32 %call.4) + ret i32 %call1.4 + } + +;;;;;;;;; reductions double +define double @__reduce_add_double(<1 x double>) nounwind readnone alwaysinline { + %value = extractelement <1 x double> %0, i32 0 + %call = tail call double @__shfl_xor_double_nvptx(double %value, i32 16) + %call1 = fadd double %call, %value + %call.1 = tail call double @__shfl_xor_double_nvptx(double %call1, i32 8) + %call1.1 = fadd double %call1, %call.1 + %call.2 = tail call double @__shfl_xor_double_nvptx(double %call1.1, i32 4) + %call1.2 = fadd double %call1.1, %call.2 + %call.3 = tail call double @__shfl_xor_double_nvptx(double %call1.2, i32 2) + %call1.3 = fadd double %call1.2, %call.3 + %call.4 = tail call double @__shfl_xor_double_nvptx(double %call1.3, i32 1) + %call1.4 = fadd double %call1.3, %call.4 + ret double %call1.4 +} +define double @__reduce_min_double(<1 x double>) nounwind readnone alwaysinline { + %value = extractelement <1 x double> %0, i32 0 + %call = tail call double @__shfl_xor_double_nvptx(double %value, i32 16) + %call1 = tail call double @__min_double(double %value, double %call) + %call.1 = tail call double @__shfl_xor_double_nvptx(double %call1, i32 8) + %call1.1 = tail call double @__min_double(double %call1, double %call.1) + %call.2 = tail call double @__shfl_xor_double_nvptx(double %call1.1, i32 4) + %call1.2 = tail call double @__min_double(double %call1.1, double %call.2) + %call.3 = tail call 
double @__shfl_xor_double_nvptx(double %call1.2, i32 2) + %call1.3 = tail call double @__min_double(double %call1.2, double %call.3) + %call.4 = tail call double @__shfl_xor_double_nvptx(double %call1.3, i32 1) + %call1.4 = tail call double @__min_double(double %call1.3, double %call.4) + ret double %call1.4 +} +define double @__reduce_max_double(<1 x double>) nounwind readnone alwaysinline { + %value = extractelement <1 x double> %0, i32 0 + %call = tail call double @__shfl_xor_double_nvptx(double %value, i32 16) + %call1 = tail call double @__max_double(double %value, double %call) + %call.1 = tail call double @__shfl_xor_double_nvptx(double %call1, i32 8) + %call1.1 = tail call double @__max_double(double %call1, double %call.1) + %call.2 = tail call double @__shfl_xor_double_nvptx(double %call1.1, i32 4) + %call1.2 = tail call double @__max_double(double %call1.1, double %call.2) + %call.3 = tail call double @__shfl_xor_double_nvptx(double %call1.2, i32 2) + %call1.3 = tail call double @__max_double(double %call1.2, double %call.3) + %call.4 = tail call double @__shfl_xor_double_nvptx(double %call1.3, i32 1) + %call1.4 = tail call double @__max_double(double %call1.3, double %call.4) + ret double %call1.4 +} + + +;;;;;;;;; reductions int64 +define i64 @__reduce_add_int64(<1 x i64>) nounwind readnone alwaysinline { + %value = extractelement <1 x i64> %0, i32 0 + %call = tail call i64 @__shfl_xor_i64_nvptx(i64 %value, i32 16) + %call1 = add i64 %call, %value + %call.1 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1, i32 8) + %call1.1 =add i64 %call1, %call.1 + %call.2 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.1, i32 4) + %call1.2 = add i64 %call1.1, %call.2 + %call.3 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.2, i32 2) + %call1.3 = add i64 %call1.2, %call.3 + %call.4 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.3, i32 1) + %call1.4 = add i64 %call1.3, %call.4 + ret i64 %call1.4 +} +define i64 @__reduce_min_int64(<1 x i64>) nounwind readnone alwaysinline { + %value = extractelement <1 x i64> %0, i32 0 + %call = tail call i64 @__shfl_xor_i64_nvptx(i64 %value, i32 16) + %call1 = tail call i64 @__min_i64_signed(i64 %value, i64 %call) + %call.1 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1, i32 8) + %call1.1 = tail call i64 @__min_i64_signed(i64 %call1, i64 %call.1) + %call.2 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.1, i32 4) + %call1.2 = tail call i64 @__min_i64_signed(i64 %call1.1, i64 %call.2) + %call.3 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.2, i32 2) + %call1.3 = tail call i64 @__min_i64_signed(i64 %call1.2, i64 %call.3) + %call.4 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.3, i32 1) + %call1.4 = tail call i64 @__min_i64_signed(i64 %call1.3, i64 %call.4) + ret i64 %call1.4 +} +define i64 @__reduce_max_int64(<1 x i64>) nounwind readnone alwaysinline { + %value = extractelement <1 x i64> %0, i32 0 + %call = tail call i64 @__shfl_xor_i64_nvptx(i64 %value, i32 16) + %call1 = tail call i64 @__max_i64_signed(i64 %value, i64 %call) + %call.1 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1, i32 8) + %call1.1 = tail call i64 @__max_i64_signed(i64 %call1, i64 %call.1) + %call.2 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.1, i32 4) + %call1.2 = tail call i64 @__max_i64_signed(i64 %call1.1, i64 %call.2) + %call.3 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.2, i32 2) + %call1.3 = tail call i64 @__max_i64_signed(i64 %call1.2, i64 %call.3) + %call.4 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.3, i32 1) + %call1.4 = tail call i64 
@__max_i64_signed(i64 %call1.3, i64 %call.4) + ret i64 %call1.4 +} +define i64 @__reduce_min_uint64(<1 x i64>) nounwind readnone alwaysinline { + %value = extractelement <1 x i64> %0, i32 0 + %call = tail call i64 @__shfl_xor_i64_nvptx(i64 %value, i32 16) + %call1 = tail call i64 @__min_i64_unsigned(i64 %value, i64 %call) + %call.1 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1, i32 8) + %call1.1 = tail call i64 @__min_i64_unsigned(i64 %call1, i64 %call.1) + %call.2 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.1, i32 4) + %call1.2 = tail call i64 @__min_i64_unsigned(i64 %call1.1, i64 %call.2) + %call.3 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.2, i32 2) + %call1.3 = tail call i64 @__min_i64_unsigned(i64 %call1.2, i64 %call.3) + %call.4 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.3, i32 1) + %call1.4 = tail call i64 @__min_i64_unsigned(i64 %call1.3, i64 %call.4) + ret i64 %call1.4 +} +define i64 @__reduce_max_uint64(<1 x i64>) nounwind readnone alwaysinline { + %value = extractelement <1 x i64> %0, i32 0 + %call = tail call i64 @__shfl_xor_i64_nvptx(i64 %value, i32 16) + %call1 = tail call i64 @__max_i64_unsigned(i64 %value, i64 %call) + %call.1 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1, i32 8) + %call1.1 = tail call i64 @__max_i64_unsigned(i64 %call1, i64 %call.1) + %call.2 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.1, i32 4) + %call1.2 = tail call i64 @__max_i64_unsigned(i64 %call1.1, i64 %call.2) + %call.3 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.2, i32 2) + %call1.3 = tail call i64 @__max_i64_unsigned(i64 %call1.2, i64 %call.3) + %call.4 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.3, i32 1) + %call1.4 = tail call i64 @__max_i64_unsigned(i64 %call1.3, i64 %call.4) + ret i64 %call1.4 +} + +;;;; reduce equal, must be tested and may fail if data has -1 +define internal i32 @__shfl_reduce_and_step_i32_nvptx(i32, i32) nounwind readnone alwaysinline +{ + %shfl = tail call i32 asm sideeffect + "{.reg .u32 r0; + .reg .pred p; + shfl.bfly.b32 r0|p, $1, $2, 0; + @p and.b32 r0, r0, $3; + mov.u32 $0, r0; + }", "=r,r,r,r"(i32 %0, i32 %1, i32 %0) nounwind readnone alwaysinline + ret i32 %shfl +} +shfl64(__shfl_reduce_and_step, i64) + +define internal i32 @__reduce_and_i32(i32 %v0, i1 %mask) nounwind readnone alwaysinline +{ + %v = select i1 %mask, i32 %v0, i32 -1 + %s1 = tail call i32 @__shfl_reduce_and_step_i32_nvptx(i32 %v, i32 16); + %s2 = tail call i32 @__shfl_reduce_and_step_i32_nvptx(i32 %s1, i32 8); + %s3 = tail call i32 @__shfl_reduce_and_step_i32_nvptx(i32 %s2, i32 4); + %s4 = tail call i32 @__shfl_reduce_and_step_i32_nvptx(i32 %s3, i32 2); + %s5 = tail call i32 @__shfl_reduce_and_step_i32_nvptx(i32 %s4, i32 1); + ret i32 %s5 +} +define internal i64 @__reduce_and_i64(i64, i1) nounwind readnone alwaysinline +{ + %v = bitcast i64 %0 to <2 x i32> + %v0 = extractelement <2 x i32> %v, i32 0 + %v1 = extractelement <2 x i32> %v, i32 1 + %s0 = call i32 @__reduce_and_i32(i32 %v0, i1 %1) + %s1 = call i32 @__reduce_and_i32(i32 %v1, i1 %1) + %tmp = insertelement <2 x i32> undef, i32 %s0, i32 0 + %res = insertelement <2 x i32> %tmp, i32 %s1, i32 1 + %ret = bitcast <2 x i32> %res to i64 + ret i64 %ret; +} + +define(`reduce_equal',` +define i1 @__reduce_equal_$2(<1 x $1> %v0, $1 * %samevalue, <1 x i1> %maskv) nounwind alwaysinline +{ +entry: + %vv = bitcast <1 x $1> %v0 to <1 x $3> + %sv = extractelement <1 x $3> %vv, i32 0 + %mask = extractelement <1 x i1> %maskv, i32 0 + + %s = call $3 @__reduce_and_$3($3 %sv, i1 %mask); + + ;; find last active lane + %nact = 
call i32 @__ballot_nvptx(i1 %mask) + %lane1 = call i32 @__count_leading_zeros_i32(i32 %nact) + %lane = sub i32 31, %lane1 + + ;; broadcast result from this lane + %r = tail call $3 @__shfl_$3_nvptx($3 %s, i32 %lane) + + ;; compare result to the original value + %c0 = icmp eq $3 %r, %sv + %c1 = and i1 %c0, %mask + %neq = call i32 @__ballot_nvptx(i1 %c1) + %cmp = icmp eq i32 %neq, %nact + + br i1 %cmp, label %all_equal, label %all_not_equal + +all_equal: + %vstore = bitcast $3 %r to $1 + store $1 %vstore, $1* %samevalue; + ret i1 true + +all_not_equal: + ret i1 false + +} +') +reduce_equal(i32, int32, i32); +reduce_equal(i64, int64, i64); +reduce_equal(float, float, i32); +reduce_equal(double, double, i64); + +;;;;;;;;;;; shuffle +define(`shuffle1', ` +define <1 x $1> @__shuffle_$1(<1 x $1>, <1 x i32>) nounwind readnone alwaysinline +{ + %val = extractelement <1 x $1> %0, i32 0 + %lane = extractelement <1 x i32> %1, i32 0 + %rets = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %lane) + %retv = insertelement <1 x $1> undef, $1 %rets, i32 0 + ret <1 x $1> %retv +} +') +shuffle1(i8) +shuffle1(i16) +shuffle1(i32) +shuffle1(i64) +shuffle1(float) +shuffle1(double) + +define(`shuffle2',` +define <1 x $1> @__shuffle2_$1(<1 x $1>, <1 x $1>, <1 x i32>) nounwind readnone alwaysinline +{ + %val1 = extractelement <1 x $1> %0, i32 0 + %val2 = extractelement <1 x $1> %1, i32 0 + + ;; fetch both values + %lane = extractelement <1 x i32> %2, i32 0 + %lane_mask = and i32 %lane, 31 + %ret1 = tail call $1 @__shfl_$1_nvptx($1 %val1, i32 %lane_mask); + %ret2 = tail call $1 @__shfl_$1_nvptx($1 %val2, i32 %lane_mask); + + ;; select the correct one + %c = icmp slt i32 %lane, 32 + %rets = select i1 %c, $1 %ret1, $1 %ret2 + %retv = insertelement <1 x $1> undef, $1 %rets, i32 0 + ret <1 x $1> %retv +} +') +shuffle2(i8) +shuffle2(i16) +shuffle2(i32) +shuffle2(i64) +shuffle2(float) +shuffle2(double) + +define(`shift',` +define <1 x $1> @__shift_$1(<1 x $1>, i32) nounwind readnone alwaysinline +{ + %val = extractelement <1 x $1> %0, i32 0 + %lane = call i32 @__program_index() + %src = add i32 %lane, %1 + %ret = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %src) + %c1 = icmp sge i32 %src, 0 + %c2 = icmp slt i32 %src, 32 + %c = and i1 %c1, %c2 + %rets = select i1 %c, $1 %ret, $1 zeroinitializer + %retv = insertelement <1 x $1> undef, $1 %rets, i32 0 + ret <1 x $1> %retv +} +') +shift(i8) +shift(i16) +shift(i32) +shift(i64) +shift(float) +shift(double) + +define(`rotate', ` +define <1 x $1> @__rotate_$1(<1 x $1>, i32) nounwind readnone alwaysinline +{ + %val = extractelement <1 x $1> %0, i32 0 + %tid = call i32 @__program_index() + %src = add i32 %tid, %1 + %lane = and i32 %src, 31 + %rets = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %lane) + %retv = insertelement <1 x $1> undef, $1 %rets, i32 0 + ret <1 x $1> %retv +} +') +rotate(i8) +rotate(i16) +rotate(i32) +rotate(i64) +rotate(float) +rotate(double) + +define(`broadcast', ` +define <1 x $1> @__broadcast_$1(<1 x $1>, i32) nounwind readnone alwaysinline +{ + %val = extractelement <1 x $1> %0, i32 0 + %rets = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %1) + %retv = insertelement <1 x $1> undef, $1 %rets, i32 0 + ret <1 x $1> %retv +} +') +broadcast(i8) +broadcast(i16) +broadcast(i32) +broadcast(i64) +broadcast(float) +broadcast(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefix sum stuff + +define internal i32 @__shfl_scan_add_step_i32(i32 %partial, i32 %up_offset) nounwind readnone alwaysinline +{ + %result = tail call i32 asm 
sideeffect + "{.reg .u32 r0; + .reg .pred p; + shfl.up.b32 r0|p, $1, $2, 0; + @p add.u32 r0, r0, $3; + mov.u32 $0, r0; + }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial) nounwind readnone alwaysinline + ret i32 %result; +} +define <1 x i32> @__exclusive_scan_add_i32(<1 x i32>, <1 x i1>) nounwind readnone alwaysinline +{ + %v0 = extractelement <1 x i32> %0, i32 0 + %mask = extractelement <1 x i1 > %1, i32 0 + %v = select i1 %mask, i32 %v0, i32 0 + + %s1 = tail call i32 @__shfl_scan_add_step_i32(i32 %v, i32 1); + %s2 = tail call i32 @__shfl_scan_add_step_i32(i32 %s1, i32 2); + %s3 = tail call i32 @__shfl_scan_add_step_i32(i32 %s2, i32 4); + %s4 = tail call i32 @__shfl_scan_add_step_i32(i32 %s3, i32 8); + %s5 = tail call i32 @__shfl_scan_add_step_i32(i32 %s4, i32 16); + %rets = sub i32 %s5, %v + %retv = insertelement <1 x i32> undef, i32 %rets, i32 0 + ret <1 x i32> %retv +} +;; +define internal i32 @__shfl_scan_or_step_i32(i32 %partial, i32 %up_offset) nounwind readnone alwaysinline +{ + %result = tail call i32 asm sideeffect + "{.reg .u32 r0; + .reg .pred p; + shfl.up.b32 r0|p, $1, $2, 0; + @p or.b32 r0, r0, $3; + mov.u32 $0, r0; + }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial) nounwind readnone alwaysinline + ret i32 %result; +} +define <1 x i32> @__exclusive_scan_or_i32(<1 x i32>, <1 x i1>) nounwind readnone alwaysinline +{ + %v0 = extractelement <1 x i32> %0, i32 0 + %mask = extractelement <1 x i1 > %1, i32 0 + %v1 = select i1 %mask, i32 %v0, i32 0 + + ;; shfl-up by one for exclusive scan + %v = tail call i32 asm sideeffect + "{.reg .u32 r0; + .reg .pred p; + shfl.up.b32 r0|p, $1, 1, 0; + @!p mov.u32 r0, 0; + mov.u32 $0, r0; + }","=r,r"(i32 %v1); + + %s1 = tail call i32 @__shfl_scan_or_step_i32(i32 %v, i32 1); + %s2 = tail call i32 @__shfl_scan_or_step_i32(i32 %s1, i32 2); + %s3 = tail call i32 @__shfl_scan_or_step_i32(i32 %s2, i32 4); + %s4 = tail call i32 @__shfl_scan_or_step_i32(i32 %s3, i32 8); + %s5 = tail call i32 @__shfl_scan_or_step_i32(i32 %s4, i32 16); + %retv = insertelement <1 x i32> undef, i32 %s5, i32 0 + ret <1 x i32> %retv +} +;; +define internal i32 @__shfl_scan_and_step_i32(i32 %partial, i32 %up_offset) nounwind readnone alwaysinline +{ + %result = tail call i32 asm sideeffect + "{.reg .u32 r0; + .reg .pred p; + shfl.up.b32 r0|p, $1, $2, 0; + @p and.b32 r0, r0, $3; + mov.u32 $0, r0; + }", "=r,r,r,r"(i32 %partial, i32 %up_offset, i32 %partial) nounwind readnone alwaysinline + ret i32 %result; +} +define <1 x i32> @__exclusive_scan_and_i32(<1 x i32>, <1 x i1>) nounwind readnone alwaysinline +{ + %v0 = extractelement <1 x i32> %0, i32 0 + %mask = extractelement <1 x i1 > %1, i32 0 + %v1 = select i1 %mask, i32 %v0, i32 -1 + + ;; shfl-up by one for exclusive scan + %v = tail call i32 asm sideeffect + "{.reg .u32 r0; + .reg .pred p; + shfl.up.b32 r0|p, $1, 1, 0; + @!p mov.u32 r0, -1; + mov.u32 $0, r0; + }","=r,r"(i32 %v1); + + %s1 = tail call i32 @__shfl_scan_and_step_i32(i32 %v, i32 1); + %s2 = tail call i32 @__shfl_scan_and_step_i32(i32 %s1, i32 2); + %s3 = tail call i32 @__shfl_scan_and_step_i32(i32 %s2, i32 4); + %s4 = tail call i32 @__shfl_scan_and_step_i32(i32 %s3, i32 8); + %s5 = tail call i32 @__shfl_scan_and_step_i32(i32 %s4, i32 16); + %retv = insertelement <1 x i32> undef, i32 %s5, i32 0 + ret <1 x i32> %retv +} + +define internal float @__shfl_scan_add_step_float(float %partial, i32 %up_offset) nounwind readnone alwaysinline +{ + %result = tail call float asm sideeffect + "{.reg .f32 f0; + .reg .pred p; + shfl.up.b32 f0|p, $1, $2, 0; + @p 
add.f32 f0, f0, $3; + mov.f32 $0, f0; + }", "=f,f,r,f"(float %partial, i32 %up_offset, float %partial) nounwind readnone alwaysinline + ret float %result; +} +define <1 x float> @__exclusive_scan_add_float(<1 x float>, <1 x i1>) nounwind readnone alwaysinline +{ + %v0 = extractelement <1 x float> %0, i32 0 + %mask = extractelement <1 x i1 > %1, i32 0 + %v = select i1 %mask, float %v0, float zeroinitializer + + %s1 = tail call float @__shfl_scan_add_step_float(float %v, i32 1); + %s2 = tail call float @__shfl_scan_add_step_float(float %s1, i32 2); + %s3 = tail call float @__shfl_scan_add_step_float(float %s2, i32 4); + %s4 = tail call float @__shfl_scan_add_step_float(float %s3, i32 8); + %s5 = tail call float @__shfl_scan_add_step_float(float %s4, i32 16); + %rets = fsub float %s5, %v + %retv = insertelement <1 x float> undef, float %rets, i32 0 + ret <1 x float> %retv +} +define internal double @__shfl_scan_add_step_double(double %partial, i32 %up_offset) nounwind readnone alwaysinline +{ + %result = tail call double asm sideeffect + "{.reg .s32 r<10>; + .reg .f64 fd0; + .reg .pred p; + .reg .b32 temp; + mov.b64 {r1,temp}, $1; + mov.b64 {temp,r2}, $1; + shfl.up.b32 r3, r1, $2, 0; + shfl.up.b32 r4|p, r2, $2, 0; + mov.b64 fd0, {r3,r4}; + @p add.f64 fd0, fd0, $3; + mov.f64 $0, fd0; + }", "=d,d,r,d"(double %partial, i32 %up_offset, double %partial) nounwind readnone alwaysinline + ret double %result; +} +define <1 x double> @__exclusive_scan_add_double(<1 x double>, <1 x i1>) nounwind readnone alwaysinline +{ + %v0 = extractelement <1 x double> %0, i32 0 + %mask = extractelement <1 x i1 > %1, i32 0 + %v = select i1 %mask, double %v0, double zeroinitializer + + %s1 = tail call double @__shfl_scan_add_step_double(double %v, i32 1); + %s2 = tail call double @__shfl_scan_add_step_double(double %s1, i32 2); + %s3 = tail call double @__shfl_scan_add_step_double(double %s2, i32 4); + %s4 = tail call double @__shfl_scan_add_step_double(double %s3, i32 8); + %s5 = tail call double @__shfl_scan_add_step_double(double %s4, i32 16); + %rets = fsub double %s5, %v + %retv = bitcast double %rets to <1 x double> + ret <1 x double> %retv +} + +define internal i64 @__shfl_scan_add_step_i64(i64 %partial, i32 %up_offset) nounwind readnone alwaysinline +{ + %result = tail call i64 asm sideeffect + "{.reg .s32 r<10>; + .reg .s64 rl0; + .reg .pred p; + .reg .b32 temp; + mov.b64 {r1,temp}, $1; + mov.b64 {temp,r2}, $1; + shfl.up.b32 r3, r1, $2, 0; + shfl.up.b32 r4|p, r2, $2, 0; + mov.b64 rl0, {r3,r4}; + @p add.s64 rl0, rl0, $3; + mov.s64 $0, rl0; + }", "=l,l,r,l"(i64 %partial, i32 %up_offset, i64 %partial) nounwind readnone alwaysinline + ret i64 %result; +} +define <1 x i64> @__exclusive_scan_add_i64(<1 x i64>, <1 x i1>) nounwind readnone alwaysinline +{ + %v0 = extractelement <1 x i64> %0, i32 0 + %mask = extractelement <1 x i1 > %1, i32 0 + %v = select i1 %mask, i64 %v0, i64 zeroinitializer + + %s1 = tail call i64 @__shfl_scan_add_step_i64(i64 %v, i32 1); + %s2 = tail call i64 @__shfl_scan_add_step_i64(i64 %s1, i32 2); + %s3 = tail call i64 @__shfl_scan_add_step_i64(i64 %s2, i32 4); + %s4 = tail call i64 @__shfl_scan_add_step_i64(i64 %s3, i32 8); + %s5 = tail call i64 @__shfl_scan_add_step_i64(i64 %s4, i32 16); + %rets = sub i64 %s5, %v + %retv = bitcast i64 %rets to <1 x i64> + ret <1 x i64> %retv +} + +define(`exclusive_scan_i64',` +define <1 x i64> @__exclusive_scan_$1_i64(<1 x i64>, <1 x i1>) nounwind readnone alwaysinline +{ + %v = bitcast <1 x i64> %0 to <2 x i32> + %v0 = extractelement <2 x i32> %v, i32 0 + 
%v1 = extractelement <2 x i32> %v, i32 1 + %inp0 = bitcast i32 %v0 to <1 x i32> + %inp1 = bitcast i32 %v1 to <1 x i32> + %res0 = call <1 x i32> @__exclusive_scan_$1_i32(<1 x i32> %inp0, <1 x i1> %1); + %res1 = call <1 x i32> @__exclusive_scan_$1_i32(<1 x i32> %inp1, <1 x i1> %1); + %r0 = bitcast <1 x i32> %res0 to i32 + %r1 = bitcast <1 x i32> %res1 to i32 + %ret0 = insertelement <2 x i32> undef, i32 %r0, i32 0 + %ret1 = insertelement <2 x i32> %ret0, i32 %r1, i32 1 + %ret = bitcast <2 x i32> %ret1 to <1 x i64> + ret <1 x i64> %ret +} +') +exclusive_scan_i64(or) +exclusive_scan_i64(and) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(float) +gen_masked_store(i64) +gen_masked_store(double) + +define void @__masked_store_blend_i8(* nocapture, , + ) nounwind alwaysinline { + %v = load * %0 + %v1 = select %2, %1, %v + store %v1, * %0 + ret void +} + +define void @__masked_store_blend_i16(* nocapture, , + ) nounwind alwaysinline { + %v = load * %0 + %v1 = select %2, %1, %v + store %v1, * %0 + ret void +} + +define void @__masked_store_blend_i32(* nocapture, , + ) nounwind alwaysinline { + %v = load * %0 + %v1 = select %2, %1, %v + store %v1, * %0 + ret void +} + +define void @__masked_store_blend_float(* nocapture, , + ) nounwind alwaysinline { + %v = load * %0 + %v1 = select %2, %1, %v + store %v1, * %0 + ret void +} + +define void @__masked_store_blend_i64(* nocapture, + , ) nounwind alwaysinline { + %v = load * %0 + %v1 = select %2, %1, %v + store %v1, * %0 + ret void +} + +define void @__masked_store_blend_double(* nocapture, + , ) nounwind alwaysinline { + %v = load * %0 + %v1 = select %2, %1, %v + store %v1, * %0 + ret void +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather/scatter + +; define these with the macros from stdlib.m4 + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefetch +define_prefetches() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define_avgs() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; vector ops + +define(`extract_insert',` +define $1 @__extract_$2(<1 x $1>, i32) nounwind readnone alwaysinline { + %val = extractelement <1 x $1> %0, i32 0 + %extract = tail call $1 @__shfl_$1_nvptx($1 %val, i32 %1) + ret $1 %extract +} + +define <1 x $1> @__insert_$2(<1 x $1>, i32, + $1) nounwind readnone alwaysinline { + %orig = extractelement <1 x $1> %0, i32 0 + %lane = call i32 @__program_index() + %c = icmp eq i32 %lane, %1 + %val = select i1 %c, $1 %2, $1 %orig + %insert = insertelement <1 x $1> %0, $1 %val, i32 0 + ret <1 x $1> %insert +} +') + +extract_insert(i8, int8) +extract_insert(i16, int16) +extract_insert(i32, int32) +extract_insert(i64, int64) +extract_insert(float, float) +extract_insert(double, double) + 
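The __extract_*/__insert_* pair defined just above leans on the gang being a single 32-lane warp: extract is a warp-wide broadcast from the requested lane (via __shfl_*_nvptx), and insert is a per-lane select keyed on __program_index(). For reference, a minimal CUDA C sketch of the same idiom, assuming one gang maps to one full warp; the helper names, the 0xffffffff sync mask, and the threadIdx.x & 31 lane computation are illustrative assumptions, not code from this patch:

// Illustrative sketch only (uses the newer __shfl_sync form of the warp shuffle).
__device__ __forceinline__ float warpExtract(float v, int srcLane) {
    // every lane reads lane srcLane's copy of v -- a warp-wide broadcast,
    // mirroring __extract_* above
    return __shfl_sync(0xffffffffu, v, srcLane);
}

__device__ __forceinline__ float warpInsert(float v, int dstLane, float newVal) {
    // only the destination lane takes the new value; all other lanes keep theirs,
    // mirroring __insert_* above
    int lane = threadIdx.x & 31;   // assumes a 1-D, warp-aligned thread block
    return (lane == dstLane) ? newVal : v;
}

Because each IR vector is 1-wide and the warp supplies the 32 program instances, programCount is pinned to 32 in builtins.cpp above, which is also why the shuffle distances 16, 8, 4, 2, 1 recur throughout the reductions and scans in this file.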
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; assert + +declare void @__assertfail(i64,i64,i32,i64,i64) noreturn; +declare i32 @vprintf(i64,i64) +define i32 @__puts_nvptx(i8*) alwaysinline +{ + %str = ptrtoint i8* %0 to i64 + %parm = or i64 0, 0 + %call = call i32 @vprintf(i64 %str, i64 %parm) +;; %cr = alloca <3 x i8> +;; store <3 x i8> , <3 x i8>* %cr +;; %cr1 = ptrtoint <3 x i8>* %cr to i64 +;; %call1 = call i32 @vprintf(i64 %cr1, i64 %parm) + ret i32 %call; +} +define internal void @__abort_nvptx(i8* %str) noreturn +{ + %tmp1 = alloca <3 x i8> + store <3 x i8> , <3 x i8>* %tmp1 + %tmp2 = alloca <2 x i8> + store <2 x i8> , <2 x i8>* %tmp2 + + %param1 = ptrtoint <2 x i8>* %tmp2 to i64 + %param3 = or i32 0, 0 + %string = ptrtoint i8* %str to i64 + %param4 = ptrtoint <3 x i8>* %tmp1 to i64 + %param5 = or i64 1, 1 + call void @__assertfail(i64 %param1, i64 %string, i32 %param3, i64 %param4, i64 %param5); + ret void +} + +define void @__do_assert_uniform(i8 *%str, i1 %test, %mask) { + br i1 %test, label %ok, label %fail + +fail: + %lane = call i32 @__program_index() + %cmp = icmp eq i32 %lane, 0 + br i1 %cmp, label %fail_print, label %fail_void; + + + +fail_print: + call void @__abort_nvptx(i8* %str) noreturn + unreachable + +fail_void: + unreachable + +ok: + ret void +} + + +define void @__do_assert_varying(i8 *%str, %test, + %mask) { + %nottest = xor %test, + < forloop(i, 1, eval(WIDTH-1), `MASK -1, ') MASK -1 > + %nottest_and_mask = and %nottest, %mask + %mm = call i64 @__movmsk( %nottest_and_mask) + %all_ok = icmp eq i64 %mm, 0 + br i1 %all_ok, label %ok, label %fail + +fail: + call void @__abort_nvptx(i8* %str) noreturn + unreachable + +ok: + ret void +} + +define i64 @__clock() nounwind alwaysinline { + %r = call i64 asm sideeffect "mov.b64 $0, %clock64;", "=l"(); + ret i64 %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; atomics and memory barriers + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; global_atomic_associative +;; More efficient implementation for atomics that are associative (e.g., +;; add, and, ...). If a basic implementation would do something like: +;; result0 = atomic_op(ptr, val0) +;; result1 = atomic_op(ptr, val1) +;; .. +;; Then instead we can do: +;; tmp = (val0 op val1 op ...) +;; result0 = atomic_op(ptr, tmp) +;; result1 = (result0 op val0) +;; .. +;; And more efficiently compute the same result +;; +;; Takes five parameters: +;; $1: vector width of the target +;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names) +;; (add, sub...) +;; $3: return type of the LLVM atomic (e.g. i32) +;; $4: return type of the LLVM atomic type, in ispc naming parlance (e.g. int32) +;; $5: identity value for the operator (e.g. 0 for add, -1 for AND, ...)
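A minimal CUDA sketch of the add case of the transformation described above, for illustration only (self-contained, assuming a full 32-lane warp and a blockDim.x that is a multiple of 32). Note that the <1 x i32> definitions that follow simply issue the atomic per active thread, since each program instance is a single NVPTX thread; the sketch shows the aggregation strategy the comment describes rather than the code the patch generates:

// One atomic per warp instead of one per active lane: a single lane adds the
// warp total, then every lane reconstructs base + (sum of earlier lanes' values),
// which is what a per-lane atomic sequence would have returned.
__device__ int warp_aggregated_atomic_add(int *ptr, int val, bool active) {
    const unsigned FULL = 0xffffffffu;
    int lane = threadIdx.x & 31;
    int v = active ? val : 0;                  // identity for masked-off lanes
    int incl = v;                              // inclusive scan via the shfl.up ladder
    for (int offset = 1; offset < 32; offset <<= 1) {
        int up = __shfl_up_sync(FULL, incl, offset);
        if (lane >= offset) incl += up;
    }
    int prefix = incl - v;                     // val0 op ... op val(i-1)
    int base = 0;
    if (lane == 31)                            // lane 31 holds the warp-wide total
        base = atomicAdd(ptr, incl);           // result0 = atomic_op(ptr, tmp)
    base = __shfl_sync(FULL, base, 31);        // broadcast the old value
    return base + prefix;                      // result_i = (result0 op earlier vals)
}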
+;; add +define <1 x i32> @__atomic_add_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i32> %valv to i32 + br i1 %mask, label %exec, label %pass +exec: + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.add.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + %oldv = bitcast i32 %old to <1 x i32> + ret <1 x i32> %oldv +pass: + ret <1 x i32> %valv +} +;; sub +define <1 x i32> @__atomic_sub_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %nvalv = sub <1 x i32> , %valv + %ret = call <1 x i32> @__atomic_add_int32_global(i32* %ptr, <1 x i32> %nvalv, <1 x i1> %maskv); + ret <1 x i32> %ret; +} +;; and +define <1 x i32> @__atomic_and_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i32> %valv to i32 + br i1 %mask, label %exec, label %pass +exec: + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.and.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + %oldv = bitcast i32 %old to <1 x i32> + ret <1 x i32> %oldv +pass: + ret <1 x i32> %valv +} +;; or +define <1 x i32> @__atomic_or_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i32> %valv to i32 + br i1 %mask, label %exec, label %pass +exec: + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.or.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + %oldv = bitcast i32 %old to <1 x i32> + ret <1 x i32> %oldv +pass: + ret <1 x i32> %valv +} +;; xor +define <1 x i32> @__atomic_xor_int32_global(i32* %ptr, <1 x i32> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i32> %valv to i32 + br i1 %mask, label %exec, label %pass +exec: + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.xor.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + %oldv = bitcast i32 %old to <1 x i32> + ret <1 x i32> %oldv +pass: + ret <1 x i32> %valv +} + +;;;;;;;;; int64 +define <1 x i64> @__atomic_add_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i64> %valv to i64 + br i1 %mask, label %exec, label %pass +exec: + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.add.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + %oldv = bitcast i64 %old to <1 x i64> + ret <1 x i64> %oldv +pass: + ret <1 x i64> %valv +} +define <1 x i64> @__atomic_sub_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %nvalv = sub <1 x i64> , %valv + %ret = call <1 x i64> @__atomic_add_int64_global(i64* %ptr, <1 x i64> %nvalv, <1 x i1> %maskv); + ret <1 x i64> %ret; +} + +;; and +define <1 x i64> @__atomic_and_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i64> %valv to i64 + br i1 %mask, label %exec, label %pass +exec: + %andr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.and.b64 $0, [$1], $2;", "=l,l,l"(i64 %andr, i64 %val); + %oldv = bitcast i64 %old to <1 x i64> + ret <1 x i64> %oldv +pass: + ret <1 x i64> %valv +} + +;; or +define <1 x i64> @__atomic_or_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline 
+{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i64> %valv to i64 + br i1 %mask, label %exec, label %pass +exec: + %orr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.or.b64 $0, [$1], $2;", "=l,l,l"(i64 %orr, i64 %val); + %oldv = bitcast i64 %old to <1 x i64> + ret <1 x i64> %oldv +pass: + ret <1 x i64> %valv +} + +;; xor +define <1 x i64> @__atomic_xor_int64_global(i64* %ptr, <1 x i64> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x i64> %valv to i64 + br i1 %mask, label %exec, label %pass +exec: + %xorr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.xor.b64 $0, [$1], $2;", "=l,l,l"(i64 %xorr, i64 %val); + %oldv = bitcast i64 %old to <1 x i64> + ret <1 x i64> %oldv +pass: + ret <1 x i64> %valv +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; global_atomic_uniform +;; Defines the implementation of a function that handles the mapping from +;; an ispc atomic function to the underlying LLVM intrinsics. This variant +;; just calls the atomic once, for the given uniform value +;; +;; Takes four parameters: +;; $1: vector width of the target +;; $2: operation being performed (w.r.t. LLVM atomic intrinsic names) +;; (add, sub...) +;; $3: return type of the LLVM atomic (e.g. i32) +;; $4: return type of the LLVM atomic type, in ispc naming paralance (e.g. int32) + +define internal i32 @__get_first_active_lane() +{ + %nact = call i32 @__ballot_nvptx(i1 true); + %lane1 = call i32 @__count_leading_zeros_i32(i32 %nact) + %lane = sub i32 31, %lane1 + ret i32 %lane +} + +define internal i32 @__atomic_add_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.add.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i32 @__atomic_sub_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %nval = sub i32 0, %val; + %old = tail call i32 @__atomic_add_uniform_int32_global_nvptx(i32* %ptr, i32 %nval); + ret i32 %old; +} +define internal i32 @__atomic_and_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.and.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i32 @__atomic_or_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.or.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i32 @__atomic_xor_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.xor.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i32 @__atomic_min_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.min.s32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i32 @__atomic_max_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.max.s32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i32 @__atomic_umin_uniform_uint32_global_nvptx(i32* %ptr, i32 %val) nounwind 
alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.min.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i32 @__atomic_umax_uniform_uint32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.max.u32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} + + +define internal i64 @__atomic_add_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.add.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal i64 @__atomic_sub_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %nval = sub i64 0, %val; + %old = tail call i64 @__atomic_add_uniform_int64_global_nvptx(i64* %ptr, i64 %nval); + ret i64 %old; +} +define internal i64 @__atomic_and_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.and.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal i64 @__atomic_or_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.or.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal i64 @__atomic_xor_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.xor.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal i64 @__atomic_min_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.min.s64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal i64 @__atomic_max_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.max.s64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal i64 @__atomic_umin_uniform_uint64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.min.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal i64 @__atomic_umax_uniform_uint64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.max.u64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} + +define(`global_atomic',` +define <1 x $3> @__atomic_$2_$4_global($3* %ptr, <1 x $3> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %val = bitcast <1 x $3> %valv to $3 + br i1 %mask, label %exec, label %pass +exec: + %old = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %ptr, $3 %val); + %oldv = bitcast $3 %old to <1 x $3> + ret <1 x $3> %oldv +pass: + ret <1 x $3> %valv +} +') +define(`global_atomic_uniform',` +define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %val) nounwind alwaysinline +{ +entry: + %addr = ptrtoint $3 * %ptr to i64 + %active = call i32 @__get_first_active_lane(); + %lane = call i32 @__program_index(); + %c = icmp eq i32 %lane, %active + br i1 %c, label %p1, label %p2 + +p1: + %t0 = call $3 
@__atomic_$2_uniform_$4_global_nvptx($3 * %ptr, $3 %val); + br label %p2; + +p2: + %t1 = phi $3 [%t0, %p1], [zeroinitializer, %entry] + %old = call $3 @__shfl_$3_nvptx($3 %t1, i32 %active) + ret $3 %old; +} +') +define(`global_atomic_varying',` +define <1 x $3> @__atomic_$2_varying_$4_global(<1 x i64> %ptr, <1 x $3> %val, <1 x i1> %maskv) nounwind alwaysinline +{ +entry: + %addr = bitcast <1 x i64> %ptr to i64 + %c = bitcast <1 x i1> %maskv to i1 + br i1 %c, label %p1, label %p2 + +p1: + %sv = bitcast <1 x $3> %val to $3 + %sptr = inttoptr i64 %addr to $3* + %t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %sptr, $3 %sv); + %t0v = bitcast $3 %t0 to <1 x $3> + ret < 1x $3> %t0v + +p2: + ret <1 x $3> %val +} +') + + +global_atomic_uniform(1, add, i32, int32) +global_atomic_uniform(1, sub, i32, int32) +global_atomic_uniform(1, and, i32, int32) +global_atomic_uniform(1, or, i32, int32) +global_atomic_uniform(1, xor, i32, int32) +global_atomic_uniform(1, min, i32, int32) +global_atomic_uniform(1, max, i32, int32) +global_atomic_uniform(1, umin, i32, uint32) +global_atomic_uniform(1, umax, i32, uint32) + +global_atomic_uniform(1, add, i64, int64) +global_atomic_uniform(1, sub, i64, int64) +global_atomic_uniform(1, and, i64, int64) +global_atomic_uniform(1, or, i64, int64) +global_atomic_uniform(1, xor, i64, int64) +global_atomic_uniform(1, min, i64, int64) +global_atomic_uniform(1, max, i64, int64) +global_atomic_uniform(1, umin, i64, uint64) +global_atomic_uniform(1, umax, i64, uint64) + +global_atomic_varying(1, add, i32, int32) +global_atomic_varying(1, sub, i32, int32) +global_atomic_varying(1, and, i32, int32) +global_atomic_varying(1, or, i32, int32) +global_atomic_varying(1, xor, i32, int32) +global_atomic_varying(1, min, i32, int32) +global_atomic_varying(1, max, i32, int32) +global_atomic_varying(1, umin, i32, uint32) +global_atomic_varying(1, umax, i32, uint32) + +global_atomic_varying(1, add, i64, int64) +global_atomic_varying(1, sub, i64, int64) +global_atomic_varying(1, and, i64, int64) +global_atomic_varying(1, or, i64, int64) +global_atomic_varying(1, xor, i64, int64) +global_atomic_varying(1, min, i64, int64) +global_atomic_varying(1, max, i64, int64) +global_atomic_varying(1, umin, i64, uint64) +global_atomic_varying(1, umax, i64, uint64) + +;; Macro to declare the function that implements the swap atomic. +;; Takes three parameters: +;; $1: vector width of the target +;; $2: llvm type of the vector elements (e.g. i32) +;; $3: ispc type of the elements (e.g. 
int32) + +define internal i32 @__atomic_swap_uniform_int32_global_nvptx(i32* %ptr, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.exch.b32 $0, [$1], $2;", "=r,l,r"(i64 %addr, i32 %val); + ret i32 %old; +} +define internal i64 @__atomic_swap_uniform_int64_global_nvptx(i64* %ptr, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.exch.b64 $0, [$1], $2;", "=l,l,l"(i64 %addr, i64 %val); + ret i64 %old; +} +define internal float @__atomic_swap_uniform_float_global_nvptx(float* %ptr, float %val) nounwind alwaysinline +{ + %ptrI = bitcast float* %ptr to i32* + %valI = bitcast float %val to i32 + %retI = call i32 @__atomic_swap_uniform_int32_global_nvptx(i32* %ptrI, i32 %valI) + %ret = bitcast i32 %retI to float + ret float %ret +} +define internal double @__atomic_swap_uniform_double_global_nvptx(double* %ptr, double %val) nounwind alwaysinline +{ + %ptrI = bitcast double* %ptr to i64* + %valI = bitcast double %val to i64 + %retI = call i64 @__atomic_swap_uniform_int64_global_nvptx(i64* %ptrI, i64 %valI) + %ret = bitcast i64 %retI to double + ret double %ret +} +global_atomic_uniform(1, swap, i32, int32) +global_atomic_uniform(1, swap, i64, int64) +global_atomic_uniform(1, swap, float, float) +global_atomic_uniform(1, swap, double, double) +global_atomic_varying(1, swap, i32, int32) +global_atomic_varying(1, swap, i64, int64) +global_atomic_varying(1, swap, float, float) +global_atomic_varying(1, swap, double, double) + + +;; Similarly, macro to declare the function that implements the compare/exchange +;; atomic. Takes three parameters: +;; $1: vector width of the target +;; $2: llvm type of the vector elements (e.g. i32) +;; $3: ispc type of the elements (e.g. 
int32) + +define internal i32 @__atomic_compare_exchange_uniform_int32_global_nvptx(i32* %ptr, i32 %cmp, i32 %val) nounwind alwaysinline +{ + %addr = ptrtoint i32* %ptr to i64 + %old = tail call i32 asm sideeffect "atom.cas.b32 $0, [$1], $2, $3;", "=r,l,r,r"(i64 %addr, i32 %cmp, i32 %val); + ret i32 %old; +} +define internal i64 @__atomic_compare_exchange_uniform_int64_global_nvptx(i64* %ptr, i64 %cmp, i64 %val) nounwind alwaysinline +{ + %addr = ptrtoint i64* %ptr to i64 + %old = tail call i64 asm sideeffect "atom.cas.b64 $0, [$1], $2, $3;", "=l,l,l,l"(i64 %addr, i64 %cmp, i64 %val); + ret i64 %old; +} +define internal float @__atomic_compare_exchange_uniform_float_global_nvptx(float* %ptr, float %cmp, float %val) nounwind alwaysinline +{ + %ptrI = bitcast float* %ptr to i32* + %cmpI = bitcast float %cmp to i32 + %valI = bitcast float %val to i32 + %retI = call i32 @__atomic_compare_exchange_uniform_int32_global_nvptx(i32* %ptrI, i32 %cmpI, i32 %valI) + %ret = bitcast i32 %retI to float + ret float %ret +} +define internal double @__atomic_compare_exchange_uniform_double_global_nvptx(double* %ptr, double %cmp, double %val) nounwind alwaysinline +{ + %ptrI = bitcast double* %ptr to i64* + %cmpI = bitcast double %cmp to i64 + %valI = bitcast double %val to i64 + %retI = call i64 @__atomic_compare_exchange_uniform_int64_global_nvptx(i64* %ptrI, i64 %cmpI, i64 %valI) + %ret = bitcast i64 %retI to double + ret double %ret +} + +;;;;;;;;;;;; +define(`global_atomic_cas',` +define <1 x $3> @__atomic_$2_$4_global($3* %ptr, <1 x $3> %cmpv, <1 x $3> %valv, <1 x i1> %maskv) nounwind alwaysinline +{ + %mask = bitcast <1 x i1> %maskv to i1 + %cmp = bitcast <1 x $3> %cmpv to $3 + %val = bitcast <1 x $3> %valv to $3 + br i1 %mask, label %exec, label %pass +exec: + %old = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %ptr, $3 %cmp, $3 %val); + %oldv = bitcast $3 %old to <1 x $3> + ret <1 x $3> %oldv +pass: + ret <1 x $3> %valv +} +') +define(`global_atomic_cas_uniform',` +define $3 @__atomic_$2_uniform_$4_global($3 * %ptr, $3 %cmp, $3 %val) nounwind alwaysinline +{ +entry: + %addr = ptrtoint $3 * %ptr to i64 + %active = call i32 @__get_first_active_lane(); + %lane = call i32 @__program_index(); + %c = icmp eq i32 %lane, %active + br i1 %c, label %p1, label %p2 + +p1: + %t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %ptr, $3 %cmp, $3 %val); + br label %p2; + +p2: + %t1 = phi $3 [%t0, %p1], [zeroinitializer, %entry] + %old = call $3 @__shfl_$3_nvptx($3 %t1, i32 %active) + ret $3 %old; +} +') +define(`global_atomic_cas_varying',` +define <1 x $3> @__atomic_$2_varying_$4_global(<1 x i64> %ptr, <1 x $3> %cmp, <1 x $3> %val, <1 x i1> %maskv) nounwind alwaysinline +{ +entry: + %addr = bitcast <1 x i64> %ptr to i64 + %c = bitcast <1 x i1> %maskv to i1 + br i1 %c, label %p1, label %p2 + +p1: + %sv = bitcast <1 x $3> %val to $3 + %sc = bitcast <1 x $3> %cmp to $3 + %sptr = inttoptr i64 %addr to $3* + %t0 = call $3 @__atomic_$2_uniform_$4_global_nvptx($3 * %sptr, $3 %sc, $3 %sv); + %t0v = bitcast $3 %t0 to <1 x $3> + ret < 1x $3> %t0v + +p2: + ret <1 x $3> %val +} +') + +global_atomic_cas_uniform(1, compare_exchange, i32, int32) +global_atomic_cas_uniform(1, compare_exchange, i64, int64) +global_atomic_cas_uniform(1, compare_exchange, float, float) +global_atomic_cas_uniform(1, compare_exchange, double, double) +global_atomic_cas_varying(1, compare_exchange, i32, int32) +global_atomic_cas_varying(1, compare_exchange, i64, int64) +global_atomic_cas_varying(1, compare_exchange, float, float) 
+global_atomic_cas_varying(1, compare_exchange, double, double) +global_atomic_cas(1, compare_exchange, i32, int32) +global_atomic_cas(1, compare_exchange, i64, int64) +global_atomic_cas(1, compare_exchange, float, float) +global_atomic_cas(1, compare_exchange, double, double) + + + + +declare void @llvm.nvvm.membar.gl() +declare void @llvm.nvvm.membar.sys() +declare void @llvm.nvvm.membar.cta() + +define void @__memory_barrier() nounwind readnone alwaysinline { + ;; see http://llvm.org/bugs/show_bug.cgi?id=2829. It seems like we + ;; only get an MFENCE on x86 if "device" is true, but IMHO we should + ;; in the case where the first 4 args are true but it is false. + ;; So we just always set that to true... + call void @llvm.nvvm.membar.gl() + ret void +} diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index ad1d88bc..b20fdfb4 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -274,3 +274,4 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline { define_avgs() +declare_nvptx() diff --git a/builtins/target-sse4-common.ll b/builtins/target-sse4-common.ll index 50dd0582..e1f9b2c8 100644 --- a/builtins/target-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -278,3 +278,5 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline { %call = call i64 @llvm.ctpop.i64(i64 %0) ret i64 %call } + +declare_nvptx() diff --git a/builtins/util-nvptx.m4 b/builtins/util-nvptx.m4 new file mode 100644 index 00000000..19fcf68c --- /dev/null +++ b/builtins/util-nvptx.m4 @@ -0,0 +1,3417 @@ +;; Copyright (c) 2010-2013, Intel Corporation +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Intel Corporation nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +;; This file provides a variety of macros used to generate LLVM bitcode +;; parametrized in various ways. Implementations of the standard library +;; builtins for various targets can use macros from this file to simplify +;; generating code for their implementations of those builtins. 
+ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; It is a bit of a pain to compute this in m4 for 32 and 64-wide targets... +define(`ALL_ON_MASK', +`ifelse(WIDTH, `64', `-1', + WIDTH, `32', `4294967295', + `eval((1< $2, <8 x $1> undef, + <4 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> +') + +define(`v16tov8', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> +') + +define(`v4tov2', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> + $4 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> +') + +define(`v8tov2', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $5 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $6 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> +') + +define(`v16tov4', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $5 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $6 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; vector assembly: wider vector from two narrower vectors +;; +;; $1: vector element type +;; $2: first n-wide vector +;; $3: second n-wide vector +;; $4: result 2*n-wide vector +define(`v8tov16', ` + $4 = shufflevector <8 x $1> $2, <8 x $1> $3, + <16 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +;; Helper macro for calling various SSE instructions for scalar values +;; but where the instruction takes a vector parameter. +;; $1 : name of variable to put the final value in +;; $2 : vector width of the target +;; $3 : scalar type of the operand +;; $4 : SSE intrinsic name +;; $5 : variable name that has the scalar value +;; For example, the following call causes the variable %ret to have +;; the result of a call to sqrtss with the scalar value in %0 +;; sse_unary_scalar(ret, 4, float, @llvm.x86.sse.sqrt.ss, %0) + +define(`sse_unary_scalar', ` + %$1_vec = insertelement <$2 x $3> undef, $3 $5, i32 0 + %$1_val = call <$2 x $3> $4(<$2 x $3> %$1_vec) + %$1 = extractelement <$2 x $3> %$1_val, i32 0 +') + +;; Similar to `sse_unary_scalar', this helper macro is for calling binary +;; SSE instructions with scalar values, +;; $1: name of variable to put the result in +;; $2: vector width of the target +;; $3: scalar type of the operand +;; $4 : SSE intrinsic name +;; $5 : variable name that has the first scalar operand +;; $6 : variable name that has the second scalar operand + +define(`sse_binary_scalar', ` + %$1_veca = insertelement <$2 x $3> undef, $3 $5, i32 0 + %$1_vecb = insertelement <$2 x $3> undef, $3 $6, i32 0 + %$1_val = call <$2 x $3> $4(<$2 x $3> %$1_veca, <$2 x $3> %$1_vecb) + %$1 = extractelement <$2 x $3> %$1_val, i32 0 +') + +;; Do a reduction over a 4-wide vector +;; $1: type of final scalar result +;; $2: 4-wide function that takes 2 4-wide operands and returns the +;; element-wise reduction +;; $3: scalar function that takes two scalar operands and returns +;; the final reduction + +define(`reduce4', ` + %v1 = shufflevector <4 x $1> %0, <4 x $1> undef, + <4 x i32> + %m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %0) + %m1a = extractelement <4 x $1> %m1, i32 0 + %m1b = extractelement <4 x $1> %m1, i32 1 + %m = call $1 $3($1 %m1a, $1 %m1b) + ret $1 %m +' +) + +;; Similar to `reduce4', do a 
reduction over an 8-wide vector +;; $1: type of final scalar result +;; $2: 8-wide function that takes 2 8-wide operands and returns the +;; element-wise reduction +;; $3: scalar function that takes two scalar operands and returns +;; the final reduction + +define(`reduce8', ` + %v1 = shufflevector <8 x $1> %0, <8 x $1> undef, + <8 x i32> + %m1 = call <8 x $1> $2(<8 x $1> %v1, <8 x $1> %0) + %v2 = shufflevector <8 x $1> %m1, <8 x $1> undef, + <8 x i32> + %m2 = call <8 x $1> $2(<8 x $1> %v2, <8 x $1> %m1) + %m2a = extractelement <8 x $1> %m2, i32 0 + %m2b = extractelement <8 x $1> %m2, i32 1 + %m = call $1 $3($1 %m2a, $1 %m2b) + ret $1 %m +' +) + +define(`reduce16', ` + %v1 = shufflevector <16 x $1> %0, <16 x $1> undef, + <16 x i32> + %m1 = call <16 x $1> $2(<16 x $1> %v1, <16 x $1> %0) + %v2 = shufflevector <16 x $1> %m1, <16 x $1> undef, + <16 x i32> + %m2 = call <16 x $1> $2(<16 x $1> %v2, <16 x $1> %m1) + %v3 = shufflevector <16 x $1> %m2, <16 x $1> undef, + <16 x i32> + %m3 = call <16 x $1> $2(<16 x $1> %v3, <16 x $1> %m2) + + %m3a = extractelement <16 x $1> %m3, i32 0 + %m3b = extractelement <16 x $1> %m3, i32 1 + %m = call $1 $3($1 %m3a, $1 %m3b) + ret $1 %m +' +) + +;; Do an reduction over an 8-wide vector, using a vector reduction function +;; that only takes 4-wide vectors +;; $1: type of final scalar result +;; $2: 4-wide function that takes 2 4-wide operands and returns the +;; element-wise reduction +;; $3: scalar function that takes two scalar operands and returns +;; the final reduction + +define(`reduce8by4', ` + v8tov4($1, %0, %v1, %v2) + %m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %v2) + %v3 = shufflevector <4 x $1> %m1, <4 x $1> undef, + <4 x i32> + %m2 = call <4 x $1> $2(<4 x $1> %v3, <4 x $1> %m1) + %m2a = extractelement <4 x $1> %m2, i32 0 + %m2b = extractelement <4 x $1> %m2, i32 1 + %m = call $1 $3($1 %m2a, $1 %m2b) + ret $1 %m +' +) + + +;; Apply a unary function to the 4-vector in %0, return the vector result. 
+;; $1: scalar type of result +;; $2: name of scalar function to call + +define(`unary1to4', ` + %v_0 = extractelement <4 x $1> %0, i32 0 + %r_0 = call $1 $2($1 %v_0) + %ret_0 = insertelement <4 x $1> undef, $1 %r_0, i32 0 + %v_1 = extractelement <4 x $1> %0, i32 1 + %r_1 = call $1 $2($1 %v_1) + %ret_1 = insertelement <4 x $1> %ret_0, $1 %r_1, i32 1 + %v_2 = extractelement <4 x $1> %0, i32 2 + %r_2 = call $1 $2($1 %v_2) + %ret_2 = insertelement <4 x $1> %ret_1, $1 %r_2, i32 2 + %v_3 = extractelement <4 x $1> %0, i32 3 + %r_3 = call $1 $2($1 %v_3) + %ret_3 = insertelement <4 x $1> %ret_2, $1 %r_3, i32 3 + ret <4 x $1> %ret_3 +') + +define(`unary1to8', ` + %v_0 = extractelement <8 x $1> %0, i32 0 + %r_0 = call $1 $2($1 %v_0) + %ret_0 = insertelement <8 x $1> undef, $1 %r_0, i32 0 + %v_1 = extractelement <8 x $1> %0, i32 1 + %r_1 = call $1 $2($1 %v_1) + %ret_1 = insertelement <8 x $1> %ret_0, $1 %r_1, i32 1 + %v_2 = extractelement <8 x $1> %0, i32 2 + %r_2 = call $1 $2($1 %v_2) + %ret_2 = insertelement <8 x $1> %ret_1, $1 %r_2, i32 2 + %v_3 = extractelement <8 x $1> %0, i32 3 + %r_3 = call $1 $2($1 %v_3) + %ret_3 = insertelement <8 x $1> %ret_2, $1 %r_3, i32 3 + %v_4 = extractelement <8 x $1> %0, i32 4 + %r_4 = call $1 $2($1 %v_4) + %ret_4 = insertelement <8 x $1> %ret_3, $1 %r_4, i32 4 + %v_5 = extractelement <8 x $1> %0, i32 5 + %r_5 = call $1 $2($1 %v_5) + %ret_5 = insertelement <8 x $1> %ret_4, $1 %r_5, i32 5 + %v_6 = extractelement <8 x $1> %0, i32 6 + %r_6 = call $1 $2($1 %v_6) + %ret_6 = insertelement <8 x $1> %ret_5, $1 %r_6, i32 6 + %v_7 = extractelement <8 x $1> %0, i32 7 + %r_7 = call $1 $2($1 %v_7) + %ret_7 = insertelement <8 x $1> %ret_6, $1 %r_7, i32 7 + ret <8 x $1> %ret_7 +') + +;; Given a unary function that takes a 2-wide vector and a 4-wide vector +;; that we'd like to apply it to, extract 2 2-wide vectors from the 4-wide +;; vector, apply it, and return the corresponding 4-wide vector result +;; $1: name of variable into which the final result should go +;; $2: scalar type of the vector elements +;; $3: 2-wide unary vector function to apply +;; $4: 4-wide operand value + +define(`unary2to4', ` + %$1_0 = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0) + %$1_1 = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1) + %$1 = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> +' +) + +;; Similar to `unary2to4', this applies a 2-wide binary function to two 4-wide +;; vector operands +;; $1: name of variable into which the final result should go +;; $2: scalar type of the vector elements +;; $3: 2-wide binary vector function to apply +;; $4: First 4-wide operand value +;; $5: Second 4-wide operand value + +define(`binary2to4', ` +%$1_0a = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> +%$1_0b = shufflevector <4 x $2> $5, <4 x $2> undef, <2 x i32> +%v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b) +%$1_1a = shufflevector <4 x $2> $4, <4 x $2> undef, <2 x i32> +%$1_1b = shufflevector <4 x $2> $5, <4 x $2> undef, <2 x i32> +%v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b) +%$1 = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> +' +) + +;; Similar to `unary2to4', this maps a 4-wide unary function to an 8-wide +;; vector operand +;; $1: name of variable into which the final result should go +;; $2: scalar type of the vector elements +;; $3: 4-wide unary vector function to apply +;; $4: 8-wide operand value + +define(`unary4to8', ` + %__$1_0 = 
shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %$1 = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, + <8 x i32> +' +) + +;; $1: name of variable into which the final result should go +;; $2: scalar type of the input vector elements +;; $3: scalar type of the result vector elements +;; $4: 4-wide unary vector function to apply +;; $5: 8-wide operand value + +define(`unary4to8conv', ` + %$1_0 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1 = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, + <8 x i32> +' +) + +define(`unary4to16', ` + %__$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0) + %__$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1) + %__$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_2 = call <4 x $2> $3(<4 x $2> %__$1_2) + %__$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> + %__v$1_3 = call <4 x $2> $3(<4 x $2> %__$1_3) + + %__$1a = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1, + <8 x i32> + %__$1b = shufflevector <4 x $2> %__v$1_2, <4 x $2> %__v$1_3, + <8 x i32> + %$1 = shufflevector <8 x $2> %__$1a, <8 x $2> %__$1b, + <16 x i32> +' +) + +define(`unary4to16conv', ` + %$1_0 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_2 = call <4 x $3> $4(<4 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> + %v$1_3 = call <4 x $3> $4(<4 x $2> %$1_3) + + %$1a = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1, + <8 x i32> + %$1b = shufflevector <4 x $3> %v$1_2, <4 x $3> %v$1_3, + <8 x i32> + %$1 = shufflevector <8 x $3> %$1a, <8 x $3> %$1b, + <16 x i32> +' +) + +;; And so forth... 
+;; $1: name of variable into which the final result should go +;; $2: scalar type of the vector elements +;; $3: 8-wide unary vector function to apply +;; $4: 16-wide operand value + +define(`unary8to16', ` + %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, + <8 x i32> + %v$1_0 = call <8 x $2> $3(<8 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, + <8 x i32> + %v$1_1 = call <8 x $2> $3(<8 x $2> %$1_1) + %$1 = shufflevector <8 x $2> %v$1_0, <8 x $2> %v$1_1, + <16 x i32> +' +) + +;; And along the lines of `binary2to4', this maps a 4-wide binary function to +;; two 8-wide vector operands +;; $1: name of variable into which the final result should go +;; $2: scalar type of the vector elements +;; $3: 4-wide unary vector function to apply +;; $4: First 8-wide operand value +;; $5: Second 8-wide operand value + +define(`binary4to8', ` +%$1_0a = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> +%$1_0b = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> +%v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0a, <4 x $2> %$1_0b) +%$1_1a = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> +%$1_1b = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> +%v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1a, <4 x $2> %$1_1b) +%$1 = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1, + <8 x i32> +' +) + +define(`binary8to16', ` +%$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, + <8 x i32> +%$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, + <8 x i32> +%v$1_0 = call <8 x $2> $3(<8 x $2> %$1_0a, <8 x $2> %$1_0b) +%$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, + <8 x i32> +%$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, + <8 x i32> +%v$1_1 = call <8 x $2> $3(<8 x $2> %$1_1a, <8 x $2> %$1_1b) +%$1 = shufflevector <8 x $2> %v$1_0, <8 x $2> %v$1_1, + <16 x i32> +' +) + +define(`binary4to16', ` +%$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, + <4 x i32> +%$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, + <4 x i32> +%r$1_0 = call <4 x $2> $3(<4 x $2> %$1_0a, <4 x $2> %$1_0b) + +%$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, + <4 x i32> +%$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, + <4 x i32> +%r$1_1 = call <4 x $2> $3(<4 x $2> %$1_1a, <4 x $2> %$1_1b) + +%$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef, + <4 x i32> +%$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef, + <4 x i32> +%r$1_2 = call <4 x $2> $3(<4 x $2> %$1_2a, <4 x $2> %$1_2b) + +%$1_3a = shufflevector <16 x $2> $4, <16 x $2> undef, + <4 x i32> +%$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef, + <4 x i32> +%r$1_3 = call <4 x $2> $3(<4 x $2> %$1_3a, <4 x $2> %$1_3b) + +%r$1_01 = shufflevector <4 x $2> %r$1_0, <4 x $2> %r$1_1, + <8 x i32> +%r$1_23 = shufflevector <4 x $2> %r$1_2, <4 x $2> %r$1_3, + <8 x i32> + +%$1 = shufflevector <8 x $2> %r$1_01, <8 x $2> %r$1_23, + <16 x i32> +') + +;; Maps a 2-wide unary function to an 8-wide vector operand, returning an +;; 8-wide vector result +;; $1: name of variable into which the final result should go +;; $2: scalar type of the vector elements +;; $3: 2-wide unary vector function to apply +;; $4: 8-wide operand value + +define(`unary2to8', ` + %$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0) + %$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1) + %$1_2 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2) + %$1_3 = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> 
$3(<2 x $2> %$1_3) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> +' +) + +define(`unary2to16', ` + %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0) + %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1) + %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2) + %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3) + %$1_4 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4) + %$1_5 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5) + %$1_6 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6) + %$1_7 = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7) + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> +' +) + +;; Maps an 2-wide binary function to two 8-wide vector operands +;; $1: name of variable into which the final result should go +;; $2: scalar type of the vector elements +;; $3: 2-wide unary vector function to apply +;; $4: First 8-wide operand value +;; $5: Second 8-wide operand value + +define(`binary2to8', ` + %$1_0a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %$1_0b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b) + %$1_1a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %$1_1b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b) + %$1_2a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %$1_2b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b) + %$1_3a = shufflevector <8 x $2> $4, <8 x $2> undef, <2 x i32> + %$1_3b = shufflevector <8 x $2> $5, <8 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1 = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> +' +) + +define(`binary2to16', ` + %$1_0a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_0b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_0 = call <2 x $2> $3(<2 x $2> %$1_0a, <2 x $2> %$1_0b) + %$1_1a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_1b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_1 = call <2 x $2> $3(<2 x $2> %$1_1a, <2 x $2> %$1_1b) + %$1_2a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_2b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_2 = call <2 x $2> $3(<2 x $2> %$1_2a, <2 x $2> %$1_2b) + %$1_3a = shufflevector <16 x $2> $4, <16 x $2> 
undef, <2 x i32> + %$1_3b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_3 = call <2 x $2> $3(<2 x $2> %$1_3a, <2 x $2> %$1_3b) + %$1_4a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_4b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_4 = call <2 x $2> $3(<2 x $2> %$1_4a, <2 x $2> %$1_4b) + %$1_5a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_5b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_5 = call <2 x $2> $3(<2 x $2> %$1_5a, <2 x $2> %$1_5b) + %$1_6a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_6b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_6 = call <2 x $2> $3(<2 x $2> %$1_6a, <2 x $2> %$1_6b) + %$1_7a = shufflevector <16 x $2> $4, <16 x $2> undef, <2 x i32> + %$1_7b = shufflevector <16 x $2> $5, <16 x $2> undef, <2 x i32> + %v$1_7 = call <2 x $2> $3(<2 x $2> %$1_7a, <2 x $2> %$1_7b) + + %$1a = shufflevector <2 x $2> %v$1_0, <2 x $2> %v$1_1, + <4 x i32> + %$1b = shufflevector <2 x $2> %v$1_2, <2 x $2> %v$1_3, + <4 x i32> + %$1ab = shufflevector <4 x $2> %$1a, <4 x $2> %$1b, + <8 x i32> + + %$1c = shufflevector <2 x $2> %v$1_4, <2 x $2> %v$1_5, + <4 x i32> + %$1d = shufflevector <2 x $2> %v$1_6, <2 x $2> %v$1_7, + <4 x i32> + %$1cd = shufflevector <4 x $2> %$1c, <4 x $2> %$1d, + <8 x i32> + + %$1 = shufflevector <8 x $2> %$1ab, <8 x $2> %$1cd, + <16 x i32> +' +) + +;; The unary SSE round intrinsic takes a second argument that encodes the +;; rounding mode. This macro makes it easier to apply the 4-wide roundps +;; to 8-wide vector operands +;; $1: value to be rounded +;; $2: integer encoding of rounding mode +;; FIXME: this just has a ret statement at the end to return the result, +;; which is inconsistent with the macros above + +define(`round4to8', ` +%v0 = shufflevector <8 x float> $1, <8 x float> undef, <4 x i32> +%v1 = shufflevector <8 x float> $1, <8 x float> undef, <4 x i32> +%r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2) +%r1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2) +%ret = shufflevector <4 x float> %r0, <4 x float> %r1, + <8 x i32> +ret <8 x float> %ret +' +) + +define(`round4to16', ` +%v0 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v1 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v2 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%v3 = shufflevector <16 x float> $1, <16 x float> undef, <4 x i32> +%r0 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v0, i32 $2) +%r1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v1, i32 $2) +%r2 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v2, i32 $2) +%r3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %v3, i32 $2) +%ret01 = shufflevector <4 x float> %r0, <4 x float> %r1, + <8 x i32> +%ret23 = shufflevector <4 x float> %r2, <4 x float> %r3, + <8 x i32> +%ret = shufflevector <8 x float> %ret01, <8 x float> %ret23, + <16 x i32> +ret <16 x float> %ret +' +) + +define(`round8to16', ` +%v0 = shufflevector <16 x float> $1, <16 x float> undef, + <8 x i32> +%v1 = shufflevector <16 x float> $1, <16 x float> undef, + <8 x i32> +%r0 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %v0, i32 $2) +%r1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %v1, i32 $2) +%ret = shufflevector <8 x float> %r0, <8 x float> %r1, + <16 x i32> +ret <16 x float> %ret +' +) + +define(`round4to8double', ` +%v0 = shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> +%v1 = 
shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> +%r0 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v0, i32 $2) +%r1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v1, i32 $2) +%ret = shufflevector <4 x double> %r0, <4 x double> %r1, + <8 x i32> +ret <8 x double> %ret +' +) + +; and similarly for doubles... + +define(`round2to4double', ` +%v0 = shufflevector <4 x double> $1, <4 x double> undef, <2 x i32> +%v1 = shufflevector <4 x double> $1, <4 x double> undef, <2 x i32> +%r0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v0, i32 $2) +%r1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v1, i32 $2) +%ret = shufflevector <2 x double> %r0, <2 x double> %r1, + <4 x i32> +ret <4 x double> %ret +' +) + +define(`round2to8double', ` +%v0 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> +%v1 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> +%v2 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> +%v3 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> +%r0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v0, i32 $2) +%r1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v1, i32 $2) +%r2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v2, i32 $2) +%r3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v3, i32 $2) +%ret0 = shufflevector <2 x double> %r0, <2 x double> %r1, + <4 x i32> +%ret1 = shufflevector <2 x double> %r2, <2 x double> %r3, + <4 x i32> +%ret = shufflevector <4 x double> %ret0, <4 x double> %ret1, + <8 x i32> +ret <8 x double> %ret +' +) + +define(`round4to16double', ` +%v0 = shufflevector <16 x double> $1, <16 x double> undef, + <4 x i32> +%v1 = shufflevector <16 x double> $1, <16 x double> undef, + <4 x i32> +%v2 = shufflevector <16 x double> $1, <16 x double> undef, + <4 x i32> +%v3 = shufflevector <16 x double> $1, <16 x double> undef, + <4 x i32> +%r0 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v0, i32 $2) +%r1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v1, i32 $2) +%r2 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v2, i32 $2) +%r3 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v3, i32 $2) +%ret0 = shufflevector <4 x double> %r0, <4 x double> %r1, + <8 x i32> +%ret1 = shufflevector <4 x double> %r2, <4 x double> %r3, + <8 x i32> +%ret = shufflevector <8 x double> %ret0, <8 x double> %ret1, + <16 x i32> +ret <16 x double> %ret +' +) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; forloop macro + +divert(`-1') +# forloop(var, from, to, stmt) - improved version: +# works even if VAR is not a strict macro name +# performs sanity check that FROM is larger than TO +# allows complex numerical expressions in TO and FROM +define(`forloop', `ifelse(eval(`($3) >= ($2)'), `1', + `pushdef(`$1', eval(`$2'))_$0(`$1', + eval(`$3'), `$4')popdef(`$1')')') +define(`_forloop', + `$3`'ifelse(indir(`$1'), `$2', `', + `define(`$1', incr(indir(`$1')))$0($@)')') +divert`'dnl + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; stdlib_core +;; +;; This macro defines a bunch of helper routines that depend on the +;; target's vector width +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define(`shuffles', ` +') + +define(`define_shuffles',` +shuffles(i8, 1) +shuffles(i16, 2) +shuffles(float, 4) +shuffles(i32, 4) +shuffles(double, 8) +shuffles(i64, 8) +') + + +define(`mask_converts', ` +define 
internal <$1 x i8> @convertmask_i1_i8_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i1_i16_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i1_i32_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i1_i64_$1(<$1 x i1>) { + %r = sext <$1 x i1> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i8_i8_$1(<$1 x i8>) { + ret <$1 x i8> %0 +} +define internal <$1 x i16> @convertmask_i8_i86_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i8_i32_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i8_i64_$1(<$1 x i8>) { + %r = sext <$1 x i8> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i16_i8_$1(<$1 x i16>) { + %r = trunc <$1 x i16> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i16_i16_$1(<$1 x i16>) { + ret <$1 x i16> %0 +} +define internal <$1 x i32> @convertmask_i16_i32_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i16_i64_$1(<$1 x i16>) { + %r = sext <$1 x i16> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i32_i8_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i32_i16_$1(<$1 x i32>) { + %r = trunc <$1 x i32> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i32_i32_$1(<$1 x i32>) { + ret <$1 x i32> %0 +} +define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) { + %r = sext <$1 x i32> %0 to <$1 x i64> + ret <$1 x i64> %r +} + +define internal <$1 x i8> @convertmask_i64_i8_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i64_i16_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i64_i32_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i64_i64_$1(<$1 x i64>) { + ret <$1 x i64> %0 +} +') + +mask_converts(WIDTH) + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; count trailing zeros + +define(`ctlztz', ` +declare_count_zeros() + +define i32 @__count_trailing_zeros_i32(i32) nounwind readnone alwaysinline { + %c = call i32 @llvm.cttz.i32(i32 %0) + ret i32 %c +} + +define i64 @__count_trailing_zeros_i64(i64) nounwind readnone alwaysinline { + %c = call i64 @llvm.cttz.i64(i64 %0) + ret i64 %c +} + +define i32 @__count_leading_zeros_i32(i32) nounwind readnone alwaysinline { + %c = call i32 @llvm.ctlz.i32(i32 %0) + ret i32 %c +} + +define i64 @__count_leading_zeros_i64(i64) nounwind readnone alwaysinline { + %c = call i64 @llvm.ctlz.i64(i64 %0) + ret i64 %c +} +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefetching + +define(`define_prefetches', ` +declare void @llvm.prefetch(i8* nocapture %ptr, i32 %readwrite, i32 %locality, + i32 %cachetype) ; cachetype == 1 is dcache + +define void @__prefetch_read_uniform_1(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 3, i32 1) + ret void +} + +define void @__prefetch_read_uniform_2(i8 *) alwaysinline { 
+ call void @llvm.prefetch(i8 * %0, i32 0, i32 2, i32 1) + ret void +} + +define void @__prefetch_read_uniform_3(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 1, i32 1) + ret void +} + +define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { + call void @llvm.prefetch(i8 * %0, i32 0, i32 0, i32 1) + ret void +} +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; AOS/SOA conversion primitives + +;; take 4 4-wide vectors laid out like ... +;; and reorder them to ... + +define(`aossoa', ` +declare void +@__aos_to_soa4_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, + <4 x float> %v3, <4 x float> * noalias %out0, + <4 x float> * noalias %out1, <4 x float> * noalias %out2, + <4 x float> * noalias %out3) nounwind alwaysinline ; + +;; Do the reverse of __aos_to_soa4_float4--reorder .. +;; to ... +;; This is the exact same set of operations that __soa_to_soa4_float4 does +;; (a 4x4 transpose), so just call that... + +declare void +@__soa_to_aos4_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, + <4 x float> %v3, <4 x float> * noalias %out0, + <4 x float> * noalias %out1, <4 x float> * noalias %out2, + <4 x float> * noalias %out3) nounwind alwaysinline; + +;; Convert 3-wide AOS values to SOA--specifically, given 3 4-vectors +;; , transpose to +;; . + +declare void +@__aos_to_soa3_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, + <4 x float> * noalias %out0, <4 x float> * noalias %out1, + <4 x float> * noalias %out2) nounwind alwaysinline +;; The inverse of __aos_to_soa3_float4: convert 3 4-vectors +;; to +;; . + +declare void +@__soa_to_aos3_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, + <4 x float> * noalias %out0, <4 x float> * noalias %out1, + <4 x float> * noalias %out2) nounwind alwaysinline +;; 8-wide +;; These functions implement the 8-wide variants of the AOS/SOA conversion +;; routines above. These implementations are all built on top of the 4-wide +;; vector versions. 
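For reference, a scalar sketch of the semantics the 3-wide AOS-to-SOA conversions provide (plain CUDA/C++ reference code; the function name and template parameter N are illustrative, and the real routines are implemented with vector shuffles rather than a loop):

// x0 y0 z0 x1 y1 z1 ... in memory  ->  all x together, all y together, all z together
template <int N>
__host__ __device__ void aos_to_soa3_ref(const float *p,
                                         float *out0, float *out1, float *out2) {
    for (int i = 0; i < N; ++i) {
        out0[i] = p[3 * i + 0];   // x components
        out1[i] = p[3 * i + 1];   // y components
        out2[i] = p[3 * i + 2];   // z components
    }
}
// The __soa_to_aos3_* routines are the inverse: p[3*i + k] = out_k[i];
// the 4-wide variants do the same with a stride of 4 (x, y, z, w).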
+ +declare void +@__aos_to_soa4_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2, + <8 x float> %v3, <8 x float> * noalias %out0, + <8 x float> * noalias %out1, <8 x float> * noalias %out2, + <8 x float> * noalias %out3) nounwind alwaysinline + +declare void +@__soa_to_aos4_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2, + <8 x float> %v3, <8 x float> * noalias %out0, + <8 x float> * noalias %out1, <8 x float> * noalias %out2, + <8 x float> * noalias %out3) nounwind alwaysinline + +declare void +@__aos_to_soa3_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2, + <8 x float> * noalias %out0, <8 x float> * noalias %out1, + <8 x float> * noalias %out2) nounwind alwaysinline ; + + +declare void +@__soa_to_aos3_float8(<8 x float> %v0, <8 x float> %v1, <8 x float> %v2, + <8 x float> * noalias %out0, <8 x float> * noalias %out1, + <8 x float> * noalias %out2) nounwind alwaysinline ; + +;; 16-wide + +declare void +@__aos_to_soa4_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2, + <16 x float> %v3, <16 x float> * noalias %out0, + <16 x float> * noalias %out1, <16 x float> * noalias %out2, + <16 x float> * noalias %out3) nounwind alwaysinline ; + + +declare void +@__soa_to_aos4_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2, + <16 x float> %v3, <16 x float> * noalias %out0, + <16 x float> * noalias %out1, <16 x float> * noalias %out2, + <16 x float> * noalias %out3) nounwind alwaysinline ; + +declare void +@__aos_to_soa3_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2, + <16 x float> * noalias %out0, <16 x float> * noalias %out1, + <16 x float> * noalias %out2) nounwind alwaysinline ; + +declare void +@__soa_to_aos3_float16(<16 x float> %v0, <16 x float> %v1, <16 x float> %v2, + <16 x float> * noalias %out0, <16 x float> * noalias %out1, + <16 x float> * noalias %out2) nounwind alwaysinline ; + +;; versions to be called from stdlib + +declare void +@__aos_to_soa4_float(float * noalias %p, + * noalias %out0, * noalias %out1, + * noalias %out2, * noalias %out3) + nounwind alwaysinline ; + + +declare void +@__soa_to_aos4_float( %v0, %v1, %v2, + %v3, float * noalias %p) nounwind alwaysinline ; + + +declare void +@__aos_to_soa3_float(float * noalias %p, + * %out0, * %out1, + * %out2) nounwind alwaysinline ; + + +declare void +@__soa_to_aos3_float( %v0, %v1, %v2, + float * noalias %p) nounwind alwaysinline ; +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define(`masked_load_float_double', ` +define @__masked_load_float(i8 * %ptr, + %mask) readonly alwaysinline { + %v32 = call @__masked_load_i32(i8 * %ptr, %mask) + %vf = bitcast %v32 to + ret %vf +} + +define @__masked_load_double(i8 * %ptr, + %mask) readonly alwaysinline { + %v64 = call @__masked_load_i64(i8 * %ptr, %mask) + %vd = bitcast %v64 to + ret %vd +} + +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define(`masked_store_float_double', ` +define void @__masked_store_float( * nocapture, , + ) nounwind alwaysinline { + %ptr = bitcast * %0 to * + %val = bitcast %1 to + call void @__masked_store_i32( * %ptr, %val, %2) + ret void +} + + +define void @__masked_store_double( * nocapture, , + ) nounwind alwaysinline { + %ptr = bitcast * %0 to * + %val = bitcast %1 to + call void @__masked_store_i64( * %ptr, %val, %2) + ret void +} + +define void @__masked_store_blend_float( * nocapture, , + ) nounwind alwaysinline { + %ptr = bitcast * %0 to * + %val = bitcast %1 to + call void @__masked_store_blend_i32( * %ptr, %val, 
%2) + ret void +} + + +define void @__masked_store_blend_double( * nocapture, , + ) nounwind alwaysinline { + %ptr = bitcast * %0 to * + %val = bitcast %1 to + call void @__masked_store_blend_i64( * %ptr, %val, %2) + ret void +} +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + + +define(`stdlib_core', ` + +declare i32 @__fast_masked_vload() + +declare void @ISPCInstrument(i8*, i8*, i32, i64) nounwind + +declare i1 @__is_compile_time_constant_mask( %mask) +declare i1 @__is_compile_time_constant_uniform_int32(i32) +declare i1 @__is_compile_time_constant_varying_int32() + +; This function declares placeholder masked store functions for the +; front-end to use. +; +; void __pseudo_masked_store_i8 (uniform int8 *ptr, varying int8 values, mask) +; void __pseudo_masked_store_i16(uniform int16 *ptr, varying int16 values, mask) +; void __pseudo_masked_store_i32(uniform int32 *ptr, varying int32 values, mask) +; void __pseudo_masked_store_float(uniform float *ptr, varying float values, mask) +; void __pseudo_masked_store_i64(uniform int64 *ptr, varying int64 values, mask) +; void __pseudo_masked_store_double(uniform double *ptr, varying double values, mask) +; +; These in turn are converted to native masked stores or to regular +; stores (if the mask is all on) by the MaskedStoreOptPass optimization +; pass. + +declare void @__pseudo_masked_store_i8( * nocapture, , ) +declare void @__pseudo_masked_store_i16( * nocapture, , ) +declare void @__pseudo_masked_store_i32( * nocapture, , ) +declare void @__pseudo_masked_store_float( * nocapture, , ) +declare void @__pseudo_masked_store_i64( * nocapture, , ) +declare void @__pseudo_masked_store_double( * nocapture, , ) + +; Declare the pseudo-gather functions. When the ispc front-end needs +; to perform a gather, it generates a call to one of these functions, +; which ideally have these signatures: +; +; varying int8 __pseudo_gather_i8(varying int8 *, mask) +; varying int16 __pseudo_gather_i16(varying int16 *, mask) +; varying int32 __pseudo_gather_i32(varying int32 *, mask) +; varying float __pseudo_gather_float(varying float *, mask) +; varying int64 __pseudo_gather_i64(varying int64 *, mask) +; varying double __pseudo_gather_double(varying double *, mask) +; +; However, vectors of pointers weren not legal in LLVM until recently, so +; instead, it emits calls to functions that either take vectors of int32s +; or int64s, depending on the compilation target. + +declare @__pseudo_gather32_i8(, ) nounwind readonly +declare @__pseudo_gather32_i16(, ) nounwind readonly +declare @__pseudo_gather32_i32(, ) nounwind readonly +declare @__pseudo_gather32_float(, ) nounwind readonly +declare @__pseudo_gather32_i64(, ) nounwind readonly +declare @__pseudo_gather32_double(, ) nounwind readonly + +declare @__pseudo_gather64_i8(, ) nounwind readonly +declare @__pseudo_gather64_i16(, ) nounwind readonly +declare @__pseudo_gather64_i32(, ) nounwind readonly +declare @__pseudo_gather64_float(, ) nounwind readonly +declare @__pseudo_gather64_i64(, ) nounwind readonly +declare @__pseudo_gather64_double(, ) nounwind readonly + +; The ImproveMemoryOps optimization pass finds these calls and then +; tries to convert them to be calls to gather functions that take a uniform +; base pointer and then a varying integer offset, when possible. 
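+;
+; As a rough sketch of that rewrite (illustrative only, not emitted into the
+; generated module), an ispc gather such as "val = buf[off]" is first issued as
+; something like
+;
+;     %val = call <WIDTH x i32> @__pseudo_gather64_i32(<WIDTH x i64> %addrs,
+;                                                      <WIDTH x MASK> %mask)
+;
+; and, when the per-lane addresses decompose into a common base plus per-lane
+; offsets, the pass rewrites the call into one of the base+offset forms
+; declared below; which form is chosen depends on the compilation target, as
+; described next.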
+; +; For targets without a native gather instruction, it is best to factor the +; integer offsets like "{1/2/4/8} * varying_offset + constant_offset", +; where varying_offset includes non-compile time constant values, and +; constant_offset includes compile-time constant values. (The scalar loads +; generated in turn can then take advantage of the free offsetting and scale by +; 1/2/4/8 that is offered by the x86 addresisng modes.) +; +; varying int{8,16,32,float,64,double} +; __pseudo_gather_factored_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base, +; int{32,64} offsets, uniform int32 offset_scale, +; int{32,64} offset_delta, mask) +; +; For targets with a gather instruction, it is better to just factor them into +; a gather from a uniform base pointer and then "{1/2/4/8} * offsets", where the +; offsets are int32/64 vectors. +; +; varying int{8,16,32,float,64,double} +; __pseudo_gather_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base, +; uniform int32 offset_scale, int{32,64} offsets, mask) + + +declare +@__pseudo_gather_factored_base_offsets32_i8(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_i16(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_i32(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_float(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_i64(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_double(i8 *, , i32, , + ) nounwind readonly + +declare +@__pseudo_gather_factored_base_offsets64_i8(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_i16(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_i32(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_float(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_i64(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_double(i8 *, , i32, , + ) nounwind readonly + +declare +@__pseudo_gather_base_offsets32_i8(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_i16(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_i32(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_float(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_i64(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_double(i8 *, i32, , + ) nounwind readonly + +declare +@__pseudo_gather_base_offsets64_i8(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_i16(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_i32(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_float(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_i64(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_double(i8 *, i32, , + ) nounwind readonly + +; Similarly to the pseudo-gathers defined above, we also declare undefined +; pseudo-scatter instructions with signatures: +; +; void __pseudo_scatter_i8 (varying int8 *, varying int8 values, mask) +; void __pseudo_scatter_i16(varying int16 *, varying int16 values, mask) +; void __pseudo_scatter_i32(varying int32 *, varying int32 values, mask) +; void __pseudo_scatter_float(varying float *, 
varying float values, mask) +; void __pseudo_scatter_i64(varying int64 *, varying int64 values, mask) +; void __pseudo_scatter_double(varying double *, varying double values, mask) +; + +declare void @__pseudo_scatter32_i8(, , ) nounwind +declare void @__pseudo_scatter32_i16(, , ) nounwind +declare void @__pseudo_scatter32_i32(, , ) nounwind +declare void @__pseudo_scatter32_float(, , ) nounwind +declare void @__pseudo_scatter32_i64(, , ) nounwind +declare void @__pseudo_scatter32_double(, , ) nounwind + +declare void @__pseudo_scatter64_i8(, , ) nounwind +declare void @__pseudo_scatter64_i16(, , ) nounwind +declare void @__pseudo_scatter64_i32(, , ) nounwind +declare void @__pseudo_scatter64_float(, , ) nounwind +declare void @__pseudo_scatter64_i64(, , ) nounwind +declare void @__pseudo_scatter64_double(, , ) nounwind + +; And the ImproveMemoryOps optimization pass also finds these and +; either transforms them to scatters like: +; +; void __pseudo_scatter_factored_base_offsets{32,64}_i8(uniform int8 *base, +; varying int32 offsets, uniform int32 offset_scale, +; varying int{32,64} offset_delta, varying int8 values, mask) +; (and similarly for 16/32/64 bit values) +; +; Or, if the target has a native scatter instruction: +; +; void __pseudo_scatter_base_offsets{32,64}_i8(uniform int8 *base, +; uniform int32 offset_scale, varying int{32,64} offsets, +; varying int8 values, mask) +; (and similarly for 16/32/64 bit values) + +declare void +@__pseudo_scatter_factored_base_offsets32_i8(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i16(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i32(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_float(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i64(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_double(i8 * nocapture, , i32, , + , ) nounwind + +declare void +@__pseudo_scatter_factored_base_offsets64_i8(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_i16(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_i32(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_float(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_i64(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_double(i8 * nocapture, , i32, , + , ) nounwind + +declare void +@__pseudo_scatter_base_offsets32_i8(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_i16(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_i32(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_float(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_i64(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_double(i8 * nocapture, i32, , + , ) nounwind + +declare void +@__pseudo_scatter_base_offsets64_i8(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_i16(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_i32(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_float(i8 * 
nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_i64(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_double(i8 * nocapture, i32, , + , ) nounwind + +declare float @__log_uniform_float(float) nounwind readnone +declare @__log_varying_float() nounwind readnone +declare float @__exp_uniform_float(float) nounwind readnone +declare @__exp_varying_float() nounwind readnone +declare float @__pow_uniform_float(float, float) nounwind readnone +declare @__pow_varying_float(, ) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +declare void @__use8() +declare void @__use16() +declare void @__use32() +declare void @__usefloat() +declare void @__use64() +declare void @__usedouble() + +;; This is a temporary function that will be removed at the end of +;; compilation--the idea is that it calls out to all of the various +;; functions / pseudo-function declarations that we need to keep around +;; so that they are available to the various optimization passes. This +;; then prevents those functions from being removed as dead code when +;; we do early DCE... + +define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, + %v32, %v64, + %mask) { + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; loads + %ml8 = call @__masked_load_i8(i8 * %ptr, %mask) + call void @__use8( %ml8) + %ml16 = call @__masked_load_i16(i8 * %ptr, %mask) + call void @__use16( %ml16) + %ml32 = call @__masked_load_i32(i8 * %ptr, %mask) + call void @__use32( %ml32) + %mlf = call @__masked_load_float(i8 * %ptr, %mask) + call void @__usefloat( %mlf) + %ml64 = call @__masked_load_i64(i8 * %ptr, %mask) + call void @__use64( %ml64) + %mld = call @__masked_load_double(i8 * %ptr, %mask) + call void @__usedouble( %mld) + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; stores + %pv8 = bitcast i8 * %ptr to * + call void @__pseudo_masked_store_i8( * %pv8, %v8, + %mask) + %pv16 = bitcast i8 * %ptr to * + call void @__pseudo_masked_store_i16( * %pv16, %v16, + %mask) + %pv32 = bitcast i8 * %ptr to * + call void @__pseudo_masked_store_i32( * %pv32, %v32, + %mask) + %vf = bitcast %v32 to + %pvf = bitcast i8 * %ptr to * + call void @__pseudo_masked_store_float( * %pvf, %vf, + %mask) + %pv64 = bitcast i8 * %ptr to * + call void @__pseudo_masked_store_i64( * %pv64, %v64, + %mask) + %vd = bitcast %v64 to + %pvd = bitcast i8 * %ptr to * + call void @__pseudo_masked_store_double( * %pvd, %vd, + %mask) + + call void @__masked_store_i8( * %pv8, %v8, %mask) + call void @__masked_store_i16( * %pv16, %v16, %mask) + call void @__masked_store_i32( * %pv32, %v32, %mask) + call void @__masked_store_float( * %pvf, %vf, %mask) + call void @__masked_store_i64( * %pv64, %v64, %mask) + call void @__masked_store_double( * %pvd, %vd, %mask) + + call void @__masked_store_blend_i8( * %pv8, %v8, + %mask) + call void @__masked_store_blend_i16( * %pv16, %v16, + %mask) + call void @__masked_store_blend_i32( * %pv32, %v32, + %mask) + call void @__masked_store_blend_float( * %pvf, %vf, + %mask) + call void @__masked_store_blend_i64( * %pv64, %v64, + %mask) + call void @__masked_store_blend_double( * %pvd, %vd, + %mask) + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; gathers + + %pg32_8 = call @__pseudo_gather32_i8( %v32, + %mask) + call void @__use8( %pg32_8) + %pg32_16 = call @__pseudo_gather32_i16( %v32, + %mask) + call void @__use16( %pg32_16) + %pg32_32 = call 
@__pseudo_gather32_i32( %v32, + %mask) + call void @__use32( %pg32_32) + %pg32_f = call @__pseudo_gather32_float( %v32, + %mask) + call void @__usefloat( %pg32_f) + %pg32_64 = call @__pseudo_gather32_i64( %v32, + %mask) + call void @__use64( %pg32_64) + %pg32_d = call @__pseudo_gather32_double( %v32, + %mask) + call void @__usedouble( %pg32_d) + + %pg64_8 = call @__pseudo_gather64_i8( %v64, + %mask) + call void @__use8( %pg64_8) + %pg64_16 = call @__pseudo_gather64_i16( %v64, + %mask) + call void @__use16( %pg64_16) + %pg64_32 = call @__pseudo_gather64_i32( %v64, + %mask) + call void @__use32( %pg64_32) + %pg64_f = call @__pseudo_gather64_float( %v64, + %mask) + call void @__usefloat( %pg64_f) + %pg64_64 = call @__pseudo_gather64_i64( %v64, + %mask) + call void @__use64( %pg64_64) + %pg64_d = call @__pseudo_gather64_double( %v64, + %mask) + call void @__usedouble( %pg64_d) + + %g32_8 = call @__gather32_i8( %v32, + %mask) + call void @__use8( %g32_8) + %g32_16 = call @__gather32_i16( %v32, + %mask) + call void @__use16( %g32_16) + %g32_32 = call @__gather32_i32( %v32, + %mask) + call void @__use32( %g32_32) + %g32_f = call @__gather32_float( %v32, + %mask) + call void @__usefloat( %g32_f) + %g32_64 = call @__gather32_i64( %v32, + %mask) + call void @__use64( %g32_64) + %g32_d = call @__gather32_double( %v32, + %mask) + call void @__usedouble( %g32_d) + + %g64_8 = call @__gather64_i8( %v64, + %mask) + call void @__use8( %g64_8) + %g64_16 = call @__gather64_i16( %v64, + %mask) + call void @__use16( %g64_16) + %g64_32 = call @__gather64_i32( %v64, + %mask) + call void @__use32( %g64_32) + %g64_f = call @__gather64_float( %v64, + %mask) + call void @__usefloat( %g64_f) + %g64_64 = call @__gather64_i64( %v64, + %mask) + call void @__use64( %g64_64) + %g64_d = call @__gather64_double( %v64, + %mask) + call void @__usedouble( %g64_d) + +ifelse(HAVE_GATHER, `1', +` + %nfpgbo32_8 = call + @__pseudo_gather_base_offsets32_i8(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use8( %nfpgbo32_8) + %nfpgbo32_16 = call + @__pseudo_gather_base_offsets32_i16(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use16( %nfpgbo32_16) + %nfpgbo32_32 = call + @__pseudo_gather_base_offsets32_i32(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use32( %nfpgbo32_32) + %nfpgbo32_f = call + @__pseudo_gather_base_offsets32_float(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usefloat( %nfpgbo32_f) + %nfpgbo32_64 = call + @__pseudo_gather_base_offsets32_i64(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use64( %nfpgbo32_64) + %nfpgbo32_d = call + @__pseudo_gather_base_offsets32_double(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usedouble( %nfpgbo32_d) + + %nfpgbo64_8 = call + @__pseudo_gather_base_offsets64_i8(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use8( %nfpgbo64_8) + %nfpgbo64_16 = call + @__pseudo_gather_base_offsets64_i16(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use16( %nfpgbo64_16) + %nfpgbo64_32 = call + @__pseudo_gather_base_offsets64_i32(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use32( %nfpgbo64_32) + %nfpgbo64_f = call + @__pseudo_gather_base_offsets64_float(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usefloat( %nfpgbo64_f) + %nfpgbo64_64 = call + @__pseudo_gather_base_offsets64_i64(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use64( %nfpgbo64_64) + %nfpgbo64_d = call + @__pseudo_gather_base_offsets64_double(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usedouble( %nfpgbo64_d) + + %nfgbo32_8 = call + @__gather_base_offsets32_i8(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use8( 
%nfgbo32_8) + %nfgbo32_16 = call + @__gather_base_offsets32_i16(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use16( %nfgbo32_16) + %nfgbo32_32 = call + @__gather_base_offsets32_i32(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use32( %nfgbo32_32) + %nfgbo32_f = call + @__gather_base_offsets32_float(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usefloat( %nfgbo32_f) + %nfgbo32_64 = call + @__gather_base_offsets32_i64(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use64( %nfgbo32_64) + %nfgbo32_d = call + @__gather_base_offsets32_double(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usedouble( %nfgbo32_d) + + %nfgbo64_8 = call + @__gather_base_offsets64_i8(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use8( %nfgbo64_8) + %nfgbo64_16 = call + @__gather_base_offsets64_i16(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use16( %nfgbo64_16) + %nfgbo64_32 = call + @__gather_base_offsets64_i32(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use32( %nfgbo64_32) + %nfgbo64_f = call + @__gather_base_offsets64_float(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usefloat( %nfgbo64_f) + %nfgbo64_64 = call + @__gather_base_offsets64_i64(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use64( %nfgbo64_64) + %nfgbo64_d = call + @__gather_base_offsets64_double(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usedouble( %nfgbo64_d) +', +` + %pgbo32_8 = call + @__pseudo_gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use8( %pgbo32_8) + %pgbo32_16 = call + @__pseudo_gather_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use16( %pgbo32_16) + %pgbo32_32 = call + @__pseudo_gather_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use32( %pgbo32_32) + %pgbo32_f = call + @__pseudo_gather_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__usefloat( %pgbo32_f) + %pgbo32_64 = call + @__pseudo_gather_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use64( %pgbo32_64) + %pgbo32_d = call + @__pseudo_gather_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__usedouble( %pgbo32_d) + + %pgbo64_8 = call + @__pseudo_gather_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__use8( %pgbo64_8) + %pgbo64_16 = call + @__pseudo_gather_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__use16( %pgbo64_16) + %pgbo64_32 = call + @__pseudo_gather_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__use32( %pgbo64_32) + %pgbo64_f = call + @__pseudo_gather_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__usefloat( %pgbo64_f) + %pgbo64_64 = call + @__pseudo_gather_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__use64( %pgbo64_64) + %pgbo64_d = call + @__pseudo_gather_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__usedouble( %pgbo64_d) + + %gbo32_8 = call + @__gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use8( %gbo32_8) + %gbo32_16 = call + @__gather_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use16( %gbo32_16) + %gbo32_32 = call + @__gather_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use32( %gbo32_32) + %gbo32_f = call + @__gather_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void 
@__usefloat( %gbo32_f) + %gbo32_64 = call + @__gather_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use64( %gbo32_64) + %gbo32_d = call + @__gather_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__usedouble( %gbo32_d) + + %gbo64_8 = call + @__gather_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__use8( %gbo64_8) + %gbo64_16 = call + @__gather_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__use16( %gbo64_16) + %gbo64_32 = call + @__gather_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__use32( %gbo64_32) + %gbo64_f = call + @__gather_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__usefloat( %gbo64_f) + %gbo64_64 = call + @__gather_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__use64( %gbo64_64) + %gbo64_d = call + @__gather_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__usedouble( %pgbo64_d) +') + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; scatters + + call void @__pseudo_scatter32_i8( %v32, %v8, %mask) + call void @__pseudo_scatter32_i16( %v32, %v16, %mask) + call void @__pseudo_scatter32_i32( %v32, %v32, %mask) + call void @__pseudo_scatter32_float( %v32, %vf, %mask) + call void @__pseudo_scatter32_i64( %v32, %v64, %mask) + call void @__pseudo_scatter32_double( %v32, %vd, %mask) + + call void @__pseudo_scatter64_i8( %v64, %v8, %mask) + call void @__pseudo_scatter64_i16( %v64, %v16, %mask) + call void @__pseudo_scatter64_i32( %v64, %v32, %mask) + call void @__pseudo_scatter64_float( %v64, %vf, %mask) + call void @__pseudo_scatter64_i64( %v64, %v64, %mask) + call void @__pseudo_scatter64_double( %v64, %vd, %mask) + + call void @__scatter32_i8( %v32, %v8, %mask) + call void @__scatter32_i16( %v32, %v16, %mask) + call void @__scatter32_i32( %v32, %v32, %mask) + call void @__scatter32_float( %v32, %vf, %mask) + call void @__scatter32_i64( %v32, %v64, %mask) + call void @__scatter32_double( %v32, %vd, %mask) + + call void @__scatter64_i8( %v64, %v8, %mask) + call void @__scatter64_i16( %v64, %v16, %mask) + call void @__scatter64_i32( %v64, %v32, %mask) + call void @__scatter64_float( %v64, %vf, %mask) + call void @__scatter64_i64( %v64, %v64, %mask) + call void @__scatter64_double( %v64, %vd, %mask) + +ifelse(HAVE_SCATTER, `1', +` + call void @__pseudo_scatter_base_offsets32_i8(i8 * %ptr, i32 0, %v32, + %v8, %mask) + call void @__pseudo_scatter_base_offsets32_i16(i8 * %ptr, i32 0, %v32, + %v16, %mask) + call void @__pseudo_scatter_base_offsets32_i32(i8 * %ptr, i32 0, %v32, + %v32, %mask) + call void @__pseudo_scatter_base_offsets32_float(i8 * %ptr, i32 0, %v32, + %vf, %mask) + call void @__pseudo_scatter_base_offsets32_i64(i8 * %ptr, i32 0, %v32, + %v64, %mask) + call void @__pseudo_scatter_base_offsets32_double(i8 * %ptr, i32 0, %v32, + %vd, %mask) + + call void @__pseudo_scatter_base_offsets64_i8(i8 * %ptr, i32 0, %v64, + %v8, %mask) + call void @__pseudo_scatter_base_offsets64_i16(i8 * %ptr, i32 0, %v64, + %v16, %mask) + call void @__pseudo_scatter_base_offsets64_i32(i8 * %ptr, i32 0, %v64, + %v32, %mask) + call void @__pseudo_scatter_base_offsets64_float(i8 * %ptr, i32 0, %v64, + %vf, %mask) + call void @__pseudo_scatter_base_offsets64_i64(i8 * %ptr, i32 0, %v64, + %v64, %mask) + call void @__pseudo_scatter_base_offsets64_double(i8 * %ptr, i32 0, %v64, + %vd, %mask) + + call 
void @__scatter_base_offsets32_i8(i8 * %ptr, i32 0, %v32, + %v8, %mask) + call void @__scatter_base_offsets32_i16(i8 * %ptr, i32 0, %v32, + %v16, %mask) + call void @__scatter_base_offsets32_i32(i8 * %ptr, i32 0, %v32, + %v32, %mask) + call void @__scatter_base_offsets32_float(i8 * %ptr, i32 0, %v32, + %vf, %mask) + call void @__scatter_base_offsets32_i64(i8 * %ptr, i32 0, %v32, + %v64, %mask) + call void @__scatter_base_offsets32_double(i8 * %ptr, i32 0, %v32, + %vd, %mask) + + call void @__scatter_base_offsets64_i8(i8 * %ptr, i32 0, %v64, + %v8, %mask) + call void @__scatter_base_offsets64_i16(i8 * %ptr, i32 0, %v64, + %v16, %mask) + call void @__scatter_base_offsets64_i32(i8 * %ptr, i32 0, %v64, + %v32, %mask) + call void @__scatter_base_offsets64_float(i8 * %ptr, i32 0, %v64, + %vf, %mask) + call void @__scatter_base_offsets64_i64(i8 * %ptr, i32 0, %v64, + %v64, %mask) + call void @__scatter_base_offsets64_double(i8 * %ptr, i32 0, %v64, + %vd, %mask) +', +` + call void @__pseudo_scatter_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, + %v8, %mask) + call void @__pseudo_scatter_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, + %v16, %mask) + call void @__pseudo_scatter_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, + %v32, %mask) + call void @__pseudo_scatter_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, + %vf, %mask) + call void @__pseudo_scatter_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, + %v64, %mask) + call void @__pseudo_scatter_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, + %vd, %mask) + + call void @__pseudo_scatter_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, + %v8, %mask) + call void @__pseudo_scatter_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, + %v16, %mask) + call void @__pseudo_scatter_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, + %v32, %mask) + call void @__pseudo_scatter_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, + %vf, %mask) + call void @__pseudo_scatter_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, + %v64, %mask) + call void @__pseudo_scatter_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, + %vd, %mask) + + call void @__scatter_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, + %v8, %mask) + call void @__scatter_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, + %v16, %mask) + call void @__scatter_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, + %v32, %mask) + call void @__scatter_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, + %vf, %mask) + call void @__scatter_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, + %v64, %mask) + call void @__scatter_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, + %vd, %mask) + + call void @__scatter_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, + %v8, %mask) + call void @__scatter_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, + %v16, %mask) + call void @__scatter_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, + %v32, %mask) + call void @__scatter_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, + %vf, %mask) + call void @__scatter_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, + %v64, %mask) + call void @__scatter_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, + %vd, %mask) +') + + ret void +} + + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; various bitcasts from one type to another + +define 
@__intbits_varying_float() nounwind readnone alwaysinline { + %float_to_int_bitcast = bitcast %0 to + ret %float_to_int_bitcast +} + +define i32 @__intbits_uniform_float(float) nounwind readnone alwaysinline { + %float_to_int_bitcast = bitcast float %0 to i32 + ret i32 %float_to_int_bitcast +} + +define @__intbits_varying_double() nounwind readnone alwaysinline { + %double_to_int_bitcast = bitcast %0 to + ret %double_to_int_bitcast +} + +define i64 @__intbits_uniform_double(double) nounwind readnone alwaysinline { + %double_to_int_bitcast = bitcast double %0 to i64 + ret i64 %double_to_int_bitcast +} + +define @__floatbits_varying_int32() nounwind readnone alwaysinline { + %int_to_float_bitcast = bitcast %0 to + ret %int_to_float_bitcast +} + +define float @__floatbits_uniform_int32(i32) nounwind readnone alwaysinline { + %int_to_float_bitcast = bitcast i32 %0 to float + ret float %int_to_float_bitcast +} + +define @__doublebits_varying_int64() nounwind readnone alwaysinline { + %int_to_double_bitcast = bitcast %0 to + ret %int_to_double_bitcast +} + +define double @__doublebits_uniform_int64(i64) nounwind readnone alwaysinline { + %int_to_double_bitcast = bitcast i64 %0 to double + ret double %int_to_double_bitcast +} + +define @__undef_varying() nounwind readnone alwaysinline { + ret undef +} + +define float @__undef_uniform() nounwind readnone alwaysinline { + ret float undef +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; sign extension + +define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline { + %r = sext i1 %0 to i32 + ret i32 %r +} + +define @__sext_varying_bool() nounwind readnone alwaysinline { +;; ifelse(MASK,i32, `ret %0', +;; `%se = sext %0 to +;; ret %se') + ifelse(MASK,i32, `%se = bitcast %0 to ', + MASK,i64, `%se = trunc %0 to ', + `%se = sext %0 to ') + ret %se +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; memcpy/memmove/memset + +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %src, + i32 %len, i32 %align, i1 %isvolatile) +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* %dest, i8* %src, + i64 %len, i32 %align, i1 %isvolatile) + +declare void @__memcpy32(i8 * %dst, i8 * %src, i32 %len) alwaysinline; +declare void @__memcpy64(i8 * %dst, i8 * %src, i64 %len) alwaysinline; + +declare void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %src, + i32 %len, i32 %align, i1 %isvolatile) +declare void @llvm.memmove.p0i8.p0i8.i64(i8* %dest, i8* %src, + i64 %len, i32 %align, i1 %isvolatile) + +declare void @__memmove32(i8 * %dst, i8 * %src, i32 %len) alwaysinline; +declare void @__memmove64(i8 * %dst, i8 * %src, i64 %len) alwaysinline + +declare void @llvm.memset.p0i8.i32(i8* %dest, i8 %val, i32 %len, i32 %align, + i1 %isvolatile) +declare void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 %len, i32 %align, + i1 %isvolatile) + +declare void @__memset32(i8 * %dst, i8 %val, i32 %len) alwaysinline ; +declare void @__memset64(i8 * %dst, i8 %val, i64 %len) alwaysinline; + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; new/delete + +;; Set of functions for 32 bit runtime. 
+;; They are different for Windows and Unix (Linux/MacOS), +;; on Windows we have to use _aligned_malloc/_aligned_free, +;; while on Unix we use posix_memalign/free +;; +;; Note that this should be really two different libraries for 32 and 64 +;; environment and it should happen sooner or later + +ifelse(WIDTH, 1, `define(`ALIGNMENT', `16')', `define(`ALIGNMENT', `eval(WIDTH*4)')') + +@memory_alignment = internal constant i32 ALIGNMENT + +ifelse(BUILD_OS, `UNIX', +` + +ifelse(RUNTIME, `32', +` + +;; Unix 32 bit environment. +;; Use: posix_memalign and free +;; Define: +;; - __new_uniform_32rt +;; - __new_varying32_32rt +;; - __delete_uniform_32rt +;; - __delete_varying_32rt + +declare i8* @malloc(i32) +declare i32 @posix_memalign(i8**, i32, i32) +declare void @free(i8 *) + +declare noalias i8 * @__new_uniform_32rt(i64 %size); +declare @__new_varying32_32rt( %size, %mask); +declare void @__delete_uniform_32rt(i8 * %ptr); +declare void @__delete_varying_32rt( %ptr, %mask); + +', +RUNTIME, `64', +` + +;; Unix 64 bit environment. +;; Use: posix_memalign and free +;; Define: +;; - __new_uniform_64rt +;; - __new_varying32_64rt +;; - __new_varying64_64rt +;; - __delete_uniform_64rt +;; - __delete_varying_64rt + +declare i8* @malloc(i64) +declare void @free(i8 *) + +define noalias i8 * @__new_uniform_64rt(i64 %size) +{ +entry: +;; compute laneIdx = __tid_x() & (__warpsize() - 1) + %and = call i32 @__program_index() +;; if (laneIdx == 0) + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %call2 = tail call noalias i8* @malloc(i64 %size) + %phitmp = ptrtoint i8* %call2 to i64 + br label %if.end + +if.end: ; preds = %if.then, %entry + %ptr.0 = phi i64 [ %phitmp, %if.then ], [ undef, %entry ] + %val.sroa.0.0.extract.trunc = trunc i64 %ptr.0 to i32 + %call3 = tail call i32 @__shfl_i32_nvptx(i32 %val.sroa.0.0.extract.trunc, i32 0) + %val.sroa.0.0.insert.ext = zext i32 %call3 to i64 + %val.sroa.0.4.extract.shift = lshr i64 %ptr.0, 32 + %val.sroa.0.4.extract.trunc = trunc i64 %val.sroa.0.4.extract.shift to i32 + %call8 = tail call i32 @__shfl_i32_nvptx(i32 %val.sroa.0.4.extract.trunc, i32 0) + %val.sroa.0.4.insert.ext = zext i32 %call8 to i64 + %val.sroa.0.4.insert.shift = shl nuw i64 %val.sroa.0.4.insert.ext, 32 + %val.sroa.0.4.insert.insert = or i64 %val.sroa.0.4.insert.shift, %val.sroa.0.0.insert.ext + %0 = inttoptr i64 %val.sroa.0.4.insert.insert to i8* + ret i8* %0 +} +define void @__delete_uniform_64rt(i8 * %ptr) +{ +entry: + %and = call i32 @__program_index() + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + tail call void @free(i8* %ptr) + br label %if.end + +if.end: ; preds = %if.then, %entry + ret void +} + +define <1 x i64> @__new_varying32_64rt(<1 x i32> %sizev, <1 x i1> %maskv) +{ +entry: + %size32 = extractelement <1 x i32> %sizev, i32 0 + %mask = extractelement <1 x i1> %maskv, i32 0 + %size64 = zext i32 %size32 to i64 + br i1 %mask, label %alloc, label %skip + +alloc: + %ptr = tail call noalias i8* @malloc(i64 %size64) + %addr1 = ptrtoint i8* %ptr to i64 + br label %skip + +skip: + %addr64 = phi i64 [ %addr1, %alloc], [ 0, %entry ] + %addr = insertelement <1 x i64> undef, i64 %addr64, i32 0 + ret <1 x i64> %addr +} + +define <1 x i64> @__new_varying64_64rt(<1 x i64> %sizev, <1 x i1> %maskv) +{ +entry: + %size64 = extractelement <1 x i64> %sizev, i32 0 + %mask = extractelement <1 x i1> %maskv, i32 0 + br i1 %mask, label %alloc, label %skip + +alloc: + %ptr = tail call noalias i8* 
@malloc(i64 %size64) + %addr1 = ptrtoint i8* %ptr to i64 + br label %skip + +skip: + %addr64 = phi i64 [ %addr1, %alloc], [ 0, %entry ] + %addr = insertelement <1 x i64> undef, i64 %addr64, i32 0 + ret <1 x i64> %addr +} + +define void @__delete_varying_64rt(<1 x i64> %ptrv, <1 x i1> %maskv) +{ +entry: + %addr64 = extractelement <1 x i64> %ptrv, i32 0 + %mask = extractelement <1 x i1> %maskv, i32 0 + br i1 %mask, label %free, label %skip + +free: + %ptr = inttoptr i64 %addr64 to i8* + tail call void @free(i8* %ptr) + br label %skip + +skip: + ret void +} +', ` +errprint(`RUNTIME should be defined to either 32 or 64 +') +m4exit(`1') +') + +', +BUILD_OS, `WINDOWS', +` + +ifelse(RUNTIME, `32', +` + +;; Windows 32 bit environment. +;; Use: _aligned_malloc and _aligned_free +;; Define: +;; - __new_uniform_32rt +;; - __new_varying32_32rt +;; - __delete_uniform_32rt +;; - __delete_varying_32rt + +declare i8* @_aligned_malloc(i32, i32) +declare void @_aligned_free(i8 *) + +define noalias i8 * @__new_uniform_32rt(i64 %size) { + %conv = trunc i64 %size to i32 + %alignment = load i32* @memory_alignment + %ptr = tail call i8* @_aligned_malloc(i32 %conv, i32 %alignment) + ret i8* %ptr +} + +define @__new_varying32_32rt( %size, %mask) { + %ret = alloca + store zeroinitializer, * %ret + %ret64 = bitcast * %ret to i64 * + %alignment = load i32* @memory_alignment + + per_lane(WIDTH, %mask, ` + %sz_LANE_ID = extractelement %size, i32 LANE + %ptr_LANE_ID = call noalias i8 * @_aligned_malloc(i32 %sz_LANE_ID, i32 %alignment) + %ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64 + %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE + store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID') + + %r = load * %ret + ret %r +} + +define void @__delete_uniform_32rt(i8 * %ptr) { + call void @_aligned_free(i8 * %ptr) + ret void +} + +define void @__delete_varying_32rt( %ptr, %mask) { + per_lane(WIDTH, %mask, ` + %iptr_LANE_ID = extractelement %ptr, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8 * + call void @_aligned_free(i8 * %ptr_LANE_ID) + ') + ret void +} + +', +RUNTIME, `64', +` + +;; Windows 64 bit environment. 
+;; Use: _aligned_malloc and _aligned_free +;; Define: +;; - __new_uniform_64rt +;; - __new_varying32_64rt +;; - __new_varying64_64rt +;; - __delete_uniform_64rt +;; - __delete_varying_64rt + +declare i8* @_aligned_malloc(i64, i64) +declare void @_aligned_free(i8 *) + +define noalias i8 * @__new_uniform_64rt(i64 %size) { + %alignment = load i32* @memory_alignment + %alignment64 = sext i32 %alignment to i64 + %ptr = tail call i8* @_aligned_malloc(i64 %size, i64 %alignment64) + ret i8* %ptr +} + +define @__new_varying32_64rt( %size, %mask) { + %ret = alloca + store zeroinitializer, * %ret + %ret64 = bitcast * %ret to i64 * + %alignment = load i32* @memory_alignment + %alignment64 = sext i32 %alignment to i64 + + per_lane(WIDTH, %mask, ` + %sz_LANE_ID = extractelement %size, i32 LANE + %sz64_LANE_ID = zext i32 %sz_LANE_ID to i64 + %ptr_LANE_ID = call noalias i8 * @_aligned_malloc(i64 %sz64_LANE_ID, i64 %alignment64) + %ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64 + %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE + store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID') + + %r = load * %ret + ret %r +} + +define @__new_varying64_64rt( %size, %mask) { + %ret = alloca + store zeroinitializer, * %ret + %ret64 = bitcast * %ret to i64 * + %alignment = load i32* @memory_alignment + %alignment64 = sext i32 %alignment to i64 + + per_lane(WIDTH, %mask, ` + %sz64_LANE_ID = extractelement %size, i32 LANE + %ptr_LANE_ID = call noalias i8 * @_aligned_malloc(i64 %sz64_LANE_ID, i64 %alignment64) + %ptr_int_LANE_ID = ptrtoint i8 * %ptr_LANE_ID to i64 + %store_LANE_ID = getelementptr i64 * %ret64, i32 LANE + store i64 %ptr_int_LANE_ID, i64 * %store_LANE_ID') + + %r = load * %ret + ret %r +} + +define void @__delete_uniform_64rt(i8 * %ptr) { + call void @_aligned_free(i8 * %ptr) + ret void +} + +define void @__delete_varying_64rt( %ptr, %mask) { + per_lane(WIDTH, %mask, ` + %iptr_LANE_ID = extractelement %ptr, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to i8 * + call void @_aligned_free(i8 * %ptr_LANE_ID) + ') + ret void +} + +', ` +errprint(`RUNTIME should be defined to either 32 or 64 +') +m4exit(`1') +') + +', +` +errprint(`BUILD_OS should be defined to either UNIX or WINDOWS +') +m4exit(`1') +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; read hw clock + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; stdlib transcendentals +;; +;; These functions provide entrypoints that call out to the libm +;; implementations of the transcendental functions +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +declare float @sinf(float) nounwind readnone +declare float @cosf(float) nounwind readnone +declare void @sincosf(float, float *, float *) nounwind readnone +declare float @asinf(float) nounwind readnone +declare float @acosf(float) nounwind readnone +declare float @tanf(float) nounwind readnone +declare float @atanf(float) nounwind readnone +declare float @atan2f(float, float) nounwind readnone +declare float @expf(float) nounwind readnone +declare float @logf(float) nounwind readnone +declare float @powf(float, float) nounwind readnone + +define float @__stdlib_sinf(float) nounwind readnone alwaysinline { + %r = call float @sinf(float %0) + ret float %r +} + +define float @__stdlib_cosf(float) nounwind readnone alwaysinline { + %r = call float @cosf(float %0) + ret float %r +} + +define void @__stdlib_sincosf(float, float *, float *) nounwind readnone alwaysinline { + call void @sincosf(float %0, 
float *%1, float *%2) + ret void +} + +define float @__stdlib_asinf(float) nounwind readnone alwaysinline { + %r = call float @asinf(float %0) + ret float %r +} + +define float @__stdlib_acosf(float) nounwind readnone alwaysinline { + %r = call float @acosf(float %0) + ret float %r +} + +define float @__stdlib_tanf(float) nounwind readnone alwaysinline { + %r = call float @tanf(float %0) + ret float %r +} + +define float @__stdlib_atanf(float) nounwind readnone alwaysinline { + %r = call float @atanf(float %0) + ret float %r +} + +define float @__stdlib_atan2f(float, float) nounwind readnone alwaysinline { + %r = call float @atan2f(float %0, float %1) + ret float %r +} + +define float @__stdlib_logf(float) nounwind readnone alwaysinline { + %r = call float @logf(float %0) + ret float %r +} + +define float @__stdlib_expf(float) nounwind readnone alwaysinline { + %r = call float @expf(float %0) + ret float %r +} + +define float @__stdlib_powf(float, float) nounwind readnone alwaysinline { + %r = call float @powf(float %0, float %1) + ret float %r +} + +declare double @sin(double) nounwind readnone +declare double @asin(double) nounwind readnone +declare double @cos(double) nounwind readnone +declare void @sincos(double, double *, double *) nounwind readnone +declare double @tan(double) nounwind readnone +declare double @atan(double) nounwind readnone +declare double @atan2(double, double) nounwind readnone +declare double @exp(double) nounwind readnone +declare double @log(double) nounwind readnone +declare double @pow(double, double) nounwind readnone + +define double @__stdlib_sin(double) nounwind readnone alwaysinline { + %r = call double @sin(double %0) + ret double %r +} + +define double @__stdlib_asin(double) nounwind readnone alwaysinline { + %r = call double @asin(double %0) + ret double %r +} + +define double @__stdlib_cos(double) nounwind readnone alwaysinline { + %r = call double @cos(double %0) + ret double %r +} + +define void @__stdlib_sincos(double, double *, double *) nounwind readnone alwaysinline { + call void @sincos(double %0, double *%1, double *%2) + ret void +} + +define double @__stdlib_tan(double) nounwind readnone alwaysinline { + %r = call double @tan(double %0) + ret double %r +} + +define double @__stdlib_atan(double) nounwind readnone alwaysinline { + %r = call double @atan(double %0) + ret double %r +} + +define double @__stdlib_atan2(double, double) nounwind readnone alwaysinline { + %r = call double @atan2(double %0, double %1) + ret double %r +} + +define double @__stdlib_log(double) nounwind readnone alwaysinline { + %r = call double @log(double %0) + ret double %r +} + +define double @__stdlib_exp(double) nounwind readnone alwaysinline { + %r = call double @exp(double %0) + ret double %r +} + +define double @__stdlib_pow(double, double) nounwind readnone alwaysinline { + %r = call double @pow(double %0, double %1) + ret double %r +} + + +') + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; 64-bit integer min and max functions + +;; utility function used by int64minmax below. This shouldn't be called by +;; target .ll files directly. 
+;; $1: target vector width +;; $2: {min,max} (used in constructing function names) +;; $3: {int64,uint64} (used in constructing function names) +;; $4: {slt,sgt} comparison operator to used + +define(`i64minmax', ` +define i64 @__$2_uniform_$3(i64, i64) nounwind alwaysinline readnone { + %c = icmp $4 i64 %0, %1 + %r = select i1 %c, i64 %0, i64 %1 + ret i64 %r +} + +define <$1 x i64> @__$2_varying_$3(<$1 x i64>, <$1 x i64>) nounwind alwaysinline readnone { + %rptr = alloca <$1 x i64> + %r64ptr = bitcast <$1 x i64> * %rptr to i64 * + + forloop(i, 0, eval($1-1), ` + %v0_`'i = extractelement <$1 x i64> %0, i32 i + %v1_`'i = extractelement <$1 x i64> %1, i32 i + %c_`'i = icmp $4 i64 %v0_`'i, %v1_`'i + %v_`'i = select i1 %c_`'i, i64 %v0_`'i, i64 %v1_`'i + %ptr_`'i = getelementptr i64 * %r64ptr, i32 i + store i64 %v_`'i, i64 * %ptr_`'i +') + + %ret = load <$1 x i64> * %rptr + ret <$1 x i64> %ret +} +') + +;; this is the function that target .ll files should call; it just takes the target +;; vector width as a parameter + +define(`int64minmax', ` +i64minmax(WIDTH,min,int64,slt) +i64minmax(WIDTH,max,int64,sgt) +i64minmax(WIDTH,min,uint64,ult) +i64minmax(WIDTH,max,uint64,ugt) +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Emit general-purpose code to do a masked load for targets that dont have +;; an instruction to do that. Parameters: +;; $1: element type for which to emit the function (i32, i64, ...) (and suffix for function name) +;; $2: alignment for elements of type $1 (4, 8, ...) + +define(`masked_load', ` +define @__masked_load_$1(i8 *, %mask) nounwind alwaysinline { +entry: + %mm = call i64 @__movmsk( %mask) + + ; if the first lane and the last lane are on, then it is safe to do a vector load + ; of the whole thing--what the lanes in the middle want turns out to not matter... + %mm_and_low = and i64 %mm, 1 + %mm_and_high = and i64 %mm, MASK_HIGH_BIT_ON + %mm_and_high_shift = lshr i64 %mm_and_high, eval(WIDTH-1) + %mm_and_low_i1 = trunc i64 %mm_and_low to i1 + %mm_and_high_shift_i1 = trunc i64 %mm_and_high_shift to i1 + %can_vload = and i1 %mm_and_low_i1, %mm_and_high_shift_i1 + + %fast32 = call i32 @__fast_masked_vload() + %fast_i1 = trunc i32 %fast32 to i1 + %can_vload_maybe_fast = or i1 %fast_i1, %can_vload + + ; if we are not able to do a singe vload, we will accumulate lanes in this memory.. + %retptr = alloca + %retptr32 = bitcast * %retptr to $1 * + br i1 %can_vload_maybe_fast, label %load, label %loop + +load: + %ptr = bitcast i8 * %0 to * + %valall = load * %ptr, align $2 + ret %valall + +loop: + ; loop over the lanes and see if each one is on... + %lane = phi i32 [ 0, %entry ], [ %next_lane, %lane_done ] + %lane64 = zext i32 %lane to i64 + %lanemask = shl i64 1, %lane64 + %mask_and = and i64 %mm, %lanemask + %do_lane = icmp ne i64 %mask_and, 0 + br i1 %do_lane, label %load_lane, label %lane_done + +load_lane: + ; yes! 
do the load and store the result into the appropriate place in the + ; allocaed memory above + %ptr32 = bitcast i8 * %0 to $1 * + %lane_ptr = getelementptr $1 * %ptr32, i32 %lane + %val = load $1 * %lane_ptr + %store_ptr = getelementptr $1 * %retptr32, i32 %lane + store $1 %val, $1 * %store_ptr + br label %lane_done + +lane_done: + %next_lane = add i32 %lane, 1 + %done = icmp eq i32 %lane, eval(WIDTH-1) + br i1 %done, label %return, label %loop + +return: + %r = load * %retptr + ret %r +} +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; masked store +;; emit code to do masked store as a set of per-lane scalar stores +;; parameters: +;; $1: llvm type of elements (and suffix for function name) + +define(`gen_masked_store', ` +define void @__masked_store_$1(* nocapture, , ) nounwind alwaysinline { + per_lane(WIDTH, %2, ` + %ptr_LANE_ID = getelementptr * %0, i32 0, i32 LANE + %storeval_LANE_ID = extractelement %1, i32 LANE + store $1 %storeval_LANE_ID, $1 * %ptr_LANE_ID') + ret void +} +') + +define(`masked_store_blend_8_16_by_4', ` +define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>, + <4 x i32>) nounwind alwaysinline { + %old = load <4 x i8> * %0, align 1 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old32 = bitcast <4 x i8> %old to i32 + %new32 = bitcast <4 x i8> %1 to i32 + + %mask8 = trunc <4 x i32> %2 to <4 x i8> + %mask32 = bitcast <4 x i8> %mask8 to i32 + %notmask32 = xor i32 %mask32, -1 + + %newmasked = and i32 %new32, %mask32 + %oldmasked = and i32 %old32, %notmask32 + %result = or i32 %newmasked, %oldmasked + + %resultvec = bitcast i32 %result to <4 x i8> + ',` + %m = trunc <4 x i32> %2 to <4 x i1> + %resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old + ') + store <4 x i8> %resultvec, <4 x i8> * %0, align 1 + ret void +} + +define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>, + <4 x i32>) nounwind alwaysinline { + %old = load <4 x i16> * %0, align 2 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old64 = bitcast <4 x i16> %old to i64 + %new64 = bitcast <4 x i16> %1 to i64 + + %mask16 = trunc <4 x i32> %2 to <4 x i16> + %mask64 = bitcast <4 x i16> %mask16 to i64 + %notmask64 = xor i64 %mask64, -1 + + %newmasked = and i64 %new64, %mask64 + %oldmasked = and i64 %old64, %notmask64 + %result = or i64 %newmasked, %oldmasked + + %resultvec = bitcast i64 %result to <4 x i16> + ',` + %m = trunc <4 x i32> %2 to <4 x i1> + %resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old + ') + store <4 x i16> %resultvec, <4 x i16> * %0, align 2 + ret void +} +') + +define(`masked_store_blend_8_16_by_4_mask64', ` +define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>, + <4 x i64>) nounwind alwaysinline { + %old = load <4 x i8> * %0, align 1 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old32 = bitcast <4 x i8> %old to i32 + %new32 = bitcast <4 x i8> %1 to i32 + + %mask8 = trunc <4 x i64> %2 to <4 x i8> + %mask32 = bitcast <4 x i8> %mask8 to i32 + %notmask32 = xor i32 %mask32, -1 + + %newmasked = and i32 %new32, %mask32 + %oldmasked = and i32 %old32, %notmask32 + %result = or i32 %newmasked, %oldmasked + + %resultvec = bitcast i32 %result to <4 x i8> + ',` + %m = trunc <4 x i64> %2 to <4 x i1> + %resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old + ') + store <4 x i8> %resultvec, <4 x i8> * %0, align 1 + ret void +} + +define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>, + <4 x i64>) nounwind alwaysinline { + %old = load <4 x i16> * %0, align 2 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old64 = bitcast <4 x i16> %old to 
i64 + %new64 = bitcast <4 x i16> %1 to i64 + + %mask16 = trunc <4 x i64> %2 to <4 x i16> + %mask64 = bitcast <4 x i16> %mask16 to i64 + %notmask64 = xor i64 %mask64, -1 + + %newmasked = and i64 %new64, %mask64 + %oldmasked = and i64 %old64, %notmask64 + %result = or i64 %newmasked, %oldmasked + + %resultvec = bitcast i64 %result to <4 x i16> + ',` + %m = trunc <4 x i64> %2 to <4 x i1> + %resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old + ') + store <4 x i16> %resultvec, <4 x i16> * %0, align 2 + ret void +} +') + +define(`masked_store_blend_8_16_by_8', ` +define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>, + <8 x i32>) nounwind alwaysinline { + %old = load <8 x i8> * %0, align 1 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old64 = bitcast <8 x i8> %old to i64 + %new64 = bitcast <8 x i8> %1 to i64 + + %mask8 = trunc <8 x i32> %2 to <8 x i8> + %mask64 = bitcast <8 x i8> %mask8 to i64 + %notmask64 = xor i64 %mask64, -1 + + %newmasked = and i64 %new64, %mask64 + %oldmasked = and i64 %old64, %notmask64 + %result = or i64 %newmasked, %oldmasked + + %resultvec = bitcast i64 %result to <8 x i8> + ',` + %m = trunc <8 x i32> %2 to <8 x i1> + %resultvec = select <8 x i1> %m, <8 x i8> %1, <8 x i8> %old + ') + store <8 x i8> %resultvec, <8 x i8> * %0, align 1 + ret void +} + +define void @__masked_store_blend_i16(<8 x i16>* nocapture, <8 x i16>, + <8 x i32>) nounwind alwaysinline { + %old = load <8 x i16> * %0, align 2 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old128 = bitcast <8 x i16> %old to i128 + %new128 = bitcast <8 x i16> %1 to i128 + + %mask16 = trunc <8 x i32> %2 to <8 x i16> + %mask128 = bitcast <8 x i16> %mask16 to i128 + %notmask128 = xor i128 %mask128, -1 + + %newmasked = and i128 %new128, %mask128 + %oldmasked = and i128 %old128, %notmask128 + %result = or i128 %newmasked, %oldmasked + + %resultvec = bitcast i128 %result to <8 x i16> + ',` + %m = trunc <8 x i32> %2 to <8 x i1> + %resultvec = select <8 x i1> %m, <8 x i16> %1, <8 x i16> %old + ') + store <8 x i16> %resultvec, <8 x i16> * %0, align 2 + ret void +} +') + + +define(`masked_store_blend_8_16_by_16', ` +define void @__masked_store_blend_i8(<16 x i8>* nocapture, <16 x i8>, + <16 x i32>) nounwind alwaysinline { + %old = load <16 x i8> * %0, align 1 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old128 = bitcast <16 x i8> %old to i128 + %new128 = bitcast <16 x i8> %1 to i128 + + %mask8 = trunc <16 x i32> %2 to <16 x i8> + %mask128 = bitcast <16 x i8> %mask8 to i128 + %notmask128 = xor i128 %mask128, -1 + + %newmasked = and i128 %new128, %mask128 + %oldmasked = and i128 %old128, %notmask128 + %result = or i128 %newmasked, %oldmasked + + %resultvec = bitcast i128 %result to <16 x i8> + ',` + %m = trunc <16 x i32> %2 to <16 x i1> + %resultvec = select <16 x i1> %m, <16 x i8> %1, <16 x i8> %old + ') + store <16 x i8> %resultvec, <16 x i8> * %0, align 1 + ret void +} + +define void @__masked_store_blend_i16(<16 x i16>* nocapture, <16 x i16>, + <16 x i32>) nounwind alwaysinline { + %old = load <16 x i16> * %0, align 2 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old256 = bitcast <16 x i16> %old to i256 + %new256 = bitcast <16 x i16> %1 to i256 + + %mask16 = trunc <16 x i32> %2 to <16 x i16> + %mask256 = bitcast <16 x i16> %mask16 to i256 + %notmask256 = xor i256 %mask256, -1 + + %newmasked = and i256 %new256, %mask256 + %oldmasked = and i256 %old256, %notmask256 + %result = or i256 %newmasked, %oldmasked + + %resultvec = bitcast i256 %result to <16 x i16> + ',` + %m = trunc <16 x i32> %2 to <16 x i1> + %resultvec = select <16 x i1> %m, <16 x 
i16> %1, <16 x i16> %old + ') + store <16 x i16> %resultvec, <16 x i16> * %0, align 2 + ret void +} +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; packed load and store functions +;; +;; These define functions to emulate those nice packed load and packed store +;; instructions. For packed store, given a pointer to destination array and +;; an offset into the array, for each lane where the mask is on, the +;; corresponding value for that lane is stored into packed locations in the +;; destination array. For packed load, each lane that has an active mask +;; loads a sequential value from the array. +;; +;; $1: vector width of the target +;; +;; FIXME: use the per_lane macro, defined below, to implement these! + +define(`packed_load_and_store', ` + +define i32 @__packed_load_active(i32 * %startptr, <1 x i32> * %val_ptr, + <1 x i1> %full_mask) nounwind alwaysinline { +entry: + %active = extractelement <1 x i1> %full_mask, i32 0 + %call = tail call i64 @__warpBinExclusiveScan(i1 zeroext %active) + %res.sroa.0.0.extract.trunc = trunc i64 %call to i32 + br i1 %active, label %if.then, label %if.end + +if.then: ; preds = %entry + %idxprom = ashr i64 %call, 32 + %arrayidx = getelementptr inbounds i32* %startptr, i64 %idxprom + %val = load i32* %arrayidx, align 4 + %valvec = insertelement <1 x i32> undef, i32 %val, i32 0 + store <1 x i32> %valvec, <1 x i32>* %val_ptr, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret i32 %res.sroa.0.0.extract.trunc +} + +define i32 @__packed_store_active(i32 * %startptr, %vals, + %full_mask) nounwind alwaysinline +{ +entry: + %active = extractelement <1 x i1> %full_mask, i32 0 + %call = tail call i64 @__warpBinExclusiveScan(i1 zeroext %active) + %res.sroa.0.0.extract.trunc = trunc i64 %call to i32 + br i1 %active, label %if.then, label %if.end + +if.then: ; preds = %entry + %idxprom = ashr i64 %call, 32 + %arrayidx = getelementptr inbounds i32* %startptr, i64 %idxprom + %val = extractelement <1 x i32> %vals, i32 0 + store i32 %val, i32* %arrayidx, align 4 + br label %if.end + +if.end: ; preds = %if.then, %entry + ret i32 %res.sroa.0.0.extract.trunc +} + +define i32 @__packed_store_active2(i32 * %startptr, <1 x i32> %vals, + <1 x i1> %full_mask) nounwind alwaysinline +{ + %ret = call i32 @__packed_store_active(i32* %startptr, + <1 x i32> %vals, <1 x i1> %full_mask); + ret i32 %ret +} +') + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reduce_equal + +;; count leading/trailing zeros +;; Macros declares set of count-trailing and count-leading zeros. +;; Macros behaves as a static functon - it works only at first invokation +;; to avoid redifinition. 
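+;;
+;; As a small illustration (assuming a target .ll file that has pulled in this
+;; utility code), it is safe for that file to expand the macro twice:
+;;
+;;     declare_count_zeros()   ;; emits the llvm.ctlz/llvm.cttz declarations
+;;     declare_count_zeros()   ;; expands to nothing
+;;
+;; because the first expansion also sets the count_zeros_are_defined flag that
+;; the guard below checks.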
+define(`declare_count_zeros', ` +ifelse(count_zeros_are_defined, true, `', +` +declare i32 @llvm.ctlz.i32(i32) +declare i64 @llvm.ctlz.i64(i64) +declare i32 @llvm.cttz.i32(i32) +declare i64 @llvm.cttz.i64(i64) + +define(`count_zeros_are_defined', true) +') + +') + +define(`reduce_equal_aux', ` +declare_count_zeros() + +define i1 @__reduce_equal_$3(<$1 x $2> %v, $2 * %samevalue, + <$1 x MASK> %mask) nounwind alwaysinline { +entry: + %mm = call i64 @__movmsk(<$1 x MASK> %mask) + %allon = icmp eq i64 %mm, ALL_ON_MASK + br i1 %allon, label %check_neighbors, label %domixed + +domixed: + ; First, figure out which lane is the first active one + %first = call i64 @llvm.cttz.i64(i64 %mm) + %first32 = trunc i64 %first to i32 + %baseval = extractelement <$1 x $2> %v, i32 %first32 + %basev1 = insertelement <$1 x $2> undef, $2 %baseval, i32 0 + ; get a vector that is that value smeared across all elements + %basesmear = shufflevector <$1 x $2> %basev1, <$1 x $2> undef, + <$1 x i32> < forloop(i, 0, eval($1-2), `i32 0, ') i32 0 > + + ; now to a blend of that vector with the original vector, such that the + ; result will be the original value for the active lanes, and the value + ; from the first active lane for the inactive lanes. Given that, we can + ; just unconditionally check if the lanes are all equal in check_neighbors + ; below without worrying about inactive lanes... + %ptr = alloca <$1 x $2> + store <$1 x $2> %basesmear, <$1 x $2> * %ptr + %castptr = bitcast <$1 x $2> * %ptr to <$1 x $4> * + %castv = bitcast <$1 x $2> %v to <$1 x $4> + call void @__masked_store_blend_i$6(<$1 x $4> * %castptr, <$1 x $4> %castv, <$1 x MASK> %mask) + %blendvec = load <$1 x $2> * %ptr + br label %check_neighbors + +check_neighbors: + %vec = phi <$1 x $2> [ %blendvec, %domixed ], [ %v, %entry ] + ifelse($6, `32', ` + ; For 32-bit elements, we rotate once and compare with the vector, which ends + ; up comparing each element to its neighbor on the right. Then see if + ; all of those values are true; if so, then all of the elements are equal.. + %castvec = bitcast <$1 x $2> %vec to <$1 x $4> + %castvr = call <$1 x $4> @__rotate_i$6(<$1 x $4> %castvec, i32 1) + %vr = bitcast <$1 x $4> %castvr to <$1 x $2> + %eq = $5 $7 <$1 x $2> %vec, %vr + ifelse(MASK,i1, ` + %eqmm = call i64 @__movmsk(<$1 x MASK> %eq)', + `%eqm = sext <$1 x i1> %eq to <$1 x MASK> + %eqmm = call i64 @__movmsk(<$1 x MASK> %eqm)') + %alleq = icmp eq i64 %eqmm, ALL_ON_MASK + br i1 %alleq, label %all_equal, label %not_all_equal + ', ` + ; But for 64-bit elements, it turns out to be more efficient to just + ; scalarize and do a individual pairwise comparisons and AND those + ; all together.. 
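+  ; (sketch of the expansion below for an assumed 4-wide int64 case:
+  ;    %v0 .. %v3   extract the four lanes of %vec
+  ;    %eq0 .. %eq2 compare each neighboring pair with icmp eq
+  ;                 (fcmp oeq is used instead for double elements)
+  ;    %and0 = and i1 %eq0, %eq1
+  ;    %and1 = and i1 %and0, %eq2
+  ;    br i1 %and1, label %all_equal, label %not_all_equal )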
+ forloop(i, 0, eval($1-1), ` + %v`'i = extractelement <$1 x $2> %vec, i32 i') + + forloop(i, 0, eval($1-2), ` + %eq`'i = $5 $7 $2 %v`'i, %v`'eval(i+1)') + + %and0 = and i1 %eq0, %eq1 + forloop(i, 1, eval($1-3), ` + %and`'i = and i1 %and`'eval(i-1), %eq`'eval(i+1)') + + br i1 %and`'eval($1-3), label %all_equal, label %not_all_equal + ') + +all_equal: + %the_value = extractelement <$1 x $2> %vec, i32 0 + store $2 %the_value, $2 * %samevalue + ret i1 true + +not_all_equal: + ret i1 false +} +') + +define(`reduce_equal', ` +reduce_equal_aux($1, i32, int32, i32, icmp, 32, eq) +reduce_equal_aux($1, float, float, i32, fcmp, 32, oeq) +reduce_equal_aux($1, i64, int64, i64, icmp, 64, eq) +reduce_equal_aux($1, double, double, i64, fcmp, 64, oeq) +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; per_lane +;; +;; The scary macro below encapsulates the 'scalarization' idiom--i.e. we have +;; some operation that we'd like to perform only for the lanes where the +;; mask is on +;; $1: vector width of the target +;; $2: variable that holds the mask +;; $3: block of code to run for each lane that is on +;; Inside this code, any instances of the text "LANE" are replaced +;; with an i32 value that represents the current lane number + +; num lanes, mask, code block to do per lane +define(`per_lane', ` + br label %pl_entry + +pl_entry: + %pl_mask = call i64 @__movmsk($2) + %pl_mask_known = call i1 @__is_compile_time_constant_mask($2) + br i1 %pl_mask_known, label %pl_known_mask, label %pl_unknown_mask + +pl_known_mask: + ;; the mask is known at compile time; see if it is something we can + ;; handle more efficiently + %pl_is_allon = icmp eq i64 %pl_mask, ALL_ON_MASK + br i1 %pl_is_allon, label %pl_all_on, label %pl_unknown_mask + +pl_all_on: + ;; the mask is all on--just expand the code for each lane sequentially + forloop(i, 0, eval($1-1), + `patsubst(`$3', `LANE', i)') + br label %pl_done + +pl_unknown_mask: + ;; we just run the general case, though we could + ;; try to be smart and just emit the code based on what it actually is, + ;; for example by emitting the code straight-line without a loop and doing + ;; the lane tests explicitly, leaving later optimization passes to eliminate + ;; the stuff that is definitely not needed. Not clear if we will frequently + ;; encounter a mask that is known at compile-time but is not either all on or + ;; all off... + br label %pl_loop + +pl_loop: + ;; Loop over each lane and see if we want to do the work for this lane + %pl_lane = phi i32 [ 0, %pl_unknown_mask ], [ %pl_nextlane, %pl_loopend ] + %pl_lanemask = phi i64 [ 1, %pl_unknown_mask ], [ %pl_nextlanemask, %pl_loopend ] + + ; is the current lane on? if so, goto do work, otherwise to end of loop + %pl_and = and i64 %pl_mask, %pl_lanemask + %pl_doit = icmp eq i64 %pl_and, %pl_lanemask + br i1 %pl_doit, label %pl_dolane, label %pl_loopend + +pl_dolane: + ;; If so, substitute in the code from the caller and replace the LANE + ;; stuff with the current lane number + patsubst(`patsubst(`$3', `LANE_ID', `_id')', `LANE', `%pl_lane') + br label %pl_loopend + +pl_loopend: + %pl_nextlane = add i32 %pl_lane, 1 + %pl_nextlanemask = mul i64 %pl_lanemask, 2 + + ; are we done yet? 
+ %pl_test = icmp ne i32 %pl_nextlane, $1 + br i1 %pl_test, label %pl_loop, label %pl_done + +pl_done: +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather +;; +;; $1: scalar type for which to generate functions to do gathers + +define(`gen_gather_general', ` +; fully general 32-bit gather, takes array of pointers encoded as vector of i32s +define @__gather32_$1( %ptrs, + %vecmask) nounwind readonly alwaysinline { + %ret_ptr = alloca + per_lane(WIDTH, %vecmask, ` + %iptr_LANE_ID = extractelement %ptrs, i32 LANE + %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * + %val_LANE_ID = load $1 * %ptr_LANE_ID + %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE + store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID + ') + + %ret = load * %ret_ptr + ret %ret +} + +; fully general 64-bit gather, takes array of pointers encoded as vector of i32s +define @__gather64_$1( %ptrs, + %vecmask) nounwind readonly alwaysinline { + %ret_ptr = alloca + per_lane(WIDTH, %vecmask, ` + %iptr_LANE_ID = extractelement %ptrs, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * + %val_LANE_ID = load $1 * %ptr_LANE_ID + %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE + store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID + ') + + %ret = load * %ret_ptr + ret %ret +} +') + +; vec width, type +define(`gen_gather_factored', ` +;; Define the utility function to do the gather operation for a single element +;; of the type +define @__gather_elt32_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, %ret, + i32 %lane) nounwind readonly alwaysinline { + ; compute address for this one from the base + %offset32 = extractelement %offsets, i32 %lane + ; the order and details of the next 4 lines are important--they match LLVMs + ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations + %offset64 = sext i32 %offset32 to i64 + %scale64 = sext i32 %offset_scale to i64 + %offset = mul i64 %offset64, %scale64 + %ptroffset = getelementptr i8 * %ptr, i64 %offset + + %delta = extractelement %offset_delta, i32 %lane + %delta64 = sext i32 %delta to i64 + %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + + ; load value and insert into returned value + %ptrcast = bitcast i8 * %finalptr to $1 * + %val = load $1 *%ptrcast + %updatedret = insertelement %ret, $1 %val, i32 %lane + ret %updatedret +} + +define @__gather_elt64_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, %ret, + i32 %lane) nounwind readonly alwaysinline { + ; compute address for this one from the base + %offset64 = extractelement %offsets, i32 %lane + ; the order and details of the next 4 lines are important--they match LLVMs + ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations + %offset_scale64 = sext i32 %offset_scale to i64 + %offset = mul i64 %offset64, %offset_scale64 + %ptroffset = getelementptr i8 * %ptr, i64 %offset + + %delta64 = extractelement %offset_delta, i32 %lane + %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + + ; load value and insert into returned value + %ptrcast = bitcast i8 * %finalptr to $1 * + %val = load $1 *%ptrcast + %updatedret = insertelement %ret, $1 %val, i32 %lane + ret %updatedret +} + + +define @__gather_factored_base_offsets32_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, + %vecmask) nounwind readonly alwaysinline { + ; We can be clever and avoid the per-lane stuff for gathers if we are willing + ; to require that the 0th element of the array being gathered from is always + ; legal 
to read from (and we do indeed require that, given the benefits!) + ; + ; Set the offset to zero for lanes that are off + %offsetsPtr = alloca + store zeroinitializer, * %offsetsPtr + call void @__masked_store_blend_i32( * %offsetsPtr, %offsets, + %vecmask) + %newOffsets = load * %offsetsPtr + + %deltaPtr = alloca + store zeroinitializer, * %deltaPtr + call void @__masked_store_blend_i32( * %deltaPtr, %offset_delta, + %vecmask) + %newDelta = load * %deltaPtr + + %ret0 = call @__gather_elt32_$1(i8 * %ptr, %newOffsets, + i32 %offset_scale, %newDelta, + undef, i32 0) + forloop(lane, 1, eval(WIDTH-1), + `patsubst(patsubst(`%retLANE = call @__gather_elt32_$1(i8 * %ptr, + %newOffsets, i32 %offset_scale, %newDelta, + %retPREV, i32 LANE) + ', `LANE', lane), `PREV', eval(lane-1))') + ret %ret`'eval(WIDTH-1) +} + +define @__gather_factored_base_offsets64_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, + %vecmask) nounwind readonly alwaysinline { + ; We can be clever and avoid the per-lane stuff for gathers if we are willing + ; to require that the 0th element of the array being gathered from is always + ; legal to read from (and we do indeed require that, given the benefits!) + ; + ; Set the offset to zero for lanes that are off + %offsetsPtr = alloca + store zeroinitializer, * %offsetsPtr + call void @__masked_store_blend_i64( * %offsetsPtr, %offsets, + %vecmask) + %newOffsets = load * %offsetsPtr + + %deltaPtr = alloca + store zeroinitializer, * %deltaPtr + call void @__masked_store_blend_i64( * %deltaPtr, %offset_delta, + %vecmask) + %newDelta = load * %deltaPtr + + %ret0 = call @__gather_elt64_$1(i8 * %ptr, %newOffsets, + i32 %offset_scale, %newDelta, + undef, i32 0) + forloop(lane, 1, eval(WIDTH-1), + `patsubst(patsubst(`%retLANE = call @__gather_elt64_$1(i8 * %ptr, + %newOffsets, i32 %offset_scale, %newDelta, + %retPREV, i32 LANE) + ', `LANE', lane), `PREV', eval(lane-1))') + ret %ret`'eval(WIDTH-1) +} + +gen_gather_general($1) +' +) + +; vec width, type +define(`gen_gather', ` + +gen_gather_factored($1) + +define +@__gather_base_offsets32_$1(i8 * %ptr, i32 %offset_scale, + %offsets, + %vecmask) nounwind readonly alwaysinline { + %scale_vec = bitcast i32 %offset_scale to <1 x i32> + %smear_scale = shufflevector <1 x i32> %scale_vec, <1 x i32> undef, + < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > + %scaled_offsets = mul %smear_scale, %offsets + %v = call @__gather_factored_base_offsets32_$1(i8 * %ptr, %scaled_offsets, i32 1, + zeroinitializer, %vecmask) + ret %v +} + +define +@__gather_base_offsets64_$1(i8 * %ptr, i32 %offset_scale, + %offsets, + %vecmask) nounwind readonly alwaysinline { + %scale64 = zext i32 %offset_scale to i64 + %scale_vec = bitcast i64 %scale64 to <1 x i64> + %smear_scale = shufflevector <1 x i64> %scale_vec, <1 x i64> undef, + < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > + %scaled_offsets = mul %smear_scale, %offsets + %v = call @__gather_factored_base_offsets64_$1(i8 * %ptr, %scaled_offsets, + i32 1, zeroinitializer, %vecmask) + ret %v +} + +' +) + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gen_scatter +;; Emit a function declaration for a scalarized scatter. 
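The scatter emitters defined next use the same factored addressing as the gather helpers above: each active lane accesses base + sext(offset) * offset_scale + offset_delta. The factored gathers additionally blend the offsets and deltas of inactive lanes to zero so every lane can safely (re)read element 0, while the scatters simply skip inactive lanes via per_lane. A scalar C++ sketch of that gather addressing, with hypothetical names and an explicit width parameter (illustrative only, not the generated IR):

#include <cstdint>
#include <cstring>

// Scalar model of the factored-gather addressing: inactive lanes have their
// offset and delta forced to zero, so they harmlessly re-read the first
// element instead of needing a per-lane branch.
template <typename T>
static void gather_factored_base_offsets(const uint8_t *base,
                                         const int32_t *offsets, int32_t offset_scale,
                                         const int32_t *offset_delta,
                                         const bool *mask, T *result, int width) {
    for (int lane = 0; lane < width; ++lane) {
        int64_t off = mask[lane] ? offsets[lane] : 0;
        int64_t dlt = mask[lane] ? offset_delta[lane] : 0;
        const uint8_t *addr = base + off * (int64_t)offset_scale + dlt;
        std::memcpy(&result[lane], addr, sizeof(T));  // per-lane load
    }
}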
+;; +;; $1: scalar type for which we want to generate code to scatter + +define(`gen_scatter', ` +;; Define the function that descripes the work to do to scatter a single +;; value +define void @__scatter_elt32_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, %values, + i32 %lane) nounwind alwaysinline { + %offset32 = extractelement %offsets, i32 %lane + ; the order and details of the next 4 lines are important--they match LLVMs + ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations + %offset64 = sext i32 %offset32 to i64 + %scale64 = sext i32 %offset_scale to i64 + %offset = mul i64 %offset64, %scale64 + %ptroffset = getelementptr i8 * %ptr, i64 %offset + + %delta = extractelement %offset_delta, i32 %lane + %delta64 = sext i32 %delta to i64 + %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + + %ptrcast = bitcast i8 * %finalptr to $1 * + %storeval = extractelement %values, i32 %lane + store $1 %storeval, $1 * %ptrcast + ret void +} + +define void @__scatter_elt64_$1(i8 * %ptr, %offsets, i32 %offset_scale, + %offset_delta, %values, + i32 %lane) nounwind alwaysinline { + %offset64 = extractelement %offsets, i32 %lane + ; the order and details of the next 4 lines are important--they match LLVMs + ; patterns that apply the free x86 2x/4x/8x scaling in addressing calculations + %scale64 = sext i32 %offset_scale to i64 + %offset = mul i64 %offset64, %scale64 + %ptroffset = getelementptr i8 * %ptr, i64 %offset + + %delta64 = extractelement %offset_delta, i32 %lane + %finalptr = getelementptr i8 * %ptroffset, i64 %delta64 + + %ptrcast = bitcast i8 * %finalptr to $1 * + %storeval = extractelement %values, i32 %lane + store $1 %storeval, $1 * %ptrcast + ret void +} + +define void @__scatter_factored_base_offsets32_$1(i8* %base, %offsets, i32 %offset_scale, + %offset_delta, %values, + %mask) nounwind alwaysinline { + ;; And use the `per_lane' macro to do all of the per-lane work for scatter... + per_lane(WIDTH, %mask, ` + call void @__scatter_elt32_$1(i8 * %base, %offsets, i32 %offset_scale, + %offset_delta, %values, i32 LANE)') + ret void +} + +define void @__scatter_factored_base_offsets64_$1(i8* %base, %offsets, i32 %offset_scale, + %offset_delta, %values, + %mask) nounwind alwaysinline { + ;; And use the `per_lane' macro to do all of the per-lane work for scatter... 
+ per_lane(WIDTH, %mask, ` + call void @__scatter_elt64_$1(i8 * %base, %offsets, i32 %offset_scale, + %offset_delta, %values, i32 LANE)') + ret void +} + +; fully general 32-bit scatter, takes array of pointers encoded as vector of i32s +define void @__scatter32_$1( %ptrs, %values, + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` + %iptr_LANE_ID = extractelement %ptrs, i32 LANE + %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * + %val_LANE_ID = extractelement %values, i32 LANE + store $1 %val_LANE_ID, $1 * %ptr_LANE_ID + ') + ret void +} + +; fully general 64-bit scatter, takes array of pointers encoded as vector of i64s +define void @__scatter64_$1( %ptrs, %values, + %mask) nounwind alwaysinline { + per_lane(WIDTH, %mask, ` + %iptr_LANE_ID = extractelement %ptrs, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * + %val_LANE_ID = extractelement %values, i32 LANE + store $1 %val_LANE_ID, $1 * %ptr_LANE_ID + ') + ret void +} + +' +) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rdrand + +define(`rdrand_decls', ` +declare i1 @__rdrand_i16(i16 * nocapture) +declare i1 @__rdrand_i32(i32 * nocapture) +declare i1 @__rdrand_i64(i64 * nocapture) +') + +define(`rdrand_definition', ` +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rdrand + +declare {i16, i32} @llvm.x86.rdrand.16() +declare {i32, i32} @llvm.x86.rdrand.32() +declare {i64, i32} @llvm.x86.rdrand.64() + +define i1 @__rdrand_i16(i16 * %ptr) { + %v = call {i16, i32} @llvm.x86.rdrand.16() + %v0 = extractvalue {i16, i32} %v, 0 + %v1 = extractvalue {i16, i32} %v, 1 + store i16 %v0, i16 * %ptr + %good = icmp ne i32 %v1, 0 + ret i1 %good +} + +define i1 @__rdrand_i32(i32 * %ptr) { + %v = call {i32, i32} @llvm.x86.rdrand.32() + %v0 = extractvalue {i32, i32} %v, 0 + %v1 = extractvalue {i32, i32} %v, 1 + store i32 %v0, i32 * %ptr + %good = icmp ne i32 %v1, 0 + ret i1 %good +} + +define i1 @__rdrand_i64(i64 * %ptr) { + %v = call {i64, i32} @llvm.x86.rdrand.64() + %v0 = extractvalue {i64, i32} %v, 0 + %v1 = extractvalue {i64, i32} %v, 1 + store i64 %v0, i64 * %ptr + %good = icmp ne i32 %v1, 0 + ret i1 %good +} +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int8/int16 builtins + +define(`define_avg_up_uint8', ` +define @__avg_up_uint8(, ) { + %a16 = zext %0 to + %b16 = zext %1 to + %sum1 = add %a16, %b16 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_int8', ` +define @__avg_up_int8(, ) { + %a16 = sext %0 to + %b16 = sext %1 to + %sum1 = add %a16, %b16 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_uint16', ` +define @__avg_up_uint16(, ) { + %a32 = zext %0 to + %b32 = zext %1 to + %sum1 = add %a32, %b32 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_up_int16', ` +define @__avg_up_int16(, ) { + %a32 = sext %0 to + %b32 = sext %1 to + %sum1 = add %a32, %b32 + %sum = add %sum1, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_uint8', ` +define 
@__avg_down_uint8(, ) { + %a16 = zext %0 to + %b16 = zext %1 to + %sum = add %a16, %b16 + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i16 1, ') i16 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_int8', ` +define @__avg_down_int8(, ) { + %a16 = sext %0 to + %b16 = sext %1 to + %sum = add %a16, %b16 + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i16 2, ') i16 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_uint16', ` +define @__avg_down_uint16(, ) { + %a32 = zext %0 to + %b32 = zext %1 to + %sum = add %a32, %b32 + %avg = lshr %sum, < forloop(i, 1, eval(WIDTH-1), `i32 1, ') i32 1 > + %r = trunc %avg to + ret %r +}') + +define(`define_avg_down_int16', ` +define @__avg_down_int16(, ) { + %a32 = sext %0 to + %b32 = sext %1 to + %sum = add %a32, %b32 + %avg = sdiv %sum, < forloop(i, 1, eval(WIDTH-1), `i32 2, ') i32 2 > + %r = trunc %avg to + ret %r +}') + +define(`define_up_avgs', ` +define_avg_up_uint8() +define_avg_up_int8() +define_avg_up_uint16() +define_avg_up_int16() +') + +define(`define_down_avgs', ` +define_avg_down_uint8() +define_avg_down_int8() +define_avg_down_uint16() +define_avg_down_int16() +') + +define(`define_avgs', ` +define_up_avgs() +define_down_avgs() +') diff --git a/builtins/util.m4 b/builtins/util.m4 index fbd929a1..7f08adb3 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -4541,3 +4541,60 @@ define(`rcpd_decl', ` declare double @__rcp_uniform_double(double) declare @__rcp_varying_double() ') + +define(`declare_nvptx', +` +declare i32 @__program_index() nounwind readnone alwaysinline +declare i32 @__program_count() nounwind readnone alwaysinline +declare i32 @__warp_index() nounwind readnone alwaysinline +declare i32 @__task_index0() nounwind readnone alwaysinline +declare i32 @__task_index1() nounwind readnone alwaysinline +declare i32 @__task_index2() nounwind readnone alwaysinline +declare i32 @__task_index() nounwind readnone alwaysinline +declare i32 @__task_count0() nounwind readnone alwaysinline +declare i32 @__task_count1() nounwind readnone alwaysinline +declare i32 @__task_count2() nounwind readnone alwaysinline +declare i32 @__task_count() nounwind readnone alwaysinline +declare i64* @__cvt_loc2gen(i64 addrspace(3)*) nounwind readnone alwaysinline +declare i64* @__cvt_const2gen(i64 addrspace(4)*) nounwind readnone alwaysinline +declare i64* @__cvt_loc2gen_var(i64 addrspace(3)*) nounwind readnone alwaysinline +declare i64 @__movmsk_ptx() nounwind readnone alwaysinline; +') + +define(`global_atomic_varying',` +declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline +') + +define(`global_atomic_cas_varying',` +declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %cmp, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline +') + +global_atomic_cas_varying(WIDTH, compare_exchange, i32, int32) +global_atomic_cas_varying(WIDTH, compare_exchange, i64, int64) +global_atomic_cas_varying(WIDTH, compare_exchange, float, float) +global_atomic_cas_varying(WIDTH, compare_exchange, double, double) + +global_atomic_varying(WIDTH, swap, i32, int32) +global_atomic_varying(WIDTH, swap, i64, int64) +global_atomic_varying(WIDTH, swap, float, float) +global_atomic_varying(WIDTH, swap, double, double) + +global_atomic_varying(WIDTH, add, i32, int32) +global_atomic_varying(WIDTH, sub, i32, int32) +global_atomic_varying(WIDTH, and, i32, int32) +global_atomic_varying(WIDTH, or, i32, int32) +global_atomic_varying(WIDTH, xor, i32, int32) 
+global_atomic_varying(WIDTH, min, i32, int32) +global_atomic_varying(WIDTH, max, i32, int32) +global_atomic_varying(WIDTH, umin, i32, uint32) +global_atomic_varying(WIDTH, umax, i32, uint32) + +global_atomic_varying(WIDTH, add, i64, int64) +global_atomic_varying(WIDTH, sub, i64, int64) +global_atomic_varying(WIDTH, and, i64, int64) +global_atomic_varying(WIDTH, or, i64, int64) +global_atomic_varying(WIDTH, xor, i64, int64) +global_atomic_varying(WIDTH, min, i64, int64) +global_atomic_varying(WIDTH, max, i64, int64) +global_atomic_varying(WIDTH, umin, i64, uint64) +global_atomic_varying(WIDTH, umax, i64, uint64) diff --git a/ctx.cpp b/ctx.cpp index 6ff26c6a..1097a422 100644 --- a/ctx.cpp +++ b/ctx.cpp @@ -57,6 +57,8 @@ #include #include #endif +#include +#include /** This is a small utility structure that records information related to one level of nested control flow. It's mostly used in correctly restoring @@ -1371,29 +1373,97 @@ FunctionEmitContext::None(llvm::Value *mask) { llvm::Value * -FunctionEmitContext::LaneMask(llvm::Value *v) { - // Call the target-dependent movmsk function to turn the vector mask - // into an i64 value - std::vector mm; - m->symbolTable->LookupFunction("__movmsk", &mm); - if (g->target->getMaskBitCount() == 1) - AssertPos(currentPos, mm.size() == 1); - else - // There should be one with signed int signature, one unsigned int. - AssertPos(currentPos, mm.size() == 2); - // We can actually call either one, since both are i32s as far as - // LLVM's type system is concerned... - llvm::Function *fmm = mm[0]->function; - return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk")); +FunctionEmitContext::LaneMask(llvm::Value *v) +{ +#if 1 /* this makes mandelbrot example slower, why ?!? */ + const char *__movmsk = g->target->getISA() == Target::NVPTX ? "__movmsk_ptx" : "__movmsk"; +#else + const char *__movmsk = "__movmsk"; +#endif + // Call the target-dependent movmsk function to turn the vector mask + // into an i64 value + std::vector mm; + m->symbolTable->LookupFunction(__movmsk, &mm); + if (g->target->getMaskBitCount() == 1) + AssertPos(currentPos, mm.size() == 1); + else + // There should be one with signed int signature, one unsigned int. + AssertPos(currentPos, mm.size() == 2); + // We can actually call either one, since both are i32s as far as + // LLVM's type system is concerned... 
+ llvm::Function *fmm = mm[0]->function; + return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk")); +} + +bool lAppendInsertExtractName(llvm::Value *vector, std::string &funcName) +{ + llvm::Type *type = vector->getType(); + if (type == LLVMTypes::Int8VectorType) + funcName += "_int8"; + else if (type == LLVMTypes::Int16VectorType) + funcName += "_int16"; + else if (type == LLVMTypes::Int32VectorType) + funcName += "_int32"; + else if (type == LLVMTypes::Int64VectorType) + funcName += "_int64"; + else if (type == LLVMTypes::FloatVectorType) + funcName += "_float"; + else if (type == LLVMTypes::DoubleVectorType) + funcName += "_double"; + else + return false; + return true; +} + +llvm::Value* +FunctionEmitContext::Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar) +{ + std::string funcName = "__insert"; + assert(lAppendInsertExtractName(vector, funcName)); + assert(lane->getType() == LLVMTypes::Int32Type); + + llvm::Function *func = m->module->getFunction(funcName.c_str()); + assert(func != NULL); + std::vector args; + args.push_back(vector); + args.push_back(lane); + args.push_back(scalar); + llvm::Value *ret = llvm::CallInst::Create(func, args, LLVMGetName(vector, funcName.c_str()), GetCurrentBasicBlock()); + return ret; +} + +llvm::Value* +FunctionEmitContext::Extract(llvm::Value *vector, llvm::Value *lane) +{ + std::string funcName = "__extract"; + assert(lAppendInsertExtractName(vector, funcName)); + assert(lane->getType() == LLVMTypes::Int32Type); + + llvm::Function *func = m->module->getFunction(funcName.c_str()); + assert(func != NULL); + std::vector args; + args.push_back(vector); + args.push_back(lane); + llvm::Value *ret = llvm::CallInst::Create(func, args, LLVMGetName(vector, funcName.c_str()), GetCurrentBasicBlock()); + return ret; } llvm::Value * FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) { + if (g->target->getISA() == Target::NVPTX) + { + // Compare the two masks to get a vector of i1s + llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, + v1, v2, "v1==v2"); + return ExtractInst(cmp, 0); /* this works without calling All(..) in PTX. Why ?!? */ + } + else + { #if 0 // Compare the two masks to get a vector of i1s llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, - v1, v2, "v1==v2"); + v1, v2, "v1==v2"); // Turn that into a bool vector type (often i32s) cmp = I1VecToBoolVec(cmp); // And see if it's all on @@ -1402,22 +1472,34 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) { llvm::Value *mm1 = LaneMask(v1); llvm::Value *mm2 = LaneMask(v2); return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mm1, mm2, - LLVMGetName("equal", v1, v2)); + LLVMGetName("equal", v1, v2)); #endif + } } llvm::Value * FunctionEmitContext::ProgramIndexVector(bool is32bits) { llvm::SmallVector array; for (int i = 0; i < g->target->getVectorWidth() ; ++i) { - llvm::Constant *C = is32bits ? LLVMInt32(i) : LLVMInt64(i); - array.push_back(C); + llvm::Constant *C = is32bits ? 
LLVMInt32(i) : LLVMInt64(i); + array.push_back(C); } llvm::Constant* index = llvm::ConstantVector::get(array); return index; } +llvm::Value * +FunctionEmitContext::ProgramIndexVectorPTX(bool is32bits) { + llvm::Function *func_program_index = m->module->getFunction("__program_index"); + llvm::Value *__program_index = CallInst(func_program_index, NULL, std::vector(), "foreach__program_indexS"); + llvm::Value *index = InsertInst(llvm::UndefValue::get(LLVMTypes::Int32VectorType), __program_index, 0, "foreach__program_indexV"); +#if 0 + if (!is32bits) + index = ZExtInst(index, LLVMTypes::Int64VectandType); +#endif + return index; +} llvm::Value * @@ -1830,6 +1912,7 @@ FunctionEmitContext::PtrToIntInst(llvm::Value *value, const char *name) { if (name == NULL) name = LLVMGetName(value, "_ptr2int"); + llvm::Type *type = LLVMTypes::PointerIntType; llvm::Instruction *inst = new llvm::PtrToIntInst(value, type, name, bblock); AddDebugPos(inst); @@ -3523,98 +3606,199 @@ llvm::Value * FunctionEmitContext::LaunchInst(llvm::Value *callee, std::vector &argVals, llvm::Value *launchCount[3]){ - if (callee == NULL) { + + if (g->target->getISA() != Target::NVPTX) + { + if (callee == NULL) { AssertPos(currentPos, m->errorCount > 0); return NULL; - } + } - launchedTasks = true; + launchedTasks = true; - AssertPos(currentPos, llvm::isa(callee)); - llvm::Type *argType = + AssertPos(currentPos, llvm::isa(callee)); + llvm::Type *argType = (llvm::dyn_cast(callee))->arg_begin()->getType(); - AssertPos(currentPos, llvm::PointerType::classof(argType)); - llvm::PointerType *pt = + AssertPos(currentPos, llvm::PointerType::classof(argType)); + llvm::PointerType *pt = llvm::dyn_cast(argType); - AssertPos(currentPos, llvm::StructType::classof(pt->getElementType())); - llvm::StructType *argStructType = + AssertPos(currentPos, llvm::StructType::classof(pt->getElementType())); + llvm::StructType *argStructType = static_cast(pt->getElementType()); - llvm::Function *falloc = m->module->getFunction("ISPCAlloc"); - AssertPos(currentPos, falloc != NULL); - llvm::Value *structSize = g->target->SizeOf(argStructType, bblock); - if (structSize->getType() != LLVMTypes::Int64Type) + llvm::Function *falloc = m->module->getFunction("ISPCAlloc"); + AssertPos(currentPos, falloc != NULL); + llvm::Value *structSize = g->target->SizeOf(argStructType, bblock); + if (structSize->getType() != LLVMTypes::Int64Type) // ISPCAlloc expects the size as an uint64_t, but on 32-bit // targets, SizeOf returns a 32-bit value structSize = ZExtInst(structSize, LLVMTypes::Int64Type, - "struct_size_to_64"); - int align = 4 * RoundUpPow2(g->target->getNativeVectorWidth()); + "struct_size_to_64"); + int align = 4 * RoundUpPow2(g->target->getNativeVectorWidth()); - std::vector allocArgs; - allocArgs.push_back(launchGroupHandlePtr); - allocArgs.push_back(structSize); - allocArgs.push_back(LLVMInt32(align)); - llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr"); - llvm::Value *argmem = BitCastInst(voidmem, pt); + std::vector allocArgs; + allocArgs.push_back(launchGroupHandlePtr); + allocArgs.push_back(structSize); + allocArgs.push_back(LLVMInt32(align)); + llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr"); + llvm::Value *argmem = BitCastInst(voidmem, pt); - // Copy the values of the parameters into the appropriate place in - // the argument block - for (unsigned int i = 0; i < argVals.size(); ++i) { + // Copy the values of the parameters into the appropriate place in + // the argument block + for (unsigned int i = 0; i < 
argVals.size(); ++i) { llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg"); // don't need to do masked store here, I think StoreInst(argVals[i], ptr); - } + } - if (argStructType->getNumElements() == argVals.size() + 1) { + if (argStructType->getNumElements() == argVals.size() + 1) { // copy in the mask llvm::Value *mask = GetFullMask(); llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL, - "funarg_mask"); + "funarg_mask"); StoreInst(mask, ptr); - } + } - // And emit the call to the user-supplied task launch function, passing - // a pointer to the task function being called and a pointer to the - // argument block we just filled in - llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType); - llvm::Function *flaunch = m->module->getFunction("ISPCLaunch"); - AssertPos(currentPos, flaunch != NULL); - std::vector args; - args.push_back(launchGroupHandlePtr); - args.push_back(fptr); - args.push_back(voidmem); - args.push_back(launchCount[0]); - args.push_back(launchCount[1]); - args.push_back(launchCount[2]); - return CallInst(flaunch, NULL, args, ""); + // And emit the call to the user-supplied task launch function, passing + // a pointer to the task function being called and a pointer to the + // argument block we just filled in + llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType); + llvm::Function *flaunch = m->module->getFunction("ISPCLaunch"); + AssertPos(currentPos, flaunch != NULL); + std::vector args; + args.push_back(launchGroupHandlePtr); + args.push_back(fptr); + args.push_back(voidmem); + args.push_back(launchCount[0]); + args.push_back(launchCount[1]); + args.push_back(launchCount[2]); + return CallInst(flaunch, NULL, args, ""); + } + else /* NVPTX */ + { + if (callee == NULL) { + AssertPos(currentPos, m->errorCount > 0); + return NULL; + } + launchedTasks = true; + + AssertPos(currentPos, llvm::isa(callee)); + std::vector argTypes; + + llvm::Function *F = llvm::dyn_cast(callee); + const unsigned int nArgs = F->arg_size(); + llvm::Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + for (; I != E; ++I) + argTypes.push_back(I->getType()); + llvm::Type *st = llvm::StructType::get(*g->ctx, argTypes); + llvm::StructType *argStructType = static_cast(st); + llvm::Value *structSize = g->target->SizeOf(argStructType, bblock); + if (structSize->getType() != LLVMTypes::Int64Type) + structSize = ZExtInst(structSize, LLVMTypes::Int64Type, + "struct_size_to_64"); + + const int align = 8; + llvm::Function *falloc = m->module->getFunction("ISPCAlloc"); + AssertPos(currentPos, falloc != NULL); + std::vector allocArgs; + allocArgs.push_back(launchGroupHandlePtr); + allocArgs.push_back(structSize); + allocArgs.push_back(LLVMInt32(align)); + llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr"); + llvm::Value *voidi64 = PtrToIntInst(voidmem, "args_i64"); + llvm::BasicBlock* if_true = CreateBasicBlock("if_true"); + llvm::BasicBlock* if_false = CreateBasicBlock("if_false"); + + /* check if the pointer returned by ISPCAlloc is not NULL + * -------------- + * this is a workaround for not checking the value of programIndex + * because ISPCAlloc will return NULL pointer for all programIndex > 0 + * of course, if ISPAlloc fails to get parameter buffer, the pointer for programIndex = 0 + * will also be NULL + * This check must be added, and also rewrite the code to make it less opaque + */ + llvm::Value* cmp1 = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, voidi64, LLVMInt64(0), "cmp1"); + BranchInst(if_true, 
if_false, cmp1); + + /**********************/ + bblock = if_true; + + // label_if_then block: + llvm::Type *pt = llvm::PointerType::getUnqual(st); + llvm::Value *argmem = BitCastInst(voidmem, pt); + for (unsigned int i = 0; i < argVals.size(); ++i) + { + llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg"); + // don't need to do masked store here, I think + StoreInst(argVals[i], ptr); + } + if (nArgs == argVals.size() + 1) { + // copy in the mask + llvm::Value *mask = GetFullMask(); + llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL, + "funarg_mask"); + StoreInst(mask, ptr); + } + BranchInst(if_false); + + /**********************/ + bblock = if_false; + + llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType); + llvm::Function *flaunch = m->module->getFunction("ISPCLaunch"); + AssertPos(currentPos, flaunch != NULL); + std::vector args; + args.push_back(launchGroupHandlePtr); + args.push_back(fptr); + args.push_back(voidmem); + args.push_back(launchCount[0]); + args.push_back(launchCount[1]); + args.push_back(launchCount[2]); + llvm::Value *ret = CallInst(flaunch, NULL, args, ""); + return ret; + } } void FunctionEmitContext::SyncInst() { - llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr); - llvm::Value *nullPtrValue = + if (g->target->getISA() != Target::NVPTX) + { + llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr); + llvm::Value *nullPtrValue = llvm::Constant::getNullValue(LLVMTypes::VoidPointerType); - llvm::Value *nonNull = CmpInst(llvm::Instruction::ICmp, - llvm::CmpInst::ICMP_NE, - launchGroupHandle, nullPtrValue); - llvm::BasicBlock *bSync = CreateBasicBlock("call_sync"); - llvm::BasicBlock *bPostSync = CreateBasicBlock("post_sync"); - BranchInst(bSync, bPostSync, nonNull); + llvm::Value *nonNull = CmpInst(llvm::Instruction::ICmp, + llvm::CmpInst::ICMP_NE, + launchGroupHandle, nullPtrValue); + llvm::BasicBlock *bSync = CreateBasicBlock("call_sync"); + llvm::BasicBlock *bPostSync = CreateBasicBlock("post_sync"); + BranchInst(bSync, bPostSync, nonNull); - SetCurrentBasicBlock(bSync); - llvm::Function *fsync = m->module->getFunction("ISPCSync"); - if (fsync == NULL) + SetCurrentBasicBlock(bSync); + llvm::Function *fsync = m->module->getFunction("ISPCSync"); + if (fsync == NULL) FATAL("Couldn't find ISPCSync declaration?!"); - CallInst(fsync, NULL, launchGroupHandle, ""); + CallInst(fsync, NULL, launchGroupHandle, ""); - // zero out the handle so that if ISPCLaunch is called again in this - // function, it knows it's starting out from scratch - StoreInst(nullPtrValue, launchGroupHandlePtr); + // zero out the handle so that if ISPCLaunch is called again in this + // function, it knows it's starting out from scratch + StoreInst(nullPtrValue, launchGroupHandlePtr); - BranchInst(bPostSync); + BranchInst(bPostSync); - SetCurrentBasicBlock(bPostSync); + SetCurrentBasicBlock(bPostSync); + } + else /* NVPTX: don't do test, just call sync */ + { + llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr); + llvm::Value *nullPtrValue = + llvm::Constant::getNullValue(LLVMTypes::VoidPointerType); + llvm::Function *fsync = m->module->getFunction("ISPCSync"); + if (fsync == NULL) + FATAL("Couldn't find ISPCSync declaration?!"); + CallInst(fsync, NULL, launchGroupHandle, ""); + StoreInst(nullPtrValue, launchGroupHandlePtr); + } } diff --git a/ctx.h b/ctx.h index 4dd30053..57160c17 100644 --- a/ctx.h +++ b/ctx.h @@ -291,6 +291,13 @@ public: of the mask is on. 
*/ llvm::Value *LaneMask(llvm::Value *mask); + + /** Issues a call to __insert_int8/int16/int32/int64/float/double */ + llvm::Value* Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar); + /** Issues a call to __extract_int8/int16/int32/int64/float/double */ + llvm::Value* Extract(llvm::Value *vector, llvm::Value *lane); + + /** Given two masks of type LLVMTypes::MaskType, return an i1 value that indicates whether the two masks are equal. */ llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2); @@ -298,6 +305,7 @@ public: /** Generate ConstantVector, which contains ProgramIndex, i.e. < i32 0, i32 1, i32 2, i32 3> */ llvm::Value *ProgramIndexVector(bool is32bits = true); + llvm::Value *ProgramIndexVectorPTX(bool is32bits = true); /** Given a string, create an anonymous global variable to hold its value and return the pointer to the string. */ diff --git a/decl.cpp b/decl.cpp index 8a10543b..27a6d580 100644 --- a/decl.cpp +++ b/decl.cpp @@ -168,6 +168,13 @@ DeclSpecs::GetBaseType(SourcePos pos) const { retType = lApplyTypeQualifiers(typeQualifiers, retType, pos); if (soaWidth > 0) { +#if 0 /* see stmt.cpp in DeclStmt::EmitCode for work-around of SOAType Declaration */ + if (g->target->getISA() == Target::NVPTX) + { + Error(pos, "\"soa\" data types are currently not supported with \"nvptx\" target."); + return NULL; + } +#endif const StructType *st = CastType(retType); if (st == NULL) { @@ -402,6 +409,13 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) { return; } +#if 0 /* NVPTX */ + if (baseType->IsUniformType()) + { + fprintf(stderr, " detected uniform array of size= %d array= %s\n" ,arraySize, + baseType->IsArrayType() ? " true " : " false "); + } +#endif const Type *arrayType = new ArrayType(baseType, arraySize); if (child != NULL) { child->InitFromType(arrayType, ds); @@ -530,9 +544,9 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) { returnType = returnType->ResolveUnboundVariability(Variability::Varying); + bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0); bool isExternC = ds && (ds->storageClass == SC_EXTERN_C); bool isExported = ds && ((ds->typeQualifiers & TYPEQUAL_EXPORT) != 0); - bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0); bool isUnmasked = ds && ((ds->typeQualifiers & TYPEQUAL_UNMASKED) != 0); if (isExported && isTask) { @@ -541,9 +555,9 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) { return; } if (isExternC && isTask) { - Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" " - "qualifiers"); - return; + Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" " + "qualifiers"); + return; } if (isExternC && isExported) { Error(pos, "Function can't have both \"extern \"C\"\" and \"export\" " diff --git a/expr.cpp b/expr.cpp index b5c876fd..4a473fe7 100644 --- a/expr.cpp +++ b/expr.cpp @@ -7867,6 +7867,12 @@ SizeOfExpr::TypeCheck() { "struct type \"%s\".", type->GetString().c_str()); return NULL; } + if (type != NULL) + if (g->target->getISA() == Target::NVPTX && type->IsVaryingType()) + { + Error(pos, "\"sizeof\" with varying data types is not yet supported with \"nvptx\" target."); + return NULL; + } return this; } @@ -8661,6 +8667,11 @@ NewExpr::TypeCheck() { AssertPos(pos, m->errorCount > 0); return NULL; } + if (g->target->getISA() == Target::NVPTX && allocType->IsVaryingType()) + { + Error(pos, "\"new\" with varying data types is not yet supported with \"nvptx\" target."); + return NULL; + } if (CastType(allocType) != NULL) { Error(pos, 
"Can't dynamically allocate storage for declared " "but not defined type \"%s\".", allocType->GetString().c_str()); diff --git a/func.cpp b/func.cpp index 76ae43f5..578dd68a 100644 --- a/func.cpp +++ b/func.cpp @@ -47,6 +47,7 @@ #include #if defined(LLVM_3_1) || defined(LLVM_3_2) + #include #include #include #include @@ -54,6 +55,7 @@ #include #include #else + #include #include #include #include @@ -128,7 +130,7 @@ Function::Function(Symbol *s, Stmt *c) { sym->parentFunction = this; } - if (type->isTask) { + if (type->isTask && g->target->getISA() != Target::NVPTX) { threadIndexSym = m->symbolTable->LookupVariable("threadIndex"); Assert(threadIndexSym); threadCountSym = m->symbolTable->LookupVariable("threadCount"); @@ -239,7 +241,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, #endif const FunctionType *type = CastType(sym->type); Assert(type != NULL); - if (type->isTask == true) { + if (type->isTask == true && g->target->getISA() != Target::NVPTX) { // For tasks, there should always be three parameters: the // pointer to the structure that holds all of the arguments, the // thread index, and the thread count variables. @@ -337,6 +339,16 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function, ctx->SetFunctionMask(argIter); Assert(++argIter == function->arg_end()); } + if (type->isTask == true && g->target->getISA() == Target::NVPTX) + { + llvm::NamedMDNode* annotations = + m->module->getOrInsertNamedMetadata("nvvm.annotations"); + llvm::SmallVector av; + av.push_back(function); + av.push_back(llvm::MDString::get(*g->ctx, "kernel")); + av.push_back(LLVMInt32(1)); + annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); + } } // Finally, we can generate code for the function @@ -492,13 +504,28 @@ Function::GenerateIR() { // the application can call it const FunctionType *type = CastType(sym->type); Assert(type != NULL); - if (type->isExported) { + if (type->isExported) { if (!type->isTask) { llvm::FunctionType *ftype = type->LLVMFunctionType(g->ctx, true); llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::ExternalLinkage; std::string functionName = sym->name; + if (g->mangleFunctionsWithTarget) functionName += std::string("_") + g->target->GetISAString(); + + if (g->target->getISA() == Target::NVPTX) + { + functionName += std::string("___export"); /* add ___export to the end, for ptxcc to recognize it is exported */ +#if 0 + llvm::NamedMDNode* annotations = + m->module->getOrInsertNamedMetadata("nvvm.annotations"); + llvm::SmallVector av; + av.push_back(function); + av.push_back(llvm::MDString::get(*g->ctx, "kernel")); + av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1)); + annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); +#endif + } llvm::Function *appFunction = llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module); #if defined(LLVM_3_1) @@ -538,6 +565,16 @@ Function::GenerateIR() { FATAL("Function verificication failed"); } } + if (g->target->getISA() == Target::NVPTX) + { + llvm::NamedMDNode* annotations = + m->module->getOrInsertNamedMetadata("nvvm.annotations"); + llvm::SmallVector av; + av.push_back(appFunction); + av.push_back(llvm::MDString::get(*g->ctx, "kernel")); + av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1)); + annotations->addOperand(llvm::MDNode::get(*g->ctx, av)); + } } } } diff --git a/ispc.cpp b/ispc.cpp index 1386d65e..bd973517 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -280,6 +280,9 @@ Target::Target(const char *arch, const char *cpu, 
const char *isa, bool pic) : arch = "arm"; else #endif + if(!strncmp(isa, "nvptx", 5)) + arch = "nvptx64"; + else arch = "x86-64"; } @@ -707,6 +710,19 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskBitCount = 32; } #endif + else if (!strcasecmp(isa, "nvptx")) + { + this->m_isa = Target::NVPTX; + this->m_cpu = "sm_35"; + this->m_nativeVectorWidth = 32; + this->m_nativeVectorAlignment = 32; + this->m_vectorWidth = 1; + this->m_hasHalf = true; + this->m_maskingIsFree = true; + this->m_maskBitCount = 1; + this->m_hasTranscendentals = false; + this->m_hasGather = this->m_hasScatter = false; + } else { Error(SourcePos(), "Target \"%s\" is unknown. Choices are: %s.", isa, SupportedTargets()); @@ -784,7 +800,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : // Initialize target-specific "target-feature" attribute. if (!m_attributes.empty()) { llvm::AttrBuilder attrBuilder; - attrBuilder.addAttribute("target-cpu", this->m_cpu); + if (m_isa != Target::NVPTX) + attrBuilder.addAttribute("target-cpu", this->m_cpu); attrBuilder.addAttribute("target-features", this->m_attributes); this->m_tf_attributes = new llvm::AttributeSet( llvm::AttributeSet::get( @@ -839,7 +856,7 @@ Target::SupportedTargets() { "avx1.1-i32x8, avx1.1-i32x16, avx1.1-i64x4 " "avx2-i32x8, avx2-i32x16, avx2-i64x4, " "generic-x1, generic-x4, generic-x8, generic-x16, " - "generic-x32, generic-x64"; + "generic-x32, generic-x64, nvptx"; } @@ -866,6 +883,8 @@ Target::GetTripleString() const { triple.setArchName("i386"); else if (m_arch == "x86-64") triple.setArchName("x86_64"); + else if (m_arch == "nvptx64") + triple = llvm::Triple("nvptx64", "nvidia", "cuda"); else triple.setArchName(m_arch); } @@ -898,6 +917,8 @@ Target::ISAToString(ISA isa) { return "avx2"; case Target::GENERIC: return "generic"; + case Target::NVPTX: + return "nvptx"; default: FATAL("Unhandled target in ISAToString()"); } @@ -936,6 +957,8 @@ Target::ISAToTargetString(ISA isa) { return "avx2-i32x8"; case Target::GENERIC: return "generic-4"; + case Target::NVPTX: + return "nvptx"; default: FATAL("Unhandled target in ISAToTargetString()"); } diff --git a/ispc.h b/ispc.h index 4b6df8c3..ffe9739c 100644 --- a/ispc.h +++ b/ispc.h @@ -179,7 +179,7 @@ public: flexible/performant of them will apear last in the enumerant. Note also that __best_available_isa() needs to be updated if ISAs are added or the enumerant values are reordered. */ - enum ISA { + enum ISA { NVPTX, #ifdef ISPC_ARM_ENABLED NEON32, NEON16, NEON8, #endif @@ -606,6 +606,7 @@ struct Globals { /** Indicates that alignment in memory allocation routines should be forced to have given value. -1 value means natural alignment for the platforms. 
*/ int forceAlignment; + std::string PtxString; }; enum { diff --git a/main.cpp b/main.cpp index 99497af5..2815cde9 100644 --- a/main.cpp +++ b/main.cpp @@ -320,6 +320,11 @@ int main(int Argc, char *Argv[]) { LLVMInitializeARMTargetMC(); #endif + LLVMInitializeNVPTXTargetInfo(); + LLVMInitializeNVPTXTarget(); + LLVMInitializeNVPTXAsmPrinter(); + LLVMInitializeNVPTXTargetMC(); + char *file = NULL; const char *headerFileName = NULL; const char *outFileName = NULL; diff --git a/module.cpp b/module.cpp index 94682dc0..a8f521d8 100644 --- a/module.cpp +++ b/module.cpp @@ -444,6 +444,38 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE return; } + if (g->target->getISA() == Target::NVPTX && +#if 0 + !type->IsConstType() && +#endif +#if 1 + at != NULL && +#endif + type->IsVaryingType()) + { + Error(pos, "Global \"varying\" variables are not yet supported in \"nvptx\" target."); + return; +#if 0 + int nel = 32; /* warp-size */ + if (type->IsArrayType()) + { + const ArrayType *at = CastType(type); + /* we must scale # elements by 4, because a thread-block will run 4 warps + * or 128 threads. + * ***note-to-me***:please define these value (128threads/4warps) + * in nvptx-target definition + * instead of compile-time constants + */ + nel *= at->GetElementCount(); + assert (!type->IsSOAType()); + type = new ArrayType(at->GetElementType()->GetAsUniformType(), nel); + } + else + type = new ArrayType(type->GetAsUniformType(), nel); +#endif + } + + llvm::Type *llvmType = type->LLVMType(g->ctx); if (llvmType == NULL) return; @@ -643,6 +675,21 @@ lCheckExportedParameterTypes(const Type *type, const std::string &name, } } +static void +lCheckTaskParameterTypes(const Type *type, const std::string &name, + SourcePos pos) { + if (g->target->getISA() != Target::NVPTX) + return; + if (lRecursiveCheckValidParamType(type, false) == false) { + if (CastType(type)) + Error(pos, "Vector-typed parameter \"%s\" is illegal in a task " + "function with \"nvptx\" target.", name.c_str()); + else + Error(pos, "Varying parameter \"%s\" is illegal in a task function with \"nvptx\" target.", + name.c_str()); + } +} + /** Given a function type, loop through the function parameters and see if any are StructTypes. If so, issue an error; this is currently broken @@ -801,7 +848,8 @@ Module::AddFunctionDeclaration(const std::string &name, #else // LLVM 3.1 and 3.3+ function->addFnAttr(llvm::Attribute::AlwaysInline); #endif - if (functionType->isTask) + /* evghenii: fails function verification when "if" executed in nvptx target */ + if (functionType->isTask && g->target->getISA() != Target::NVPTX) // This also applies transitively to members I think? 
#if defined(LLVM_3_1) function->setDoesNotAlias(1, true); @@ -822,6 +870,13 @@ Module::AddFunctionDeclaration(const std::string &name, Type::Equal(functionType->GetReturnType(), AtomicType::Void) == false) Error(pos, "Task-qualified functions must have void return type."); + if (g->target->getISA() == Target::NVPTX && + Type::Equal(functionType->GetReturnType(), AtomicType::Void) == false && + functionType->isExported) + { + Error(pos, "Export-qualified functions must have void return type with \"nvptx\" target."); + } + if (functionType->isExported || functionType->isExternC) lCheckForStructParameters(functionType, pos); @@ -841,6 +896,9 @@ Module::AddFunctionDeclaration(const std::string &name, if (functionType->isExported) { lCheckExportedParameterTypes(argType, argName, argPos); } + if (functionType->isTask) { + lCheckTaskParameterTypes(argType, argName, argPos); + } // ISPC assumes that no pointers alias. (It should be possible to // specify when this is not the case, but this should be the @@ -959,7 +1017,13 @@ Module::writeOutput(OutputType outputType, const char *outFileName, const char *fileType = NULL; switch (outputType) { case Asm: - if (strcasecmp(suffix, "s")) + if (g->target->getISA() != Target::NVPTX) + { + if (strcasecmp(suffix, "s")) + fileType = "assembly"; + } + else + if (strcasecmp(suffix, "ptx")) fileType = "assembly"; break; case Bitcode: @@ -1057,6 +1121,11 @@ Module::writeBitcode(llvm::Module *module, const char *outFileName) { } llvm::raw_fd_ostream fos(fd, (fd != 1), false); + if (g->target->getISA() == Target::NVPTX) + { + const std::string dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"; + module->setDataLayout(dl_string); + } llvm::WriteBitcodeToFile(module, fos); return true; } @@ -2095,6 +2164,24 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre opts.addMacroDef(g->cppArgs[i].substr(2)); } } + if (g->target->getISA() == Target::NVPTX) + { + opts.addMacroDef("__NVPTX__"); + opts.addMacroDef("programIndex=__programIndex()"); + opts.addMacroDef("cif=if"); + opts.addMacroDef("cfor=for"); + opts.addMacroDef("cwhile=while"); + opts.addMacroDef("ccontinue=continue"); + opts.addMacroDef("cdo=do"); + opts.addMacroDef("taskIndex0=__taskIndex0()"); + opts.addMacroDef("taskIndex1=__taskIndex1()"); + opts.addMacroDef("taskIndex2=__taskIndex2()"); + opts.addMacroDef("taskIndex=__taskIndex()"); + opts.addMacroDef("taskCount0=__taskCount0()"); + opts.addMacroDef("taskCount1=__taskCount1()"); + opts.addMacroDef("taskCount2=__taskCount2()"); + opts.addMacroDef("taskCount=__taskCount()"); + } #if defined(LLVM_3_1) inst.getLangOpts().BCPLComment = 1; @@ -2540,6 +2627,29 @@ lCreateDispatchModule(std::map &functions) return module; } +static std::string lCBEMangle(const std::string &S) { + std::string Result; + + for (unsigned i = 0, e = S.size(); i != e; ++i) { + if (i+1 != e && ((S[i] == '>' && S[i+1] == '>') || + (S[i] == '<' && S[i+1] == '<'))) { + Result += '_'; + Result += 'A'+(S[i]&15); + Result += 'A'+((S[i]>>4)&15); + Result += '_'; + i++; + } else if (isalnum(S[i]) || S[i] == '_' || S[i] == '<' || S[i] == '>') { + Result += S[i]; + } else { + Result += '_'; + Result += 'A'+(S[i]&15); + Result += 'A'+((S[i]>>4)&15); + Result += '_'; + } + } + return Result; +} + int Module::CompileAndOutput(const char *srcFile, @@ -2555,7 +2665,7 @@ Module::CompileAndOutput(const char *srcFile, const char *hostStubFileName, const char *devStubFileName) { - if 
(target == NULL || strchr(target, ',') == NULL) { + if (target == NULL || strchr(target, ',') == NULL) { // We're only compiling to a single target g->target = new Target(arch, cpu, target, generatePIC); if (!g->target->isValid()) @@ -2563,6 +2673,32 @@ Module::CompileAndOutput(const char *srcFile, m = new Module(srcFile); if (m->CompileFile() == 0) { + + /* NVPTX: + * for PTX target replace '.' with '_' in all global variables + * a PTX identifier name must match [a-zA-Z$_][a-zA-Z$_0-9]* + */ + if (g->target->getISA() == Target::NVPTX) + { + /* mangle global variables names */ + { + llvm::Module::global_iterator I = m->module->global_begin(), E = m->module->global_end(); + for (; I != E; I++) + I->setName(lCBEMangle(I->getName())); + } + + /* mangle functions names */ + { + llvm::Module::iterator I = m->module->begin(), E = m->module->end(); + for (; I != E; I++) + { + std::string str = I->getName(); + if (str.find("operator") != std::string::npos) + I->setName(lCBEMangle(str)); + } + } + } + if (outputType == CXX) { if (target == NULL || strncmp(target, "generic-", 8) != 0) { Error(SourcePos(), "When generating C++ output, one of the \"generic-*\" " @@ -2765,4 +2901,5 @@ Module::CompileAndOutput(const char *srcFile, return errorCount > 0; } + return true; } diff --git a/nvptxcc b/nvptxcc new file mode 100755 index 00000000..81d622e9 --- /dev/null +++ b/nvptxcc @@ -0,0 +1,20 @@ +#!/bin/sh + +PATH=$ISPC_HOME/examples_ptx/ptxcc:$ISPC_HOME/examples_ptx/ptxgen:$PATH +PTXCC=ptxcc +ARGS=${@:2} +if [ "$NVVM" == "1" ]; +then + LLVM32=$HOME/usr/local/llvm/bin-3.2 + LLVMDIS=$LLVM32/bin/llvm-dis + PTXGEN=$ISPC_HOME/examples_ptx/ptxgen/ptxgen + $($LLVMDIS $1 -o $1.ll) && $($PTXGEN $1.ll > $1.ptx) && \ + $($PTXCC $1.ptx -o $1.o -Xnvcc="-G") && \ + $(nvcc test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $1.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt $ARGS); +else + $($PTXCC $1 -o $1.o -Xnvcc="-G") && \ + $(nvcc test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $1.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt $ARGS); +fi + + + diff --git a/opt.cpp b/opt.cpp index 9c66ade1..a54805db 100644 --- a/opt.cpp +++ b/opt.cpp @@ -133,6 +133,7 @@ static llvm::Pass *CreateDebugPass(char * output); static llvm::Pass *CreateReplaceStdlibShiftPass(); static llvm::Pass *CreateFixBooleanSelectPass(); +static llvm::Pass *CreatePromoteLocalToPrivatePass(); #define DEBUG_START_PASS(NAME) \ if (g->debugPrint && \ @@ -496,7 +497,11 @@ Optimize(llvm::Module *module, int optLevel) { // run absolutely no optimizations, since the front-end needs us to // take the various __pseudo_* functions it has emitted and turn // them into something that can actually execute. 
- optPM.add(CreateImproveMemoryOpsPass(), 100); + + if (g->opt.disableGatherScatterOptimizations == false && + g->target->getVectorWidth() > 1) + optPM.add(CreateImproveMemoryOpsPass(), 100); + if (g->opt.disableHandlePseudoMemoryOps == false) optPM.add(CreateReplacePseudoMemoryOpsPass()); @@ -519,6 +524,8 @@ Optimize(llvm::Module *module, int optLevel) { llvm::initializeInstrumentation(*registry); llvm::initializeTarget(*registry); + if (g->target->getISA() == Target::NVPTX) + optPM.add(CreatePromoteLocalToPrivatePass()); optPM.add(llvm::createGlobalDCEPass(), 185); // Setup to use LLVM default AliasAnalysis @@ -577,7 +584,10 @@ Optimize(llvm::Module *module, int optLevel) { optPM.add(llvm::createGlobalOptimizerPass()); optPM.add(llvm::createReassociatePass()); optPM.add(llvm::createIPConstantPropagationPass()); - optPM.add(CreateReplaceStdlibShiftPass(),229); + + if (g->target->getISA() != Target::NVPTX) + optPM.add(CreateReplaceStdlibShiftPass(),229); + optPM.add(llvm::createDeadArgEliminationPass(),230); optPM.add(llvm::createInstructionCombiningPass()); optPM.add(llvm::createCFGSimplificationPass()); @@ -689,6 +699,111 @@ Optimize(llvm::Module *module, int optLevel) { // Should be the last optPM.add(CreateFixBooleanSelectPass(), 400); + + if (g->target->getISA() == Target::NVPTX) + { + optPM.add(llvm::createGlobalDCEPass()); + + optPM.add(llvm::createTypeBasedAliasAnalysisPass()); + optPM.add(llvm::createBasicAliasAnalysisPass()); + optPM.add(llvm::createCFGSimplificationPass()); + // Here clang has an experimental pass SROAPass instead of + // ScalarReplAggregatesPass. We should add it in the future. + optPM.add(llvm::createScalarReplAggregatesPass()); + optPM.add(llvm::createEarlyCSEPass()); + optPM.add(llvm::createLowerExpectIntrinsicPass()); + optPM.add(llvm::createTypeBasedAliasAnalysisPass()); + optPM.add(llvm::createBasicAliasAnalysisPass()); + + // Early optimizations to try to reduce the total amount of code to + // work with if we can + optPM.add(llvm::createReassociatePass()); + optPM.add(llvm::createConstantPropagationPass()); + optPM.add(llvm::createDeadInstEliminationPass()); + optPM.add(llvm::createCFGSimplificationPass()); + + optPM.add(llvm::createPromoteMemoryToRegisterPass()); + optPM.add(llvm::createAggressiveDCEPass()); + + + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createDeadInstEliminationPass()); + + // On to more serious optimizations + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createCFGSimplificationPass()); + optPM.add(llvm::createPromoteMemoryToRegisterPass()); + optPM.add(llvm::createGlobalOptimizerPass()); + optPM.add(llvm::createReassociatePass()); + optPM.add(llvm::createIPConstantPropagationPass()); + + optPM.add(llvm::createDeadArgEliminationPass()); + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createCFGSimplificationPass()); + optPM.add(llvm::createPruneEHPass()); + optPM.add(llvm::createFunctionAttrsPass()); + optPM.add(llvm::createFunctionInliningPass()); + optPM.add(llvm::createConstantPropagationPass()); + optPM.add(llvm::createDeadInstEliminationPass()); + optPM.add(llvm::createCFGSimplificationPass()); + + optPM.add(llvm::createArgumentPromotionPass()); +#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3) + // Starting from 3.4 this functionality was moved to + // InstructionCombiningPass. See r184459 for details. 
+ optPM.add(llvm::createSimplifyLibCallsPass()); +#endif + optPM.add(llvm::createAggressiveDCEPass()); + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createJumpThreadingPass()); + optPM.add(llvm::createCFGSimplificationPass()); + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createTailCallEliminationPass()); + + optPM.add(llvm::createInstructionCombiningPass()); + + optPM.add(llvm::createFunctionInliningPass()); + optPM.add(llvm::createConstantPropagationPass()); + + optPM.add(llvm::createInstructionCombiningPass()); + + optPM.add(llvm::createIPSCCPPass()); + optPM.add(llvm::createDeadArgEliminationPass()); + optPM.add(llvm::createAggressiveDCEPass()); + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createCFGSimplificationPass()); + + optPM.add(llvm::createFunctionInliningPass()); + optPM.add(llvm::createArgumentPromotionPass()); + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createCFGSimplificationPass()); + optPM.add(llvm::createReassociatePass()); + optPM.add(llvm::createLoopRotatePass()); + optPM.add(llvm::createLICMPass()); +// optPM.add(llvm::createLoopUnswitchPass(false)); +#if 1 + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createIndVarSimplifyPass()); + optPM.add(llvm::createLoopIdiomPass()); + optPM.add(llvm::createLoopDeletionPass()); + optPM.add(llvm::createLoopUnrollPass()); + optPM.add(llvm::createGVNPass()); + optPM.add(llvm::createMemCpyOptPass()); + optPM.add(llvm::createSCCPPass()); + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createJumpThreadingPass()); + optPM.add(llvm::createCorrelatedValuePropagationPass()); + optPM.add(llvm::createDeadStoreEliminationPass()); + optPM.add(llvm::createAggressiveDCEPass()); + optPM.add(llvm::createCFGSimplificationPass()); + optPM.add(llvm::createInstructionCombiningPass()); + optPM.add(llvm::createFunctionInliningPass()); + optPM.add(llvm::createAggressiveDCEPass()); + optPM.add(llvm::createStripDeadPrototypesPass()); + optPM.add(llvm::createGlobalDCEPass()); + optPM.add(llvm::createConstantMergePass()); +#endif + } } // Finish up by making sure we didn't mess anything up in the IR along @@ -5267,4 +5382,63 @@ CreateFixBooleanSelectPass() { return new FixBooleanSelectPass(); } +/////////////////////////////////////////////////////////////////////////////// +// Detect addrspace(3) +/////////////////////////////////////////////////////////////////////////////// + +class PromoteLocalToPrivatePass: public llvm::BasicBlockPass +{ + public: + static char ID; // Pass identification, replacement for typeid + PromoteLocalToPrivatePass() : BasicBlockPass(ID) {} + + bool runOnBasicBlock(llvm::BasicBlock &BB); +}; + +char PromoteLocalToPrivatePass::ID = 0; + +bool +PromoteLocalToPrivatePass::runOnBasicBlock(llvm::BasicBlock &BB) +{ + std::vector Allocas; + + bool modifiedAny = false; + + llvm::Function *cvtFunc = m->module->getFunction("__cvt_loc2gen_var"); + + // Find allocas that are safe to promote, by looking at all instructions in + // the entry node + for (llvm::BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) + { + llvm::Instruction *inst = &*I; + if (llvm::CallInst *ci = llvm::dyn_cast(inst)) + { + llvm::Function *func = ci->getCalledFunction(); + if (cvtFunc && (cvtFunc == func)) + { +#if 0 + fprintf(stderr , "--found cvt-- name= %s \n", + I->getName().str().c_str()); +#endif + llvm::AllocaInst *alloca = new llvm::AllocaInst(LLVMTypes::Int64Type, "opt_loc2var", ci); + 
assert(alloca != NULL); +#if 0 + const int align = 8; // g->target->getNativeVectorAlignment(); + alloca->setAlignment(align); +#endif + ci->replaceAllUsesWith(alloca); + modifiedAny = true; + } + } + } + return modifiedAny; +} + +static llvm::Pass * +CreatePromoteLocalToPrivatePass() { + return new PromoteLocalToPrivatePass(); +} + + + diff --git a/ptxtestcc.sh b/ptxtestcc.sh new file mode 100755 index 00000000..2ba5e252 --- /dev/null +++ b/ptxtestcc.sh @@ -0,0 +1,14 @@ +#!/bin/sh +LLC=$HOME/usr/local/llvm/bin-trunk/bin/llc +DIS=$HOME/usr/local/llvm/bin-3.2/bin/llvm-dis + +ISPC=ispc +PTXCC=ptxcc +PTXGEN=~/ptxgen +$(cat $1 |grep -v 'width'|$ISPC --target=nvptx --emit-llvm -o -|$LLC -march=nvptx64 -mcpu=sm_35 -o $1.ptx) && \ +#$(cat $1 |grep -v 'width'|$ISPC --target=nvptx --emit-llvm -o -|$DIS -o $1_32_ptx.ll && $PTXGEN $1_32_ptx.ll > $1.ptx) && \ +$($PTXCC $1.ptx -Xptxas=-v -o $1.ptx.o) && \ +nvcc -o test_nvptx test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $1.ptx.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt -DTEST_SIG=$2 + + + diff --git a/run_tests.py b/run_tests.py index 89e6cd87..671ad416 100755 --- a/run_tests.py +++ b/run_tests.py @@ -204,6 +204,8 @@ def run_test(testname): return (1, 0) else: global is_generic_target + global is_nvptx_target + global is_nvptx_nvvm if is_windows: if is_generic_target: obj_name = "%s.cpp" % os.path.basename(filename) @@ -218,6 +220,13 @@ def run_test(testname): else: if is_generic_target: obj_name = "%s.cpp" % testname + elif is_nvptx_target: + if os.environ.get("NVVM") == "1": + is_nvptx_nvvm = True + obj_name = "%s.bc" % testname + else: + obj_name = "%s.ptx" % testname + is_nvptx_nvvm = False else: obj_name = "%s.o" % testname exe_name = "%s.run" % testname @@ -248,13 +257,32 @@ def run_test(testname): cc_cmd += ' -Wl,-no_pie' if should_fail: cc_cmd += " -DEXPECT_FAILURE" + if is_nvptx_target: + nvptxcc_exe = "nvptxcc" + nvptxcc_exe_rel = add_prefix(nvptxcc_exe) + cc_cmd = "%s %s -DTEST_SIG=%d -o %s" % \ + (nvptxcc_exe_rel, obj_name, match, exe_name) - ispc_cmd = ispc_exe_rel + " --woff %s -o %s --arch=%s --target=%s" % \ + ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --arch=%s --target=%s" % \ (filename, obj_name, options.arch, options.target) if options.no_opt: ispc_cmd += " -O0" if is_generic_target: ispc_cmd += " --emit-c++ --c++-include-file=%s" % add_prefix(options.include_file) + if is_nvptx_target: + filename4ptx = filename+".ptx.parsed_ispc" + grep_cmd = "grep -v 'export uniform int width' %s > %s " % \ + (filename, filename4ptx) + if options.verbose: + print "Grepping: %s" % grep_cmd + sp = subprocess.Popen(grep_cmd, shell=True) + sp.communicate() + if is_nvptx_nvvm: + ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --emit-llvm --target=%s" % \ + (filename4ptx, obj_name, options.target) + else: + ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --emit-asm --target=%s" % \ + (filename4ptx, obj_name, options.target) # compile the ispc code, make the executable, and run it... 
(compile_error, run_error) = run_cmds([ispc_cmd, cc_cmd], @@ -269,7 +297,7 @@ def run_test(testname): basename = os.path.basename(filename) os.unlink("%s.pdb" % basename) os.unlink("%s.ilk" % basename) - os.unlink(obj_name) +# os.unlink(obj_name) except: None @@ -290,6 +318,7 @@ def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test ispc_exe = glob_var[3] global is_generic_target is_generic_target = glob_var[4] + global is_nvptx_target global run_tests_log run_tests_log = glob_var[5] @@ -505,6 +534,8 @@ def run_tests(options1, args, print_version): if options.target == 'neon': options.arch = 'arm' + if options.target == "nvptx": + options.arch = "nvptx64" # use relative path to not depend on host directory, which may possibly # have white spaces and unicode characters. @@ -530,9 +561,11 @@ def run_tests(options1, args, print_version): print_debug("Testing ispc: " + ispc_exe + "\n", s, run_tests_log) ispc_exe += " " + options.ispc_flags - global is_generic_target + global is_generic_target + global is_nvptx_target is_generic_target = (options.target.find("generic-") != -1 and options.target != "generic-1" and options.target != "generic-x1") + is_nvptx_target = (options.target.find("nvptx") != -1) if is_generic_target and options.include_file == None: if options.target == "generic-4" or options.target == "generic-x4": error("No generics #include specified; using examples/intrinsics/sse4.h\n", 2) diff --git a/stdlib.ispc b/stdlib.ispc index 24217cd0..de0e32ed 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -57,6 +57,31 @@ #error Unknown value of ISPC_MASK_BITS #endif + + +/////////////////////////////////////////////////////////////////////////// +// CUDA Specific primitives +// +/***************/ + +__declspec(safe,cost0) static inline varying int __programIndex() { return __program_index(); } +__declspec(safe,cost0) static inline uniform int __programCount() { return __program_count(); } +__declspec(safe,cost0) static inline uniform int __warpIndex() { return __warp_index(); } + +/***************/ + +__declspec(safe,cost0) static inline uniform int __taskIndex0() { return __task_index0(); } +__declspec(safe,cost0) static inline uniform int __taskIndex1() { return __task_index1(); } +__declspec(safe,cost0) static inline uniform int __taskIndex2() { return __task_index2(); } +__declspec(safe,cost0) static inline uniform int __taskIndex () { return __task_index (); } + +/***************/ + +__declspec(safe,cost0) static inline uniform int __taskCount0() { return __task_count0(); } +__declspec(safe,cost0) static inline uniform int __taskCount1() { return __task_count1(); } +__declspec(safe,cost0) static inline uniform int __taskCount2() { return __task_count2(); } +__declspec(safe,cost0) static inline uniform int __taskCount () { return __task_count (); } + /////////////////////////////////////////////////////////////////////////// // Low level primitives @@ -464,7 +489,10 @@ __declspec(safe) static inline uniform int popcnt(bool v) { // As with any() and all(), only count across the active lanes #if (ISPC_MASK_BITS == 1) - return __popcnt_int64(__movmsk(v & __mask)); + if (__is_nvptx_target) + return __popcnt_int64(__movmsk_ptx(v & __mask)); + else + return __popcnt_int64(__movmsk(v & __mask)); #else return __popcnt_int64(__movmsk((UIntMaskType)__sext_varying_bool(v) & __mask)); #endif @@ -1226,6 +1254,11 @@ packed_store_active(uniform int a[], int vals) { return __packed_store_active(a, vals, (IntMaskType)__mask); } +static inline uniform int +packed_store_active(bool 
active, uniform int a[], int vals) { + return __packed_store_active(a, vals, (IntMaskType)(-(int)active)); +} + static inline uniform int packed_store_active2(uniform int a[], int vals) { return __packed_store_active2(a, vals, (IntMaskType)__mask); @@ -1236,6 +1269,9 @@ packed_store_active2(uniform int a[], int vals) { // System information static inline uniform int num_cores() { + if (__is_nvptx_target) + return 15*32; // K20/K20X/K40 - 15SMX x 32 warps/smx (max is 64 warps/smx) + else return __num_cores(); } @@ -1783,7 +1819,7 @@ static inline void memory_barrier() { __memory_barrier(); } -#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \ +#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \ TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \ return ret; \ @@ -1794,6 +1830,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \ return ret; \ } \ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \ + if (__is_nvptx_target) { \ + TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \ + return ret; \ + } else { \ uniform TA * uniform ptrArray[programCount]; \ ptrArray[programIndex] = ptr; \ TA ret; \ @@ -1804,10 +1844,15 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \ ret = insert(ret, i, r); \ } \ return ret; \ + } \ } \ -#define DEFINE_ATOMIC_SWAP(TA,TB) \ +#define DEFINE_ATOMIC_SWAP(TA,TB,MASKTYPE,TC) \ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \ + if (__is_nvptx_target) { \ + TA ret = __atomic_swap_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \ + return ret; \ + } else { \ uniform int i = 0; \ TA ret[programCount]; \ TA memVal; \ @@ -1838,6 +1883,7 @@ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \ originally got back from memory... 
*/ \ ret[lastSwap] = memVal; \ return ret[programIndex]; \ + }\ } \ static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \ uniform TA value) { \ @@ -1845,6 +1891,10 @@ static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \ return ret; \ } \ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \ + if (__is_nvptx_target) { \ + TA ret = __atomic_swap_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \ + return ret; \ + } else { \ uniform TA * uniform ptrArray[programCount]; \ ptrArray[programIndex] = ptr; \ TA ret; \ @@ -1855,9 +1905,10 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \ ret = insert(ret, i, r); \ } \ return ret; \ + }\ } \ -#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \ +#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \ static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \ uniform TA oneval = reduce_##OPA(value); \ TA ret; \ @@ -1872,6 +1923,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \ } \ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \ TA value) { \ + if (__is_nvptx_target) { \ + TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \ + return ret; \ + } else { \ uniform TA * uniform ptrArray[programCount]; \ ptrArray[programIndex] = ptr; \ TA ret; \ @@ -1882,57 +1937,58 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \ ret = insert(ret, i, r); \ } \ return ret; \ + } \ } -DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType) -DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType) -DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min) -DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max) -DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType) -DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType) -DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType) -DEFINE_ATOMIC_SWAP(int32,int32) +DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType,int64) +DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType,int64) +DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType,int64) +DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType,int64) +DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType,int64) +DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType,int64) +DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType,int64) +DEFINE_ATOMIC_SWAP(int32,int32,IntMaskType,int64) // For everything but atomic min and max, we can use the same // implementations for unsigned as for signed. 
-DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType) -DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin) -DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax) -DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType) -DEFINE_ATOMIC_SWAP(unsigned int32,int32) +DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType, unsigned int64) +DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType, unsigned int64) +DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType, unsigned int64) +DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType, unsigned int64) +DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType, unsigned int64) +DEFINE_ATOMIC_SWAP(unsigned int32,int32,UIntMaskType, unsigned int64) -DEFINE_ATOMIC_SWAP(float,float) +DEFINE_ATOMIC_SWAP(float,float,IntMaskType,int64) -DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType) -DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType) -DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min) -DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max) -DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType) -DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType) -DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType) -DEFINE_ATOMIC_SWAP(int64,int64) +DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType,int64) +DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType,int64) +DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType,int64) +DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType,int64) +DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType,int64) +DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType,int64) +DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType,int64) +DEFINE_ATOMIC_SWAP(int64,int64,IntMaskType, int64) // For everything but atomic min and max, we can use the same // implementations for unsigned as for signed. 
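The new TC macro parameter exists so that, on the NVPTX branch, a varying pointer can be handed to the __atomic_*_varying_*_global built-ins as 64-bit integers via the (TC)ptr cast, instead of being scattered into a per-lane array and processed serially. Per lane, that cast is just a bit-preserving reinterpretation of the address, as this minimal scalar round-trip shows (a sketch assuming a 64-bit host, which is what the int64/unsigned int64 choice above encodes):

    #include <cassert>
    #include <cstdint>

    int main() {
        int32_t x = 0;
        // (TC)ptr for a single lane: carry the address as a 64-bit integer...
        uintptr_t bits = reinterpret_cast<uintptr_t>(&x);
        static_assert(sizeof(uintptr_t) == sizeof(uint64_t), "64-bit host assumed");
        // ...and recover the exact same pointer on the other side.
        int32_t *back = reinterpret_cast<int32_t *>(bits);
        assert(back == &x);
        *back = 42;
        return (x == 42) ? 0 : 1;
    }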
-DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType) -DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin) -DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax) -DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType) -DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType) -DEFINE_ATOMIC_SWAP(unsigned int64,int64) +DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType,unsigned int64) +DEFINE_ATOMIC_SWAP(unsigned int64,int64,UIntMaskType, unsigned int64) -DEFINE_ATOMIC_SWAP(double,double) +DEFINE_ATOMIC_SWAP(double,double,IntMaskType, int64) #undef DEFINE_ATOMIC_OP #undef DEFINE_ATOMIC_MINMAX_OP #undef DEFINE_ATOMIC_SWAP -#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \ +#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE, TC) \ static inline uniform TA atomic_compare_exchange_global( \ uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \ uniform TA ret = \ @@ -1947,6 +2003,10 @@ static inline TA atomic_compare_exchange_global( \ } \ static inline TA atomic_compare_exchange_global( \ uniform TA * varying ptr, TA oldval, TA newval) { \ + if (__is_nvptx_target) { \ + TA ret = __atomic_compare_exchange_varying_##TB##_global((TC)ptr, oldval, newval, (MASKTYPE)__mask); \ + return ret; \ + } else { \ uniform TA * uniform ptrArray[programCount]; \ ptrArray[programIndex] = ptr; \ TA ret; \ @@ -1958,14 +2018,15 @@ static inline TA atomic_compare_exchange_global( \ ret = insert(ret, i, r); \ } \ return ret; \ + } \ } -ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType) -ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType) -ATOMIC_DECL_CMPXCHG(float, float, IntMaskType) -ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType) -ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType) -ATOMIC_DECL_CMPXCHG(double, double, IntMaskType) +ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType,int64) +ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType,unsigned int64) +ATOMIC_DECL_CMPXCHG(float, float, IntMaskType,int64) +ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType,int64) +ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType,unsigned int64) +ATOMIC_DECL_CMPXCHG(double, double, IntMaskType,int64) #undef ATOMIC_DECL_CMPXCHG @@ -2032,12 +2093,20 @@ static inline TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, TYPE value) } \ static inline TYPE atomic_##NAME##_local(uniform TYPE * p, TYPE value) { \ TYPE ret; \ + if (__is_nvptx_target) { \ + foreach_active (i) { \ + uniform TYPE * uniform ptr = (uniform TYPE * uniform)extract((int64)p, i); \ + ret = insert(ret, i, *ptr); \ + *ptr = OPFUNC(*ptr, extract(value, i)); \ + } \ + } else { \ uniform TYPE * uniform ptrs[programCount]; \ ptrs[programIndex] = p; \ foreach_active (i) { \ ret = insert(ret, i, *ptrs[i]); \ *ptrs[i] = OPFUNC(*ptrs[i], extract(value, i)); \ } \ + } \ return ret; \ } diff --git a/stmt.cpp b/stmt.cpp index 52d25fe9..ee9e819c 100644 --- a/stmt.cpp +++ b/stmt.cpp @@ -142,6 +142,62 @@ lHasUnsizedArrays(const Type *type) { 
return lHasUnsizedArrays(at->GetElementType()); } +static llvm::Value* lConvertToGenericPtr(FunctionEmitContext *ctx, llvm::Value *value, const SourcePos ¤tPos, const bool variable = false) +{ + if (!value->getType()->isPointerTy() || g->target->getISA() != Target::NVPTX) + return value; + llvm::PointerType *pt = llvm::dyn_cast(value->getType()); + const int addressSpace = pt->getAddressSpace(); + if (addressSpace != 3 && addressSpace != 4) + return value; + + llvm::Type *elTy = pt->getElementType(); + + /* convert elTy addrspace(3)* to i64* addrspace(3)* */ + llvm::PointerType *Int64Ptr3 = llvm::PointerType::get(LLVMTypes::Int64Type, addressSpace); + value = ctx->BitCastInst(value, Int64Ptr3, "gep2gen_cast1"); + + /* convert i64* addrspace(3) to i64* */ + llvm::Function *__cvt2gen = m->module->getFunction( + addressSpace == 3 ? (variable ? "__cvt_loc2gen_var" : "__cvt_loc2gen") : "__cvt_const2gen"); + + std::vector __cvt2gen_args; + __cvt2gen_args.push_back(value); + value = llvm::CallInst::Create(__cvt2gen, __cvt2gen_args, variable ? "gep2gen_cvt_var" : "gep2gen_cvt", ctx->GetCurrentBasicBlock()); + + /* compute offset */ + if (addressSpace == 3) + { + assert(elTy->isArrayTy()); + const int numElTot = elTy->getArrayNumElements(); + const int numEl = numElTot/4; +#if 0 + fprintf(stderr, " --- detected addrspace(3) sz= %d --- \n", numEl); +#endif + llvm::ArrayType *arrTy = llvm::dyn_cast(pt->getArrayElementType()); + assert(arrTy != NULL); + llvm::Type *arrElTy = arrTy->getElementType(); +#if 0 + if (arrElTy->isArrayTy()) + Error(currentPos, "Currently \"nvptx\" target doesn't support array-of-array"); +#endif + + /* convert i64* to errElTy* */ + llvm::PointerType *arrElTyPt0 = llvm::PointerType::get(arrElTy, 0); + value = ctx->BitCastInst(value, arrElTyPt0, "gep2gen_cast2"); + + llvm::Function *func_warp_index = m->module->getFunction("__warp_index"); + llvm::Value *warpId = ctx->CallInst(func_warp_index, NULL, std::vector(), "gep2gen_warp_index"); + llvm::Value *offset = ctx->BinaryOperator(llvm::Instruction::Mul, warpId, LLVMInt32(numEl), "gep2gen_offset"); + value = llvm::GetElementPtrInst::Create(value, offset, "gep2gen_offset", ctx->GetCurrentBasicBlock()); + } + + /* convert arrElTy* to elTy* */ + llvm::PointerType *elTyPt0 = llvm::PointerType::get(elTy, 0); + value = ctx->BitCastInst(value, elTyPt0, "gep2gen_cast3"); + + return value; +} void DeclStmt::EmitCode(FunctionEmitContext *ctx) const { @@ -205,7 +261,22 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const { return; } + if (sym->storageClass == SC_STATIC) { + + if (g->target->getISA() == Target::NVPTX && !sym->type->IsConstType()) + PerformanceWarning(sym->pos, + "Non-constant static variable ""\"%s\" is stored in __global address sace with ""\"nvptx\" target.", + sym->name.c_str()); + if (g->target->getISA() == Target::NVPTX && sym->type->IsVaryingType()) + PerformanceWarning(sym->pos, + "\"const static varying\" variable ""\"%s\" is stored in __global address space with ""\"nvptx\" target.", + sym->name.c_str()); + if (g->target->getISA() == Target::NVPTX && sym->type->IsUniformType()) + PerformanceWarning(sym->pos, + "\"const static uniform\" variable ""\"%s\" is stored in __constant address space with ""\"nvptx\" target.", + sym->name.c_str()); + // For static variables, we need a compile-time constant value // for its initializer; if there's no initializer, we use a // zero value. 
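lConvertToGenericPtr() above does two jobs for the NVPTX target: it funnels addrspace(3)/addrspace(4) pointers through the __cvt_loc2gen / __cvt_const2gen helpers to obtain a generic pointer, and, for __shared__ data, it offsets that pointer by warpIndex * (totalElements / 4) so that each of the four warps in a 128-thread block gets its own slice of the over-allocated array. The slice arithmetic in isolation (the 4-warps-per-block figure is the patch's hard-coded assumption, and warpSlice is an illustrative name, not a function from the patch):

    #include <cstddef>

    // Mirror of the warpId * numEl offset computed in lConvertToGenericPtr():
    // the shared array is allocated numWarpsPerBlock times larger than declared,
    // and each warp indexes into its own contiguous slice.
    template <typename T>
    T *warpSlice(T *sharedBase, size_t totalElements, int warpId,
                 int numWarpsPerBlock = 4) {
        size_t elementsPerWarp = totalElements / numWarpsPerBlock;
        return sharedBase + size_t(warpId) * elementsPerWarp;
    }

    // Example: a "uniform float a[8]" is emitted as float[32] in addrspace(3);
    // warp 2 then works on elements [16, 24) of the underlying allocation.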
@@ -233,19 +304,97 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const { if (cinit == NULL) cinit = llvm::Constant::getNullValue(llvmType); + int addressSpace = 0; + if (g->target->getISA() == Target::NVPTX && + sym->type->IsConstType() && + sym->type->IsUniformType()) + addressSpace = 4; + // Allocate space for the static variable in global scope, so // that it persists across function calls sym->storagePtr = new llvm::GlobalVariable(*m->module, llvmType, sym->type->IsConstType(), llvm::GlobalValue::InternalLinkage, cinit, - llvm::Twine("static.") + + llvm::Twine("static_") + llvm::Twine(sym->pos.first_line) + - llvm::Twine(".") + sym->name.c_str()); + llvm::Twine("_") + sym->name.c_str(), + NULL, + llvm::GlobalVariable::NotThreadLocal, + addressSpace); + sym->storagePtr = lConvertToGenericPtr(ctx, sym->storagePtr, sym->pos); // Tell the FunctionEmitContext about the variable ctx->EmitVariableDebugInfo(sym); } - else { + else if ((sym->type->IsUniformType() || sym->type->IsSOAType()) && + /* NVPTX: + * only non-constant uniform data types are stored in shared memory + * constant uniform are automatically promoted to varying + */ + !sym->type->IsConstType() && +#if 1 + sym->type->IsArrayType() && +#endif + g->target->getISA() == Target::NVPTX) + { + PerformanceWarning(sym->pos, + "Non-constant \"uniform\" data types might be slow with \"nvptx\" target. " + "Unless data sharing between program instances is desired, try \"const [static] uniform\", \"varying\" or \"uniform new uniform \"+\"delete\" if possible."); + + /* with __shared__ memory everything must be an array */ + int nel = 4; + ArrayType *nat; + bool variable = true; + if (sym->type->IsArrayType()) + { + const ArrayType *at = CastType(sym->type); + /* we must scale # elements by 4, because a thread-block will run 4 warps + * or 128 threads. + * ***note-to-me***:please define these value (128threads/4warps) + * in nvptx-target definition + * instead of compile-time constants + */ + nel *= at->GetElementCount(); + if (sym->type->IsSOAType()) + nel *= sym->type->GetSOAWidth(); + nat = new ArrayType(at->GetElementType(), nel); + variable = false; + } + else + nat = new ArrayType(sym->type, nel); + + llvm::Type *llvmTypeUn = nat->LLVMType(g->ctx); + llvm::Constant *cinit = llvm::UndefValue::get(llvmTypeUn); + + sym->storagePtr = + new llvm::GlobalVariable(*m->module, llvmTypeUn, + sym->type->IsConstType(), + llvm::GlobalValue::InternalLinkage, + cinit, + llvm::Twine("local_") + + llvm::Twine(sym->pos.first_line) + + llvm::Twine("_") + sym->name.c_str(), + NULL, + llvm::GlobalVariable::NotThreadLocal, + /*AddressSpace=*/3); + sym->storagePtr = lConvertToGenericPtr(ctx, sym->storagePtr, sym->pos, variable); + llvm::PointerType *ptrTy = llvm::PointerType::get(sym->type->LLVMType(g->ctx),0); + sym->storagePtr = ctx->BitCastInst(sym->storagePtr, ptrTy, "uniform_decl"); + + // Tell the FunctionEmitContext about the variable; must do + // this before the initializer stuff. + ctx->EmitVariableDebugInfo(sym); + + if (initExpr == 0 && sym->type->IsConstType()) + Error(sym->pos, "Missing initializer for const variable " + "\"%s\".", sym->name.c_str()); + + // And then get it initialized... 
+ sym->parentFunction = ctx->GetFunction(); + InitSymbol(sym->storagePtr, sym->type, initExpr, ctx, sym->pos); + } + else + { // For non-static variables, allocate storage on the stack sym->storagePtr = ctx->AllocaInst(llvmType, sym->name.c_str()); @@ -253,10 +402,14 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const { // this before the initializer stuff. ctx->EmitVariableDebugInfo(sym); + if (initExpr == 0 && sym->type->IsConstType()) + Error(sym->pos, "Missing initializer for const variable " + "\"%s\".", sym->name.c_str()); + // And then get it initialized... sym->parentFunction = ctx->GetFunction(); InitSymbol(sym->storagePtr, sym->type, initExpr, ctx, sym->pos); - } + } } } @@ -415,6 +568,19 @@ IfStmt::EmitCode(FunctionEmitContext *ctx) const { if (testValue == NULL) return; + +#if 0 + if (!isUniform && g->target->getISA() == Target::NVPTX) + { + /* With "nvptx" target, SIMT hardware takes care of non-uniform + * control flow. We trick ISPC to generate uniform control flow. + */ + testValue = ctx->ExtractInst(testValue, 0); + isUniform = true; + } +#endif + + if (isUniform) { ctx->StartUniformIf(); if (doAllCheck) @@ -696,7 +862,11 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask, // Do any of the program instances want to run the 'true' // block? If not, jump ahead to bNext. +#if 1 llvm::Value *maskAnyTrueQ = ctx->Any(ctx->GetFullMask()); +#else + llvm::Value *maskAnyTrueQ = ctx->ExtractInst(ctx->GetFullMask(),0); +#endif ctx->BranchInst(bRunTrue, bNext, maskAnyTrueQ); // Emit statements for true @@ -713,7 +883,11 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask, // Similarly, check to see if any of the instances want to // run the 'false' block... +#if 1 llvm::Value *maskAnyFalseQ = ctx->Any(ctx->GetFullMask()); +#else + llvm::Value *maskAnyFalseQ = ctx->ExtractInst(ctx->GetFullMask(),0); +#endif ctx->BranchInst(bRunFalse, bDone, maskAnyFalseQ); // Emit code for false @@ -1273,7 +1447,10 @@ static llvm::Value * lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx, llvm::Value *uniformCounterPtr, llvm::Value *varyingCounterPtr, - const std::vector &spans) { + const std::vector &spans) +{ + if (g->target->getISA() != Target::NVPTX) + { // Smear the uniform counter value out to be varying llvm::Value *counter = ctx->LoadInst(uniformCounterPtr); llvm::Value *smearCounter = ctx->BroadcastValue( @@ -1306,6 +1483,93 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx, LLVMInt32Vector(delta), "iter_val"); ctx->StoreInst(varyingCounter, varyingCounterPtr); return varyingCounter; + } + else /* NVPTX == true */ + { + // Smear the uniform counter value out to be varying + llvm::Value *counter = ctx->LoadInst(uniformCounterPtr); + llvm::Value *smearCounter = ctx->BroadcastValue( + counter, LLVMTypes::Int32VectorType, "smear_counter"); + + // Figure out the offsets; this is a little bit tricky. As an example, + // consider a 2D tiled foreach loop, where we're running 8-wide and + // where the inner dimension has a stride of 4 and the outer dimension + // has a stride of 2. For the inner dimension, we want the offsets + // (0,1,2,3,0,1,2,3), and for the outer dimension we want + // (0,0,0,0,1,1,1,1). + int32_t delta[ISPC_MAX_NVEC]; + const int vecWidth = 32; + std::vector constDeltaList; + for (int i = 0; i < vecWidth; ++i) + { + int d = i; + // First, account for the effect of any dimensions at deeper + // nesting levels than the current one. 
+ int prevDimSpanCount = 1; + for (int j = dim; j < nDims-1; ++j) + prevDimSpanCount *= spans[j+1]; + d /= prevDimSpanCount; + + // And now with what's left, figure out our own offset + delta[i] = d % spans[dim]; + constDeltaList.push_back(LLVMInt8(delta[i])); + } + + llvm::ArrayType* ArrayDelta = llvm::ArrayType::get(LLVMTypes::Int8Type, 32); +// llvm::PointerType::get(ArrayDelta, 4); /* constant memory */ + + + llvm::GlobalVariable* globalDelta = new llvm::GlobalVariable( + /*Module=*/*m->module, + /*Type=*/ArrayDelta, + /*isConstant=*/true, + /*Linkage=*/llvm::GlobalValue::PrivateLinkage, + /*Initializer=*/0, // has initializer, specified below + /*Name=*/"constDeltaForeach"); +#if 0 + /*ThreadLocalMode=*/llvm::GlobalVariable::NotThreadLocal, + /*unsigned AddressSpace=*/4 /*constant*/); +#endif + + + llvm::Constant* constDelta = llvm::ConstantArray::get(ArrayDelta, constDeltaList); + + globalDelta->setInitializer(constDelta); + llvm::Function *func_program_index = m->module->getFunction("__program_index"); + llvm::Value *laneIdx = ctx->CallInst(func_program_index, NULL, std::vector(), "foreach__programIndex"); + + std::vector ptr_arrayidx_indices; + ptr_arrayidx_indices.push_back(LLVMInt32(0)); + ptr_arrayidx_indices.push_back(laneIdx); +#if 1 + llvm::Instruction* ptr_arrayidx = llvm::GetElementPtrInst::Create(globalDelta, ptr_arrayidx_indices, "arrayidx", ctx->GetCurrentBasicBlock()); + llvm::LoadInst* int8_39 = new llvm::LoadInst(ptr_arrayidx, "", false, ctx->GetCurrentBasicBlock()); + llvm::Value * int32_39 = ctx->ZExtInst(int8_39, LLVMTypes::Int32Type); + + llvm::VectorType* VectorTy_2 = llvm::VectorType::get(llvm::IntegerType::get(*g->ctx, 32), 1); + llvm::UndefValue* const_packed_41 = llvm::UndefValue::get(VectorTy_2); + + llvm::InsertElementInst* packed_43 = llvm::InsertElementInst::Create( +// llvm::UndefValue(LLVMInt32Vector), + const_packed_41, + int32_39, LLVMInt32(0), "", ctx->GetCurrentBasicBlock()); +#endif + + + // Add the deltas to compute the varying counter values; store the + // result to memory and then return it directly as well. +#if 0 + llvm::Value *varyingCounter = + ctx->BinaryOperator(llvm::Instruction::Add, smearCounter, + LLVMInt32Vector(delta), "iter_val"); +#else + llvm::Value *varyingCounter = + ctx->BinaryOperator(llvm::Instruction::Add, smearCounter, + packed_43, "iter_val"); +#endif + ctx->StoreInst(varyingCounter, varyingCounterPtr); + return varyingCounter; + } } @@ -1383,7 +1647,7 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // This should be caught during typechecking AssertPos(pos, startExprs.size() == dimVariables.size() && - endExprs.size() == dimVariables.size()); + endExprs.size() == dimVariables.size()); int nDims = (int)dimVariables.size(); /////////////////////////////////////////////////////////////////////// @@ -1394,64 +1658,66 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { std::vector nExtras, alignedEnd, extrasMaskPtrs; std::vector span(nDims, 0); - lGetSpans(nDims-1, nDims, g->target->getVectorWidth(), isTiled, &span[0]); + const int vectorWidth = + g->target->getISA() == Target::NVPTX ? 32 : g->target->getVectorWidth(); + lGetSpans(nDims-1, nDims, vectorWidth, isTiled, &span[0]); for (int i = 0; i < nDims; ++i) { - // Basic blocks that we'll fill in later with the looping logic for - // this dimension. 
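The worked 2D example in the comment above (inner deltas 0,1,2,3,0,1,2,3 and outer deltas 0,0,0,0,1,1,1,1 for an 8-wide gang with spans 4 and 2) is easy to check in isolation. The following host-side sketch reproduces the delta[i] formula used to fill the constant per-lane offset table that the NVPTX path indexes with __program_index:

    #include <vector>

    // Reproduce the delta[] computation from lUpdateVaryingCounter(): for lane i
    // of dimension `dim`, divide out the spans of all deeper dimensions, then
    // take the remainder modulo this dimension's own span.
    std::vector<int> foreachDeltas(int dim, const std::vector<int> &spans,
                                   int vecWidth) {
        int nDims = (int)spans.size();
        std::vector<int> delta(vecWidth);
        for (int i = 0; i < vecWidth; ++i) {
            int d = i;
            int prevDimSpanCount = 1;
            for (int j = dim; j < nDims - 1; ++j)
                prevDimSpanCount *= spans[j + 1];
            d /= prevDimSpanCount;
            delta[i] = d % spans[dim];
        }
        return delta;
    }

    // foreachDeltas(1, {2, 4}, 8) -> 0 1 2 3 0 1 2 3   (inner dimension)
    // foreachDeltas(0, {2, 4}, 8) -> 0 0 0 0 1 1 1 1   (outer dimension)
    // which matches the worked example above; the NVPTX path evaluates this with
    // vecWidth = 32 and stores the result in a constant array indexed per lane.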
- bbReset.push_back(ctx->CreateBasicBlock("foreach_reset")); - if (i < nDims-1) - // stepping for the innermost dimension is handled specially - bbStep.push_back(ctx->CreateBasicBlock("foreach_step")); - bbTest.push_back(ctx->CreateBasicBlock("foreach_test")); + // Basic blocks that we'll fill in later with the looping logic for + // this dimension. + bbReset.push_back(ctx->CreateBasicBlock("foreach_reset")); + if (i < nDims-1) + // stepping for the innermost dimension is handled specially + bbStep.push_back(ctx->CreateBasicBlock("foreach_step")); + bbTest.push_back(ctx->CreateBasicBlock("foreach_test")); - // Start and end value for this loop dimension - llvm::Value *sv = startExprs[i]->GetValue(ctx); - llvm::Value *ev = endExprs[i]->GetValue(ctx); - if (sv == NULL || ev == NULL) - return; - startVals.push_back(sv); - endVals.push_back(ev); + // Start and end value for this loop dimension + llvm::Value *sv = startExprs[i]->GetValue(ctx); + llvm::Value *ev = endExprs[i]->GetValue(ctx); + if (sv == NULL || ev == NULL) + return; + startVals.push_back(sv); + endVals.push_back(ev); - // nItems = endVal - startVal - llvm::Value *nItems = - ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems"); + // nItems = endVal - startVal + llvm::Value *nItems = + ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems"); - // nExtras = nItems % (span for this dimension) - // This gives us the number of extra elements we need to deal with - // at the end of the loop for this dimension that don't fit cleanly - // into a vector width. - nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems, - LLVMInt32(span[i]), "nextras")); + // nExtras = nItems % (span for this dimension) + // This gives us the number of extra elements we need to deal with + // at the end of the loop for this dimension that don't fit cleanly + // into a vector width. + nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems, + LLVMInt32(span[i]), "nextras")); - // alignedEnd = endVal - nExtras - alignedEnd.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev, - nExtras[i], "aligned_end")); + // alignedEnd = endVal - nExtras + alignedEnd.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev, + nExtras[i], "aligned_end")); - /////////////////////////////////////////////////////////////////////// - // Each dimension has a loop counter that is a uniform value that - // goes from startVal to endVal, in steps of the span for this - // dimension. Its value is only used internally here for looping - // logic and isn't directly available in the user's program code. - uniformCounterPtrs.push_back(ctx->AllocaInst(LLVMTypes::Int32Type, - "counter")); - ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); + /////////////////////////////////////////////////////////////////////// + // Each dimension has a loop counter that is a uniform value that + // goes from startVal to endVal, in steps of the span for this + // dimension. Its value is only used internally here for looping + // logic and isn't directly available in the user's program code. + uniformCounterPtrs.push_back(ctx->AllocaInst(LLVMTypes::Int32Type, + "counter")); + ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); - // There is also a varying variable that holds the set of index - // values for each dimension in the current loop iteration; this is - // the value that is program-visible. 
- dimVariables[i]->storagePtr = - ctx->AllocaInst(LLVMTypes::Int32VectorType, - dimVariables[i]->name.c_str()); - dimVariables[i]->parentFunction = ctx->GetFunction(); - ctx->EmitVariableDebugInfo(dimVariables[i]); + // There is also a varying variable that holds the set of index + // values for each dimension in the current loop iteration; this is + // the value that is program-visible. + dimVariables[i]->storagePtr = + ctx->AllocaInst(LLVMTypes::Int32VectorType, + dimVariables[i]->name.c_str()); + dimVariables[i]->parentFunction = ctx->GetFunction(); + ctx->EmitVariableDebugInfo(dimVariables[i]); - // Each dimension also maintains a mask that represents which of - // the varying elements in the current iteration should be - // processed. (i.e. this is used to disable the lanes that have - // out-of-bounds offsets.) - extrasMaskPtrs.push_back(ctx->AllocaInst(LLVMTypes::MaskType, "extras mask")); - ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); + // Each dimension also maintains a mask that represents which of + // the varying elements in the current iteration should be + // processed. (i.e. this is used to disable the lanes that have + // out-of-bounds offsets.) + extrasMaskPtrs.push_back(ctx->AllocaInst(LLVMTypes::MaskType, "extras mask")); + ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); } ctx->StartForeach(FunctionEmitContext::FOREACH_REGULAR); @@ -1464,14 +1730,14 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // a given dimension in preparation for running through its loop again, // after the enclosing level advances its counter. for (int i = 0; i < nDims; ++i) { - ctx->SetCurrentBasicBlock(bbReset[i]); - if (i == 0) - ctx->BranchInst(bbExit); - else { - ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); - ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); - ctx->BranchInst(bbStep[i-1]); - } + ctx->SetCurrentBasicBlock(bbReset[i]); + if (i == 0) + ctx->BranchInst(bbExit); + else { + ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]); + ctx->StoreInst(startVals[i], uniformCounterPtrs[i]); + ctx->BranchInst(bbStep[i-1]); + } } /////////////////////////////////////////////////////////////////////////// @@ -1481,67 +1747,67 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // this for the innermost dimension, which has a more complex stepping // structure.. for (int i = 0; i < nDims-1; ++i) { - ctx->SetCurrentBasicBlock(bbStep[i]); - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]); - llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[i]), "new_counter"); - ctx->StoreInst(newCounter, uniformCounterPtrs[i]); - ctx->BranchInst(bbTest[i]); + ctx->SetCurrentBasicBlock(bbStep[i]); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]); + llvm::Value *newCounter = + ctx->BinaryOperator(llvm::Instruction::Add, counter, + LLVMInt32(span[i]), "new_counter"); + ctx->StoreInst(newCounter, uniformCounterPtrs[i]); + ctx->BranchInst(bbTest[i]); } /////////////////////////////////////////////////////////////////////////// // foreach_test (for all dimensions other than the innermost...) 
std::vector inExtras; for (int i = 0; i < nDims-1; ++i) { - ctx->SetCurrentBasicBlock(bbTest[i]); + ctx->SetCurrentBasicBlock(bbTest[i]); - llvm::Value *haveExtras = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT, - endVals[i], alignedEnd[i], "have_extras"); + llvm::Value *haveExtras = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT, + endVals[i], alignedEnd[i], "have_extras"); - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i], "counter"); - llvm::Value *atAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, - counter, alignedEnd[i], "at_aligned_end"); - llvm::Value *inEx = - ctx->BinaryOperator(llvm::Instruction::And, haveExtras, - atAlignedEnd, "in_extras"); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i], "counter"); + llvm::Value *atAlignedEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, + counter, alignedEnd[i], "at_aligned_end"); + llvm::Value *inEx = + ctx->BinaryOperator(llvm::Instruction::And, haveExtras, + atAlignedEnd, "in_extras"); - if (i == 0) - inExtras.push_back(inEx); - else - inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx, - inExtras[i-1], "in_extras_all")); + if (i == 0) + inExtras.push_back(inEx); + else + inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx, + inExtras[i-1], "in_extras_all")); - llvm::Value *varyingCounter = - lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i], - dimVariables[i]->storagePtr, span); + llvm::Value *varyingCounter = + lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i], + dimVariables[i]->storagePtr, span); - llvm::Value *smearEnd = ctx->BroadcastValue( - endVals[i], LLVMTypes::Int32VectorType, "smear_end"); + llvm::Value *smearEnd = ctx->BroadcastValue( + endVals[i], LLVMTypes::Int32VectorType, "smear_end"); - // Do a vector compare of its value to the end value to generate a - // mask for this last bit of work. - llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); - emask = ctx->I1VecToBoolVec(emask); + // Do a vector compare of its value to the end value to generate a + // mask for this last bit of work. + llvm::Value *emask = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + varyingCounter, smearEnd); + emask = ctx->I1VecToBoolVec(emask); - if (i == 0) - ctx->StoreInst(emask, extrasMaskPtrs[i]); - else { - llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]); - llvm::Value *newMask = - ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, - "extras_mask"); - ctx->StoreInst(newMask, extrasMaskPtrs[i]); - } + if (i == 0) + ctx->StoreInst(emask, extrasMaskPtrs[i]); + else { + llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]); + llvm::Value *newMask = + ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, + "extras_mask"); + ctx->StoreInst(newMask, extrasMaskPtrs[i]); + } - llvm::Value *notAtEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, endVals[i]); - ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd); + llvm::Value *notAtEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, endVals[i]); + ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd); } /////////////////////////////////////////////////////////////////////////// @@ -1578,18 +1844,18 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // (i.e. processing extra elements that don't exactly fit into a // vector). 
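The blocks being re-indented here implement the usual two-tier foreach structure: while the uniform counter is below alignedEnd the body runs with an all-on mask, and the nItems % span leftovers get one extra trip through the body under a partial mask built from the counter-vs-end compare. A scalar C++ rendering of that structure, purely for orientation (the real code emits LLVM basic blocks, not loops):

    #include <functional>

    // Scalar sketch of the 1D foreach lowering: full-width chunks first, then a
    // single masked pass over the remainder.
    void foreach1D(int start, int end, int span,
                   const std::function<void(int index, bool active)> &body) {
        int nItems = end - start;
        int nExtras = nItems % span;            // elements that don't fill a vector
        int alignedEnd = end - nExtras;         // last full-width iteration bound

        for (int counter = start; counter < alignedEnd; counter += span)
            for (int lane = 0; lane < span; ++lane)
                body(counter + lane, /*active=*/true);      // mask all-on

        if (nExtras > 0) {                      // one masked trip for the extras
            for (int lane = 0; lane < span; ++lane)
                body(alignedEnd + lane, /*active=*/(alignedEnd + lane) < end);
        }
    }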
llvm::BasicBlock *bbOuterInExtras = - ctx->CreateBasicBlock("outer_in_extras"); + ctx->CreateBasicBlock("outer_in_extras"); llvm::BasicBlock *bbOuterNotInExtras = - ctx->CreateBasicBlock("outer_not_in_extras"); + ctx->CreateBasicBlock("outer_not_in_extras"); ctx->SetCurrentBasicBlock(bbTest[nDims-1]); if (inExtras.size()) - ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras, - inExtras.back()); + ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras, + inExtras.back()); else - // for a 1D iteration domain, we certainly don't have any enclosing - // dimensions that are processing extra elements. - ctx->BranchInst(bbOuterNotInExtras); + // for a 1D iteration domain, we certainly don't have any enclosing + // dimensions that are processing extra elements. + ctx->BranchInst(bbOuterNotInExtras); /////////////////////////////////////////////////////////////////////////// // One or more outer dimensions in extras, so we need to mask for the loop @@ -1604,21 +1870,21 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // // run loop body with mask // } llvm::BasicBlock *bbAllInnerPartialOuter = - ctx->CreateBasicBlock("all_inner_partial_outer"); + ctx->CreateBasicBlock("all_inner_partial_outer"); llvm::BasicBlock *bbPartial = - ctx->CreateBasicBlock("both_partial"); + ctx->CreateBasicBlock("both_partial"); ctx->SetCurrentBasicBlock(bbOuterInExtras); { - // Update the varying counter value here, since all subsequent - // blocks along this path need it. - lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); + // Update the varying counter value here, since all subsequent + // blocks along this path need it. + lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], + dimVariables[nDims-1]->storagePtr, span); - // here we just check to see if counter < alignedEnd - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); - llvm::Value *beforeAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, alignedEnd[nDims-1], "before_aligned_end"); - ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd); + // here we just check to see if counter < alignedEnd + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); + llvm::Value *beforeAlignedEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, alignedEnd[nDims-1], "before_aligned_end"); + ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd); } // Below we have a basic block that runs the loop body code for the @@ -1637,53 +1903,53 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // should step the loop counter for the next enclosing dimension // instead. llvm::Value *stepIndexAfterMaskedBodyPtr = - ctx->AllocaInst(LLVMTypes::BoolType, "step_index"); + ctx->AllocaInst(LLVMTypes::BoolType, "step_index"); /////////////////////////////////////////////////////////////////////////// // We're in the inner loop part where the only masking is due to outer // dimensions but the innermost dimension fits fully into a vector's // width. Set the mask and jump to the masked loop body. 
ctx->SetCurrentBasicBlock(bbAllInnerPartialOuter); { - llvm::Value *mask; - if (nDims == 1) - // 1D loop; we shouldn't ever get here anyway - mask = LLVMMaskAllOff; - else - mask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); + llvm::Value *mask; + if (nDims == 1) + // 1D loop; we shouldn't ever get here anyway + mask = LLVMMaskAllOff; + else + mask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); - ctx->SetInternalMask(mask); + ctx->SetInternalMask(mask); - ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbMaskedBody); + ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbMaskedBody); } /////////////////////////////////////////////////////////////////////////// // We need to include the effect of the innermost dimension in the mask // for the final bits here ctx->SetCurrentBasicBlock(bbPartial); { - llvm::Value *varyingCounter = - ctx->LoadInst(dimVariables[nDims-1]->storagePtr); - llvm::Value *smearEnd = ctx->BroadcastValue( - endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); + llvm::Value *varyingCounter = + ctx->LoadInst(dimVariables[nDims-1]->storagePtr); + llvm::Value *smearEnd = ctx->BroadcastValue( + endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); - llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); - emask = ctx->I1VecToBoolVec(emask); + llvm::Value *emask = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + varyingCounter, smearEnd); + emask = ctx->I1VecToBoolVec(emask); - if (nDims == 1) { - ctx->SetInternalMask(emask); - } - else { - llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); - llvm::Value *newMask = - ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, - "extras_mask"); - ctx->SetInternalMask(newMask); - } + if (nDims == 1) { + ctx->SetInternalMask(emask); + } + else { + llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]); + llvm::Value *newMask = + ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask, + "extras_mask"); + ctx->SetInternalMask(newMask); + } - ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbMaskedBody); + ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbMaskedBody); } /////////////////////////////////////////////////////////////////////////// @@ -1699,14 +1965,14 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // // run loop body with mask // } llvm::BasicBlock *bbPartialInnerAllOuter = - ctx->CreateBasicBlock("partial_inner_all_outer"); + ctx->CreateBasicBlock("partial_inner_all_outer"); ctx->SetCurrentBasicBlock(bbOuterNotInExtras); { - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); - llvm::Value *beforeAlignedEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, alignedEnd[nDims-1], "before_aligned_end"); - ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter, - beforeAlignedEnd); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); + llvm::Value *beforeAlignedEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, alignedEnd[nDims-1], "before_aligned_end"); + ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter, + beforeAlignedEnd); } /////////////////////////////////////////////////////////////////////////// @@ -1716,26 +1982,26 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // value of the varying loop counter and have the statements in the // loop body emit their code. 
llvm::BasicBlock *bbFullBodyContinue = - ctx->CreateBasicBlock("foreach_full_continue"); + ctx->CreateBasicBlock("foreach_full_continue"); ctx->SetCurrentBasicBlock(bbFullBody); { - ctx->SetInternalMask(LLVMMaskAllOn); - ctx->SetBlockEntryMask(LLVMMaskAllOn); - lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); - ctx->SetContinueTarget(bbFullBodyContinue); - ctx->AddInstrumentationPoint("foreach loop body (all on)"); - stmts->EmitCode(ctx); - AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL); - ctx->BranchInst(bbFullBodyContinue); + ctx->SetInternalMask(LLVMMaskAllOn); + ctx->SetBlockEntryMask(LLVMMaskAllOn); + lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], + dimVariables[nDims-1]->storagePtr, span); + ctx->SetContinueTarget(bbFullBodyContinue); + ctx->AddInstrumentationPoint("foreach loop body (all on)"); + stmts->EmitCode(ctx); + AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL); + ctx->BranchInst(bbFullBodyContinue); } ctx->SetCurrentBasicBlock(bbFullBodyContinue); { - ctx->RestoreContinuedLanes(); - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); - llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[nDims-1]), "new_counter"); - ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); - ctx->BranchInst(bbOuterNotInExtras); + ctx->RestoreContinuedLanes(); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); + llvm::Value *newCounter = + ctx->BinaryOperator(llvm::Instruction::Add, counter, + LLVMInt32(span[nDims-1]), "new_counter"); + ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); + ctx->BranchInst(bbOuterNotInExtras); } /////////////////////////////////////////////////////////////////////////// @@ -1743,33 +2009,33 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // less than the end value, in which case we need to run the body one // more time to get the extra bits. 
llvm::BasicBlock *bbSetInnerMask = - ctx->CreateBasicBlock("partial_inner_only"); + ctx->CreateBasicBlock("partial_inner_only"); ctx->SetCurrentBasicBlock(bbPartialInnerAllOuter); { - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); - llvm::Value *beforeFullEnd = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - counter, endVals[nDims-1], "before_full_end"); - ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter"); + llvm::Value *beforeFullEnd = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + counter, endVals[nDims-1], "before_full_end"); + ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd); } /////////////////////////////////////////////////////////////////////////// // The outer dimensions are all on, so the mask is just given by the // mask for the innermost dimension ctx->SetCurrentBasicBlock(bbSetInnerMask); { - llvm::Value *varyingCounter = - lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], - dimVariables[nDims-1]->storagePtr, span); - llvm::Value *smearEnd = ctx->BroadcastValue( - endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); - llvm::Value *emask = - ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, - varyingCounter, smearEnd); - emask = ctx->I1VecToBoolVec(emask); - ctx->SetInternalMask(emask); - ctx->SetBlockEntryMask(emask); + llvm::Value *varyingCounter = + lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1], + dimVariables[nDims-1]->storagePtr, span); + llvm::Value *smearEnd = ctx->BroadcastValue( + endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end"); + llvm::Value *emask = + ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT, + varyingCounter, smearEnd); + emask = ctx->I1VecToBoolVec(emask); + ctx->SetInternalMask(emask); + ctx->SetBlockEntryMask(emask); - ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbMaskedBody); + ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbMaskedBody); } /////////////////////////////////////////////////////////////////////////// @@ -1779,34 +2045,34 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const { // mask known to be all-on, which in turn leads to more efficient code // for that case. 
llvm::BasicBlock *bbStepInnerIndex = - ctx->CreateBasicBlock("step_inner_index"); + ctx->CreateBasicBlock("step_inner_index"); llvm::BasicBlock *bbMaskedBodyContinue = - ctx->CreateBasicBlock("foreach_masked_continue"); + ctx->CreateBasicBlock("foreach_masked_continue"); ctx->SetCurrentBasicBlock(bbMaskedBody); { - ctx->AddInstrumentationPoint("foreach loop body (masked)"); - ctx->SetContinueTarget(bbMaskedBodyContinue); - ctx->DisableGatherScatterWarnings(); - ctx->SetBlockEntryMask(ctx->GetFullMask()); - stmts->EmitCode(ctx); - ctx->EnableGatherScatterWarnings(); - ctx->BranchInst(bbMaskedBodyContinue); + ctx->AddInstrumentationPoint("foreach loop body (masked)"); + ctx->SetContinueTarget(bbMaskedBodyContinue); + ctx->DisableGatherScatterWarnings(); + ctx->SetBlockEntryMask(ctx->GetFullMask()); + stmts->EmitCode(ctx); + ctx->EnableGatherScatterWarnings(); + ctx->BranchInst(bbMaskedBodyContinue); } ctx->SetCurrentBasicBlock(bbMaskedBodyContinue); { - ctx->RestoreContinuedLanes(); - llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr); - ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex); + ctx->RestoreContinuedLanes(); + llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr); + ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex); } /////////////////////////////////////////////////////////////////////////// // step the innermost index, for the case where we're doing the // innermost for loop over full vectors. ctx->SetCurrentBasicBlock(bbStepInnerIndex); { - llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); - llvm::Value *newCounter = - ctx->BinaryOperator(llvm::Instruction::Add, counter, - LLVMInt32(span[nDims-1]), "new_counter"); - ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); - ctx->BranchInst(bbOuterInExtras); + llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]); + llvm::Value *newCounter = + ctx->BinaryOperator(llvm::Instruction::Add, counter, + LLVMInt32(span[nDims-1]), "new_counter"); + ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]); + ctx->BranchInst(bbOuterInExtras); } /////////////////////////////////////////////////////////////////////////// @@ -1993,7 +2259,8 @@ ForeachActiveStmt::EmitCode(FunctionEmitContext *ctx) const { // math...) // Get the "program index" vector value - llvm::Value *programIndex = ctx->ProgramIndexVector(); + llvm::Value *programIndex = g->target->getISA() == Target::NVPTX ? + ctx->ProgramIndexVectorPTX() : ctx->ProgramIndexVector(); // And smear the current lane out to a vector llvm::Value *firstSet32 = @@ -2189,10 +2456,19 @@ ForeachUniqueStmt::EmitCode(FunctionEmitContext *ctx) const { // And load the corresponding element value from the temporary // memory storing the value of the varying expr. 
- llvm::Value *uniqueValuePtr = + llvm::Value *uniqueValue; + if (g->target->getISA() != Target::NVPTX) + { + llvm::Value *uniqueValuePtr = ctx->GetElementPtrInst(exprMem, LLVMInt64(0), firstSet, exprPtrType, - "unique_index_ptr"); - llvm::Value *uniqueValue = ctx->LoadInst(uniqueValuePtr, "unique_value"); + "unique_index_ptr"); + uniqueValue = ctx->LoadInst(uniqueValuePtr, "unique_value"); + } + else /* in case of PTX target, use __shfl PTX intrinsics via __insert/__extract function */ + { + llvm::Value *firstSet32 = ctx->TruncInst(firstSet, LLVMTypes::Int32Type); + uniqueValue = ctx->Extract(exprValue, firstSet32); + } // If it's a varying pointer type, need to convert from the int // type we store in the vector to the actual pointer type @@ -3100,7 +3376,8 @@ PrintStmt::EmitCode(FunctionEmitContext *ctx) const { } // Now we can emit code to call __do_print() - llvm::Function *printFunc = m->module->getFunction("__do_print"); + llvm::Function *printFunc = g->target->getISA() != Target::NVPTX ? + m->module->getFunction("__do_print") : m->module->getFunction("__do_print_nvptx"); AssertPos(pos, printFunc); llvm::Value *mask = ctx->GetFullMask(); diff --git a/test_static_cuda.cpp b/test_static_cuda.cpp new file mode 100644 index 00000000..4e69e298 --- /dev/null +++ b/test_static_cuda.cpp @@ -0,0 +1,440 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#if defined(_WIN32) || defined(_WIN64) +#define ISPC_IS_WINDOWS +#elif defined(__linux__) +#define ISPC_IS_LINUX +#elif defined(__APPLE__) +#define ISPC_IS_APPLE +#endif + +#ifdef ISPC_IS_WINDOWS +#include +#endif // ISPC_IS_WINDOWS + +#include +#include +#include +#include +#ifdef ISPC_IS_LINUX +#include +#endif + +/******************************/ + +#include +#include +#include +#include "drvapi_error_string.h" +#include "ispc_malloc.h" + +#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__) +// These are the inline versions for all of the SDK helper functions +void __checkCudaErrors(CUresult err, const char *file, const int line) { + if(CUDA_SUCCESS != err) { + std::cerr << "checkCudeErrors() Driver API error = " << err << "\"" + << getCudaDrvErrorString(err) << "\" from file <" << file + << ", line " << line << "\n"; + exit(-1); + } +} + + +/******************************/ +/**** Basic CUDriver API ****/ +/******************************/ + +CUcontext context; + +static void createContext(const int deviceId = 0, const bool verbose = true) +{ + CUdevice device; + int devCount; + checkCudaErrors(cuInit(0)); + checkCudaErrors(cuDeviceGetCount(&devCount)); + assert(devCount > 0); + checkCudaErrors(cuDeviceGet(&device, deviceId < devCount ? deviceId : 0)); + + char name[128]; + checkCudaErrors(cuDeviceGetName(name, 128, device)); + if (verbose) + std::cout << "Using CUDA Device [0]: " << name << "\n"; + + int devMajor, devMinor; + checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device)); + if (verbose) + std::cout << "Device Compute Capability: " + << devMajor << "." << devMinor << "\n"; + if (devMajor < 2) { + if (verbose) + std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n"; + exit(1); + } + + // Create driver context + checkCudaErrors(cuCtxCreate(&context, 0, device)); +} +static void destroyContext() +{ + checkCudaErrors(cuCtxDestroy(context)); +} + +static CUmodule loadModule( + const char * module, + const int maxrregcount = 64, + const char cudadevrt_lib[] = "libcudadevrt.a", + const size_t log_size = 32768, + const bool print_log = true + ) +{ + CUmodule cudaModule; + // in this branch we use compilation with parameters + + CUlinkState CUState; + CUlinkState *lState = &CUState; + const int nOptions = 8; + CUjit_option options[nOptions]; + void* optionVals[nOptions]; + float walltime; + size_t logSize = log_size; + char error_log[logSize], + info_log[logSize]; + void *cuOut; + size_t outSize; + int myErr = 0; + + // Setup linker options + // Return walltime from JIT compilation + options[0] = CU_JIT_WALL_TIME; + optionVals[0] = (void*) &walltime; + // Pass a buffer for info messages + options[1] = CU_JIT_INFO_LOG_BUFFER; + optionVals[1] = (void*) info_log; + // Pass the size of the info buffer + options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + optionVals[2] = (void*) logSize; + // Pass a buffer for error message + options[3] = CU_JIT_ERROR_LOG_BUFFER; + optionVals[3] = (void*) error_log; + // Pass the size of the error buffer + options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; + optionVals[4] = (void*) logSize; + // Make the linker verbose + options[5] = CU_JIT_LOG_VERBOSE; + optionVals[5] = (void*) 1; + // Max # of registers/pthread + options[6] = CU_JIT_MAX_REGISTERS; + int jitRegCount = maxrregcount; + optionVals[6] = (void *)(size_t)jitRegCount; + // Caching + options[7] = CU_JIT_CACHE_MODE; + optionVals[7] = (void *)CU_JIT_CACHE_OPTION_CA; + // Create a pending linker invocation + + // Create a pending linker invocation + 
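  // For reference, the CUDA Driver API JIT sequence used below is:
  //   cuLinkCreate()             start a pending link with the options set up above
  //   cuLinkAddData(PTX)         add the PTX image read from __kernels.ptx
  //   cuLinkAddFile(LIBRARY)     add libcudadevrt.a so device-side runtime calls resolve
  //   cuLinkComplete()           JIT-compile and link, yielding a cubin image in memory
  //   cuModuleLoadData(cubin)    load the cubin into a CUmodule
  //   cuLinkDestroy()            release the linker state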
+  checkCudaErrors(cuLinkCreate(nOptions,options, optionVals, lState));
+
+#if 0
+  if (sizeof(void *)==4)
+  {
+    // Load the PTX from the string myPtx32
+    printf("Loading myPtx32[] program\n");
+    // PTX may also be loaded from file, as per below.
+    myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)myPtx32, strlen(myPtx32)+1, 0, 0, 0, 0);
+  }
+  else
+#endif
+  {
+    // Load the PTX from the string myPtx (64-bit)
+    if (print_log)
+      fprintf(stderr, "Loading ptx..\n");
+    myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)module, strlen(module)+1, 0, 0, 0, 0);
+    myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, cudadevrt_lib, 0,0,0);
+    // PTX may also be loaded from file, as per below.
+    // myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_PTX, "myPtx64.ptx",0,0,0);
+  }
+
+  // Complete the linker step
+  myErr = cuLinkComplete(*lState, &cuOut, &outSize);
+
+  if ( myErr != CUDA_SUCCESS )
+  {
+    // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above.
+    fprintf(stderr,"PTX Linker Error:\n%s\n",error_log);
+    assert(0);
+  }
+
+  // Linker walltime and info_log were requested in options above.
+  if (print_log)
+    fprintf(stderr, "CUDA Link Completed in %fms. Linker Output:\n%s\n",walltime,info_log);
+
+  // Load resulting cuBin into module
+  checkCudaErrors(cuModuleLoadData(&cudaModule, cuOut));
+
+  // Destroy the linker invocation
+  checkCudaErrors(cuLinkDestroy(*lState));
+  return cudaModule;
+}
+static void unloadModule(CUmodule &cudaModule)
+{
+  checkCudaErrors(cuModuleUnload(cudaModule));
+}
+
+static CUfunction getFunction(CUmodule &cudaModule, const char * function)
+{
+  CUfunction cudaFunction;
+  checkCudaErrors(cuModuleGetFunction(&cudaFunction, cudaModule, function));
+  return cudaFunction;
+}
+
+static CUdeviceptr deviceMalloc(const size_t size)
+{
+  CUdeviceptr d_buf;
+  checkCudaErrors(cuMemAlloc(&d_buf, size));
+  return d_buf;
+}
+static void deviceFree(CUdeviceptr d_buf)
+{
+  checkCudaErrors(cuMemFree(d_buf));
+}
+static void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size)
+{
+  checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size));
+}
+static void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size)
+{
+  checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
+}
+#define deviceLaunch(func,params) \
+  checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_L1)); \
+  checkCudaErrors( \
+      cuLaunchKernel( \
+        (func), \
+        1,1,1, \
+        32, 1, 1, \
+        0, NULL, (params), NULL \
+        ));
+
+typedef CUdeviceptr devicePtr;
+
+
+/**************/
+#include <vector>
+static std::vector<char> readBinary(const char * filename, const bool print_size = false)
+{
+  std::vector<char> buffer;
+  FILE *fp = fopen(filename, "rb");
+  if (!fp )
+  {
+    fprintf(stderr, "file %s not found\n", filename);
+    assert(0);
+  }
+  fseek(fp, 0, SEEK_END);
+  const unsigned long long size = ftell(fp); /*calc the size needed*/
+  fseek(fp, 0, SEEK_SET);
+  buffer.resize(size);
+
+  if (fp == NULL){ /*ERROR detection if file == empty*/
+    fprintf(stderr, "Error: There was an Error reading the file %s \n",filename);
+    exit(1);
+  }
+  else if (fread(&buffer[0], sizeof(char), size, fp) != size){ /* if count of read bytes != calculated size of .bin file -> ERROR*/
+    fprintf(stderr, "Error: There was an Error reading the file %s \n", filename);
+    exit(1);
+  }
+  if (print_size)
+    fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size());
+  return buffer;
+}
+
+static double CUDALaunch(
+    void **handlePtr,
+    const char * func_name,
+    void **func_args,
+    const bool print_log = true,
+    const int maxrregcount = 64,
+    const char kernel_file[] = "__kernels.ptx",
+    const char cudadevrt_lib[] = "libcudadevrt.a",
+    const int log_size = 32768)
+{
+  fprintf(stderr, " launching kernel: %s \n", func_name);
+  const std::vector<char> module_str = readBinary(kernel_file, print_log);
+  const char * module = &module_str[0];
+  CUmodule cudaModule = loadModule(module, maxrregcount, cudadevrt_lib, log_size, print_log);
+  CUfunction cudaFunction = getFunction(cudaModule, func_name);
+  deviceLaunch(cudaFunction, func_args);
+  checkCudaErrors(cuStreamSynchronize(0));
+  unloadModule(cudaModule);
+  return 0.0;
+}
+/******************************/
+
+
+extern "C" {
+// extern int width();
+  int width() { return 32; }
+  extern void f_v(float *result);
+  extern void f_f(float *result, float *a);
+  extern void f_fu(float *result, float *a, float b);
+  extern void f_fi(float *result, float *a, int *b);
+  extern void f_du(float *result, double *a, double b);
+  extern void f_duf(float *result, double *a, float b);
+  extern void f_di(float *result, double *a, int *b);
+  extern void result(float *val);
+}
+
+
+#if defined(_WIN32) || defined(_WIN64)
+#define ALIGN
+#else
+#define ALIGN __attribute__((aligned(64)))
+#endif
+
+int main(int argc, char *argv[]) {
+  int w = width();
+  assert(w <= 64);
+
+  float returned_result[64] ALIGN;
+  float vfloat[64] ALIGN;
+  double vdouble[64] ALIGN;
+  int vint[64] ALIGN;
+  int vint2[64] ALIGN;
+
+  const int device = 0;
+#if 0
+  const bool verbose = true;
+#else
+  const bool verbose = false;
+#endif
+
+  /*******************/
+  createContext(device, verbose);
+  /*******************/
+
+  devicePtr d_returned_result = deviceMalloc(64*sizeof(float));
+  devicePtr d_vfloat = deviceMalloc(64*sizeof(float));
+  devicePtr d_vdouble = deviceMalloc(64*sizeof(double));
+  devicePtr d_vint = deviceMalloc(64*sizeof(int));
+  devicePtr d_vint2 = deviceMalloc(64*sizeof(int));
+
+
+  for (int i = 0; i < 64; ++i) {
+    returned_result[i] = -1e20;
+    vfloat[i] = i+1;
+    vdouble[i] = i+1;
+    vint[i] = 2*(i+1);
+    vint2[i] = i+5;
+  }
+
+  memcpyH2D(d_returned_result, returned_result, 64*sizeof(float));
+  memcpyH2D(d_vfloat , vfloat, 64*sizeof(float));
+  memcpyH2D(d_vdouble , vdouble, 64*sizeof(double));
+  memcpyH2D(d_vint , vint, 64*sizeof(int));
+  memcpyH2D(d_vint2 , vint2, 64*sizeof(int));
+
+
+  float b = 5.;
+
+  const bool print_log = false;
+  const int nreg = 64;
+#if (TEST_SIG == 0)
+  void *args[] = {&d_returned_result};
+  CUDALaunch(NULL, "f_v", args, print_log, nreg);
+#elif (TEST_SIG == 1)
+  void *args[] = {&d_returned_result, &d_vfloat};
+  CUDALaunch(NULL, "f_f", args, print_log, nreg);
+#elif (TEST_SIG == 2)
+  void *args[] = {&d_returned_result, &d_vfloat, &b};
+  CUDALaunch(NULL, "f_fu", args, print_log, nreg);
+#elif (TEST_SIG == 3)
+  void *args[] = {&d_returned_result, &d_vfloat, &d_vint};
+  CUDALaunch(NULL, "f_fi", args, print_log, nreg);
+#elif (TEST_SIG == 4)
+  double num = 5.;
+  void *args[] = {&d_returned_result, &d_vdouble, &num};
+  CUDALaunch(NULL, "f_du", args, print_log, nreg);
+#elif (TEST_SIG == 5)
+  float num = 5.0f;
+  void *args[] = {&d_returned_result, &d_vdouble, &num};
+  CUDALaunch(NULL, "f_duf", args, print_log, nreg);
+#elif (TEST_SIG == 6)
+  void *args[] = {&d_returned_result, &d_vdouble, &d_vint2};
+  CUDALaunch(NULL, "f_di", args, print_log, nreg);
+#else
+#error "Unknown or unset TEST_SIG value"
+#endif
+
+  float expected_result[64];
+
+  memset(expected_result, 0, 64*sizeof(float));
+  devicePtr d_expected_result = deviceMalloc(64*sizeof(float));
+  memcpyH2D(d_expected_result, expected_result,
64*sizeof(float)); + void *res_args[] = {&d_expected_result}; + CUDALaunch(NULL, "result", res_args, print_log, nreg); + memcpyD2H(expected_result, d_expected_result, 64*sizeof(float)); + memcpyD2H(returned_result, d_returned_result, 64*sizeof(float)); + + deviceFree(d_returned_result); + deviceFree(d_vfloat); + deviceFree(d_vdouble); + deviceFree(d_vint); + deviceFree(d_vint2); + deviceFree(d_expected_result); + + /*******************/ + destroyContext(); + /*******************/ + + int errors = 0; + for (int i = 0; i < w; ++i) { + if (returned_result[i] != expected_result[i]) { +#ifdef EXPECT_FAILURE + // bingo, failed + return 1; +#else + printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n", + argv[0], i, returned_result[i], returned_result[i], + expected_result[i], expected_result[i]); + ++errors; +#endif // EXPECT_FAILURE + } + } + +#ifdef EXPECT_FAILURE + // Don't expect to get here + return 0; +#else + return errors > 0; +#endif +} diff --git a/test_static_nvptx.cpp b/test_static_nvptx.cpp new file mode 100644 index 00000000..0d56d06c --- /dev/null +++ b/test_static_nvptx.cpp @@ -0,0 +1,133 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#if defined(_WIN32) || defined(_WIN64) +#define ISPC_IS_WINDOWS +#elif defined(__linux__) +#define ISPC_IS_LINUX +#elif defined(__APPLE__) +#define ISPC_IS_APPLE +#endif + +#ifdef ISPC_IS_WINDOWS +#include +#endif // ISPC_IS_WINDOWS + +#include +#include +#include +#include +#ifdef ISPC_IS_LINUX +#include +#endif + +#include "ispc_malloc.h" + +#define N 32 +extern "C" { + int width() { return N; } + extern void f_v(float *result); + extern void f_f(float *result, float *a); + extern void f_fu(float *result, float *a, float b); + extern void f_fi(float *result, float *a, int *b); + extern void f_du(float *result, double *a, double b); + extern void f_duf(float *result, double *a, float b); + extern void f_di(float *result, double *a, int *b); + extern void result(float *val); +} + +int main(int argc, char *argv[]) { + int w = width(); + assert(w <= N); + + float *returned_result = new float[N*4]; + float *vfloat = new float[N*4]; + double *vdouble = new double[N*4]; + int *vint = new int[N*4]; + int *vint2 = new int[N*4]; + + for (int i = 0; i < N*4; ++i) { + returned_result[i] = -1e20; + vfloat[i] = i+1; + vdouble[i] = i+1; + vint[i] = 2*(i+1); + vint2[i] = i+5; + } + + float b = 5.; + +#if (TEST_SIG == 0) + f_v(returned_result); +#elif (TEST_SIG == 1) + f_f(returned_result, vfloat); +#elif (TEST_SIG == 2) + f_fu(returned_result, vfloat, b); +#elif (TEST_SIG == 3) + f_fi(returned_result, vfloat, vint); +#elif (TEST_SIG == 4) + f_du(returned_result, vdouble, 5.); +#elif (TEST_SIG == 5) + f_duf(returned_result, vdouble, 5.f); +#elif (TEST_SIG == 6) + f_di(returned_result, vdouble, vint2); +#else +#error "Unknown or unset TEST_SIG value" +#endif + + float *expected_result = new float[N]; + memset(expected_result, 0, N*sizeof(float)); + result(expected_result); + + int errors = 0; + for (int i = 0; i < w; ++i) { + if (returned_result[i] != expected_result[i]) + { +#ifdef EXPECT_FAILURE + // bingo, failed + return 1; +#else + printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n", + argv[0], i, returned_result[i], returned_result[i], + expected_result[i], expected_result[i]); + ++errors; +#endif // EXPECT_FAILURE + } + } + +#ifdef EXPECT_FAILURE + // Don't expect to get here + return 0; +#else + return errors > 0; +#endif +} diff --git a/tests/array-mixed-unif-vary-indexing-3.ispc b/tests/array-mixed-unif-vary-indexing-3.ispc index ab3a7a7c..c6623cf6 100644 --- a/tests/array-mixed-unif-vary-indexing-3.ispc +++ b/tests/array-mixed-unif-vary-indexing-3.ispc @@ -5,7 +5,13 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; assert(programCount <= 64); +#ifdef __NVPTX__ + uniform float * uniform xarr = uniform new uniform float[70*70]; + uniform float (* uniform x)[70] = (uniform float (* uniform)[70])xarr; +#define _SHMALLOC +#else uniform float x[70][70]; +#endif for (uniform int i = 0; i < 70; ++i) for (uniform int j = 0; j < 70; ++j) x[i][j] = 2+b-5; @@ -16,6 +22,10 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { else x[b-1][a-1] = 1; RET[programIndex] = x[4][a]; + +#ifdef _SHMALLOC + delete xarr; +#endif } export void result(uniform float RET[]) { diff --git a/tests/broadcast.ispc b/tests/broadcast.ispc index 1df835ae..6dfa1a00 100644 --- a/tests/broadcast.ispc +++ b/tests/broadcast.ispc @@ -3,7 +3,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { float a = 
aFOO[programIndex]; - float b = (programCount == 1) ? 3 : broadcast(a, 2); + float b = (programCount == 1) ? 4 : broadcast(a, 2); RET[programIndex] = b; } diff --git a/tests/c-test-64.ispc b/tests/c-test-64.ispc index 3429bf91..d2602bc7 100644 --- a/tests/c-test-64.ispc +++ b/tests/c-test-64.ispc @@ -19,8 +19,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 2; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 5; - RET[3] = RET[7] = RET[11] = RET[15] = 6; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 2; + RET[i+1] = 3; + RET[i+2] = 5; + RET[i+3] = 6; + } } diff --git a/tests/c-test-65.ispc b/tests/c-test-65.ispc index 9a363864..15df6367 100644 --- a/tests/c-test-65.ispc +++ b/tests/c-test-65.ispc @@ -18,6 +18,9 @@ export void f_fu(uniform float RET[4], uniform float aFOO[4], uniform float b) { export void result(uniform float RET[]) { RET[programIndex] = 3; - RET[0] = RET[4] = RET[8] = RET[12] = 1; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+3] = 29; + } } diff --git a/tests/c-test-66.ispc b/tests/c-test-66.ispc index a6c35dc7..22511604 100644 --- a/tests/c-test-66.ispc +++ b/tests/c-test-66.ispc @@ -19,6 +19,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { RET[programIndex] = 32; - RET[2] = RET[6] = RET[10] = RET[14] = 38; - RET[3] = RET[7] = RET[11] = RET[15] = 39; + for (int i = 0; i < programCount; i += 4) + { + RET[i+2] = 38; + RET[i+3] = 39; + } } diff --git a/tests/cfor-array-struct-gather.ispc b/tests/cfor-array-struct-gather.ispc index c320ad7c..d433b00d 100644 --- a/tests/cfor-array-struct-gather.ispc +++ b/tests/cfor-array-struct-gather.ispc @@ -4,14 +4,14 @@ export uniform int width() { return programCount; } struct Foo { - uniform float x[17]; + uniform float x[programCount+1]; }; export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; uniform Foo foo; uniform int i; - cfor (i = 0; i < 17; ++i) + cfor (i = 0; i < programCount+1; ++i) foo.x[i] = i; if ((int)a & 1) diff --git a/tests/cfor-gs-double-improve-multidim-1.ispc b/tests/cfor-gs-double-improve-multidim-1.ispc index ed672bd8..62124e2a 100644 --- a/tests/cfor-gs-double-improve-multidim-1.ispc +++ b/tests/cfor-gs-double-improve-multidim-1.ispc @@ -4,9 +4,9 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; - uniform double udx[25][25]; - cfor (uniform int i = 0; i < 25; ++i) - cfor (uniform int j = 0; j < 25; ++j) + uniform double udx[programCount+1][programCount+1]; + cfor (uniform int i = 0; i < programCount+1; ++i) + cfor (uniform int j = 0; j < programCount+1; ++j) udx[i][j] = 10*i+j; int x = 1; diff --git a/tests/cfor-gs-improve-multidim-1.ispc b/tests/cfor-gs-improve-multidim-1.ispc index b0893617..32482ced 100644 --- a/tests/cfor-gs-improve-multidim-1.ispc +++ b/tests/cfor-gs-improve-multidim-1.ispc @@ -5,9 +5,9 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; - uniform float udx[20][20]; - cfor (uniform int i = 0; i < 20; ++i) - cfor (uniform int j = 0; j < 20; ++j) + uniform float udx[programCount+1][programCount+1]; + cfor (uniform 
int i = 0; i < programCount+1; ++i) + cfor (uniform int j = 0; j < programCount+1; ++j) udx[i][j] = 100*i+j; int x = 1; diff --git a/tests/cfor-gs-improve-multidim-struct-1.ispc b/tests/cfor-gs-improve-multidim-struct-1.ispc index d599ceb9..0d682f9a 100644 --- a/tests/cfor-gs-improve-multidim-struct-1.ispc +++ b/tests/cfor-gs-improve-multidim-struct-1.ispc @@ -4,19 +4,27 @@ export uniform int width() { return programCount; } struct Foo { - uniform float udx[25][25]; + uniform float udx[32][32]; }; export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; +#ifndef __NVPTX__ uniform Foo f[5]; +#else /* too much shared memory allocated, nvcc fails to link */ + uniform Foo * uniform f = uniform new uniform Foo[5]; +#define _UNMALLOC +#endif cfor (uniform int i = 0; i < 5; ++i) - cfor (uniform int j = 0; j < 25; ++j) - cfor (uniform int k = 0; k < 25; ++k) + cfor (uniform int j = 0; j < 32; ++j) + cfor (uniform int k = 0; k < 32; ++k) f[i].udx[j][k] = 1000*i+100*j+k; int x = 1; RET[programIndex] = f[x+1].udx[b-4][programIndex]; +#ifdef _UNMALLOC + delete f; +#endif } export void result(uniform float RET[]) { RET[programIndex] = 2100 +programIndex; } diff --git a/tests/cfor-struct-gather-2.ispc b/tests/cfor-struct-gather-2.ispc index 7c615139..75da4a3f 100644 --- a/tests/cfor-struct-gather-2.ispc +++ b/tests/cfor-struct-gather-2.ispc @@ -13,9 +13,9 @@ float func(Foo foo[], int offset) { export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; - Foo foo[17]; + Foo foo[programCount+1]; uniform int i; - cfor (i = 0; i < 17; ++i) + cfor (i = 0; i < programCount+1; ++i) foo[i].f = i*a; RET[programIndex] = func(foo, (int)a); } diff --git a/tests/cfor-struct-gather-3.ispc b/tests/cfor-struct-gather-3.ispc index 7c615139..75da4a3f 100644 --- a/tests/cfor-struct-gather-3.ispc +++ b/tests/cfor-struct-gather-3.ispc @@ -13,9 +13,9 @@ float func(Foo foo[], int offset) { export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; - Foo foo[17]; + Foo foo[programCount+1]; uniform int i; - cfor (i = 0; i < 17; ++i) + cfor (i = 0; i < programCount+1; ++i) foo[i].f = i*a; RET[programIndex] = func(foo, (int)a); } diff --git a/tests/cfor-struct-gather.ispc b/tests/cfor-struct-gather.ispc index 49928a6b..9265da32 100644 --- a/tests/cfor-struct-gather.ispc +++ b/tests/cfor-struct-gather.ispc @@ -9,9 +9,9 @@ struct Foo { export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; - Foo foo[17]; + Foo foo[programCount+1]; uniform int i; - cfor (i = 0; i < 17; ++i) + cfor (i = 0; i < programCount+1; ++i) foo[i].f = i*a; RET[programIndex] = foo[(int)a].f; } diff --git a/tests/cfor-struct-test-114.ispc b/tests/cfor-struct-test-114.ispc index 0ea2f65a..e7b83a79 100644 --- a/tests/cfor-struct-test-114.ispc +++ b/tests/cfor-struct-test-114.ispc @@ -10,9 +10,9 @@ struct Foo { export void f_fi(uniform float RET[], uniform float aFOO[], uniform int bFOO[]) { float a = aFOO[programIndex]; int b = bFOO[programIndex]; - varying Foo myFoo[17]; + varying Foo myFoo[programCount+1]; uniform int i; - cfor (i = 0; i < 17; ++i) { + cfor (i = 0; i < programCount+1; ++i) { myFoo[i].x = i; myFoo[i].f = 2*i; } diff --git a/tests/cfor-test-134.ispc b/tests/cfor-test-134.ispc index 96493dff..0e8af645 100644 --- a/tests/cfor-test-134.ispc +++ b/tests/cfor-test-134.ispc @@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], 
uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 1; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 3; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+1] = 3; + RET[i+2] = 3; + RET[i+3] = 29; + } } diff --git a/tests/cfor-test-135.ispc b/tests/cfor-test-135.ispc index 5926ba30..9f17350e 100644 --- a/tests/cfor-test-135.ispc +++ b/tests/cfor-test-135.ispc @@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 1; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 3; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+1] = 3; + RET[i+2] = 3; + RET[i+3] = 29; + } } diff --git a/tests/cfor-test-136.ispc b/tests/cfor-test-136.ispc index 62834f67..e7ac9f75 100644 --- a/tests/cfor-test-136.ispc +++ b/tests/cfor-test-136.ispc @@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 1; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 3; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+1] = 3; + RET[i+2] = 3; + RET[i+3] = 29; + } } diff --git a/tests/cfor-test-64.ispc b/tests/cfor-test-64.ispc index 9c51c9b0..eb2cbec0 100644 --- a/tests/cfor-test-64.ispc +++ b/tests/cfor-test-64.ispc @@ -19,8 +19,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 2; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 5; - RET[3] = RET[7] = RET[11] = RET[15] = 6; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 2; + RET[i+1] = 3; + RET[i+2] = 5; + RET[i+3] = 6; + } } diff --git a/tests/cfor-test-65.ispc b/tests/cfor-test-65.ispc index a3c11c6d..28f82225 100644 --- a/tests/cfor-test-65.ispc +++ b/tests/cfor-test-65.ispc @@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 1; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 3; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+1] = 3; + RET[i+2] = 3; + RET[i+3] = 29; + } } diff --git a/tests/cfor-test-66.ispc b/tests/cfor-test-66.ispc index d3698ffe..e53d2b94 100644 --- a/tests/cfor-test-66.ispc +++ b/tests/cfor-test-66.ispc @@ -18,8 +18,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 32; - RET[1] = RET[5] = RET[9] = RET[13] = 32; - RET[2] = RET[6] = RET[10] = RET[14] = 38; - RET[3] = RET[7] = RET[11] = RET[15] = 39; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 32; + RET[i+1] = 32; + RET[i+2] = 38; + RET[i+3] = 39; + } } diff --git a/tests/cfor-unif-struct-test-114.ispc b/tests/cfor-unif-struct-test-114.ispc index 114e826d..59649fd0 100644 --- a/tests/cfor-unif-struct-test-114.ispc +++ b/tests/cfor-unif-struct-test-114.ispc @@ -8,9 +8,9 @@ struct Foo { }; export void f_fi(uniform 
float RET[], uniform float a[], uniform int bFOO[]) { int b = bFOO[programIndex]; - uniform struct Foo myFoo[17]; + uniform struct Foo myFoo[programCount+1]; uniform int i; - cfor (i = 0; i < 17; ++i) { + cfor (i = 0; i < programCount+1; ++i) { myFoo[i].x = i; myFoo[i].f = 2*i; } diff --git a/tests/const-fold-1.ispc b/tests/const-fold-1.ispc index fc4717ce..95b46cea 100644 --- a/tests/const-fold-1.ispc +++ b/tests/const-fold-1.ispc @@ -6,7 +6,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; uniform int x = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2); - static uniform int y = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2); + const static uniform int y = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2); RET[programIndex] = (x == y) ? 1. : 0.; } diff --git a/tests/const-fold-2.ispc b/tests/const-fold-2.ispc index 88743d2f..4e0ea5b6 100644 --- a/tests/const-fold-2.ispc +++ b/tests/const-fold-2.ispc @@ -6,7 +6,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; uniform int x = (170 >> 4) % 5; - static uniform int y = (170 >> 4) % 5; + const static uniform int y = (170 >> 4) % 5; RET[programIndex] = (x == y) ? 1. : 0.; } diff --git a/tests/const-fold-3.ispc b/tests/const-fold-3.ispc index cf5bc915..15c49e92 100644 --- a/tests/const-fold-3.ispc +++ b/tests/const-fold-3.ispc @@ -6,7 +6,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; uniform int x = (17 < 2) || (6 >= 5) && (20 >= 20); - static uniform int y = (17 < 2) || (6 >= 5) && (20 >= 20); + const static uniform int y = (17 < 2) || (6 >= 5) && (20 >= 20); RET[programIndex] = ((x!=0) == (y!=0)) ? 1. 
: 0.; } diff --git a/tests/launch-8.ispc b/tests/launch-8.ispc index eacba673..9855a963 100644 --- a/tests/launch-8.ispc +++ b/tests/launch-8.ispc @@ -2,22 +2,23 @@ export uniform int width() { return programCount; } -#define N0 10 +#define N0 12 #define N1 20 #define N2 50 static uniform float array[N2][N1][N0]; -task void x(const float f) { +task void x(const uniform float farray[]) { + const float f = farray[programIndex]; uniform int j; - assert(taskCount == (int32)N0*N1*N2); - assert(taskCount0 == (int32)N0); - assert(taskCount1 == (int32)N1); - assert(taskCount2 == (int32)N2); - assert(taskIndex == (int32)taskIndex0 + (int32)N0*(taskIndex1 +(int32) N1*taskIndex2)); - assert(taskIndex0 < (int32)N0); - assert(taskIndex1 < (int32)N1); - assert(taskIndex2 < (int32)N2); + assert(taskCount == (uniform int32)N0*N1*N2); + assert(taskCount0 == (uniform int32)N0); + assert(taskCount1 == (uniform int32)N1); + assert(taskCount2 == (uniform int32)N2); + assert(taskIndex == (uniform int32)taskIndex0 + (uniform int32)N0*(taskIndex1 +(uniform int32) N1*taskIndex2)); + assert(taskIndex0 < (uniform int32)N0); + assert(taskIndex1 < (uniform int32)N1); + assert(taskIndex2 < (uniform int32)N2); const uniform int i0 = taskIndex0; const uniform int i1 = taskIndex1; @@ -30,7 +31,7 @@ task void x(const float f) { array[i2][i1][i0] = i; } export void f_f(uniform float RET[], uniform float fFOO[]) { - float f = fFOO[programIndex]; + uniform float * uniform f = fFOO; launch[N2][N1][N0] x(f); sync; RET[programIndex] = array[N2-1][N1-1][N0-1]; @@ -38,5 +39,5 @@ export void f_f(uniform float RET[], uniform float fFOO[]) { export void result(uniform float RET[]) { - RET[programIndex] = 9999.000000; + RET[programIndex] = 11999.000000; } diff --git a/tests/launch-9.ispc b/tests/launch-9.ispc index 1952e8e7..dbbb9f80 100644 --- a/tests/launch-9.ispc +++ b/tests/launch-9.ispc @@ -2,12 +2,13 @@ export uniform int width() { return programCount; } -#define N0 10 +#define N0 12 #define N1 20 #define N2 50 static uniform float array[N2][N1][N0]; -task void x(const float f) { +task void x(const uniform float farray[]) { + const float f = farray[programIndex]; uniform int j; assert(taskCount == (int32)N0*N1*N2); @@ -30,13 +31,13 @@ task void x(const float f) { array[i2][i1][i0] = i; } export void f_f(uniform float RET[], uniform float fFOO[]) { - float f = fFOO[programIndex]; - launch[N0,N1,N2] x(f); + uniform float * uniform f = fFOO; + launch[N2][N1][N0] x(f); sync; RET[programIndex] = array[N2-1][N1-1][N0-1]; } export void result(uniform float RET[]) { - RET[programIndex] = 9999.000000; + RET[programIndex] = 11999.000000; } diff --git a/tests/operators2.ispc b/tests/operators2.ispc index b732b24a..daef4ec6 100644 --- a/tests/operators2.ispc +++ b/tests/operators2.ispc @@ -1,4 +1,9 @@ +#ifdef __NVPTX__ +uniform int _off[programCount]; +#define off _off[programIndex] +#else /* global varying data types are not yet supported with "nvptx" target */ int off; +#endif export uniform int width() { return programCount; } @@ -22,11 +27,11 @@ struct S operator/(struct S rr, struct S rv) { return c; } -struct S a; -struct S b; -struct S d; export void f_f(uniform float RET[], uniform float aFOO[]) { + struct S a; + struct S b; + struct S d; int T = programIndex; a.a = aFOO[programIndex]; b.a = -aFOO[programIndex]; diff --git a/tests/soa-16.ispc b/tests/soa-16.ispc index f23c39cb..3c6ff6c4 100644 --- a/tests/soa-16.ispc +++ b/tests/soa-16.ispc @@ -15,6 +15,16 @@ static void p(uniform float *uniform ptr) { } export void f_fu(uniform 
float RET[], uniform float aFOO[], uniform float b) { +#ifdef __NVPTX__ /* soa is converted to shared memory story for now, use smaller amount to check the test */ + soa<4> Point pts[10]; + for (uniform int i = 0; i < 40; ++i) { + pts[i].x = b*i; + pts[i].y[0] = 2*b*i; + pts[i].y[1] = 2*b*i+1; + pts[i].y[2] = 2*b*i+2; + pts[i].z = 3*b*i; + } +#else soa<4> Point pts[30]; for (uniform int i = 0; i < 120; ++i) { pts[i].x = b*i; @@ -23,6 +33,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { pts[i].y[2] = 2*b*i+2; pts[i].z = 3*b*i; } +#endif float a = aFOO[programIndex]; a *= -1; diff --git a/tests/soa-17.ispc b/tests/soa-17.ispc index f25b85bd..5dc9ea2f 100644 --- a/tests/soa-17.ispc +++ b/tests/soa-17.ispc @@ -16,6 +16,16 @@ static void p(uniform float *uniform ptr) { } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { +#ifdef __NVPTX__ /* soa is converted to shared memory story for now, use smaller amount to check the test */ + soa<4> Point pts[15]; + for (uniform int i = 0; i < 60; ++i) { + pts[i].x = b*i; + pts[i].y[0] = 2*b*i; + pts[i].y[1] = 2*b*i+1; + pts[i].y[2] = 2*b*i+2; + pts[i].z = 3*b*i; + } +#else soa<4> Point pts[40]; for (uniform int i = 0; i < 160; ++i) { pts[i].x = b*i; @@ -24,6 +34,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { pts[i].y[2] = 2*b*i+2; pts[i].z = 3*b*i; } +#endif float a = aFOO[programIndex]; a *= -1; diff --git a/tests/soa-22.ispc b/tests/soa-22.ispc index 60448694..ba3ffa0c 100644 --- a/tests/soa-22.ispc +++ b/tests/soa-22.ispc @@ -25,7 +25,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { } } } - + assert(programIndex < 80); RET[programIndex] = pts[programIndex].pts[programIndex % 3][programIndex % 4].z; } diff --git a/tests/soa-3.ispc b/tests/soa-3.ispc index 2cec07a5..86c7c57c 100644 --- a/tests/soa-3.ispc +++ b/tests/soa-3.ispc @@ -6,6 +6,17 @@ export uniform int width() { return programCount; } export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { float a = aFOO[programIndex]; +#ifdef __NVPTX__ /* soa is converted to shared memory story for now, use smaller amount to check the test */ + soa<8> Point pts[4]; +//CO uniform Point pts[80]; + foreach (i = 0 ... 40) { + pts[i].x = b*i; + pts[i].y[0] = 2*b*i; + pts[i].y[1] = 2*b*i+1; + pts[i].y[2] = 2*b*i+2; + pts[i].z = 3*b*i; + } +#else soa<8> Point pts[10]; //CO uniform Point pts[80]; foreach (i = 0 ... 
80) { @@ -15,6 +26,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { pts[i].y[2] = 2*b*i+2; pts[i].z = 3*b*i; } +#endif assert(programCount < 80); RET[programIndex] = pts[programIndex].y[2]; diff --git a/tests/test-134.ispc b/tests/test-134.ispc index baa8ec37..9d4d0e94 100644 --- a/tests/test-134.ispc +++ b/tests/test-134.ispc @@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 1; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 3; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+1] = 3; + RET[i+2] = 3; + RET[i+3] = 29; + } } diff --git a/tests/test-135.ispc b/tests/test-135.ispc index c350a524..bb9881e6 100644 --- a/tests/test-135.ispc +++ b/tests/test-135.ispc @@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 1; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 3; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+1] = 3; + RET[i+2] = 3; + RET[i+3] = 29; + } } diff --git a/tests/test-136.ispc b/tests/test-136.ispc index ab6c6b5b..098ac456 100644 --- a/tests/test-136.ispc +++ b/tests/test-136.ispc @@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) { export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 1; - RET[1] = RET[5] = RET[9] = RET[13] = 3; - RET[2] = RET[6] = RET[10] = RET[14] = 3; - RET[3] = RET[7] = RET[11] = RET[15] = 29; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 1; + RET[i+1] = 3; + RET[i+2] = 3; + RET[i+3] = 29; + } } diff --git a/tests/test-140.ispc b/tests/test-140.ispc index a983d528..997d558e 100644 --- a/tests/test-140.ispc +++ b/tests/test-140.ispc @@ -8,8 +8,11 @@ export void f_f(uniform float RET[], uniform float aFOO[]) { } export void result(uniform float RET[]) { - RET[0] = RET[4] = RET[8] = RET[12] = 0x0.0p+0; - RET[1] = RET[5] = RET[9] = RET[13] = 0x1.62e43p-1; - RET[2] = RET[6] = RET[10] = RET[14] = 0x1.193ea8p+0; - RET[3] = RET[7] = RET[11] = RET[15] = 0x1.62e43p+0; + for (int i = 0; i < programCount; i += 4) + { + RET[i+0] = 0x0.0p+0; + RET[i+1] = 0x1.62e43p-1; + RET[i+2] = 0x1.193ea8p+0; + RET[i+3] = 0x1.62e43p+0; + } } diff --git a/tests/test-141.ispc b/tests/test-141.ispc index b69be1fa..9045c081 100644 --- a/tests/test-141.ispc +++ b/tests/test-141.ispc @@ -5,7 +5,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; // calculation error 1e-6 is the same as in icc - RET[programIndex] = (exp(-log(1/a)) - a) < 1e-6 ? 1 : 0; + RET[programIndex] = (exp(-log(1/a)) - a)/a < 1e-6 ? 
1 : 0; } export void result(uniform float RET[4]) { diff --git a/tests/test-142.ispc b/tests/test-142.ispc index 18053402..9ab8ff9f 100644 --- a/tests/test-142.ispc +++ b/tests/test-142.ispc @@ -4,7 +4,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; - RET[programIndex] = round(a+.499999); + RET[programIndex] = round(a+.49999); } export void result(uniform float RET[]) { diff --git a/tests/test-144.ispc b/tests/test-144.ispc index 568bdc10..64e1817a 100644 --- a/tests/test-144.ispc +++ b/tests/test-144.ispc @@ -4,7 +4,7 @@ export uniform int width() { return programCount; } export void f_f(uniform float RET[], uniform float aFOO[]) { float a = aFOO[programIndex]; - RET[programIndex] = floor(a+.999999); + RET[programIndex] = floor(a+.99999); } export void result(uniform float RET[]) { diff --git a/tests/uniform-1.ispc b/tests/uniform-1.ispc new file mode 100644 index 00000000..dcf4eab0 --- /dev/null +++ b/tests/uniform-1.ispc @@ -0,0 +1,34 @@ + +export uniform int width() { return programCount; } + + +task void f_f_task(uniform float RET[], uniform float aFOO[]) { + uniform float val[programCount]; + for (uniform int i = 0; i < programCount; ++i) + val[i] = 0; + + foreach (i = 0 ... programCount) + val[i] += aFOO[programCount*taskIndex + i] - 1; + + uniform float sum = 0; + for (uniform int i = 0; i < programCount; ++i) + sum += val[i]; + + if (programIndex < 32/4) + RET[programCount/4*taskIndex + programIndex] = sum; +} + +export void f_f(uniform float RET[], uniform float aFOO[]) +{ + launch[4] f_f_task(RET, aFOO); +} +task void result_task(uniform float RET[]) +{ + const uniform float ret = reduce_add(programIndex + programCount*taskIndex); + if (programIndex < 32/4) + RET[programCount/4*taskIndex + programIndex] = ret; +} + +export void result(uniform float RET[]) { + launch[4] result_task(RET); +} diff --git a/type.cpp b/type.cpp index cf7ac85d..00795737 100644 --- a/type.cpp +++ b/type.cpp @@ -749,7 +749,7 @@ EnumType::Mangle() const { std::string ret; if (isConst) ret += "C"; ret += variability.MangleString(); - ret += std::string("enum[") + name + std::string("]"); + ret += std::string("enum_5B_") + name + std::string("_5C_"); return ret; } @@ -1420,7 +1420,7 @@ ArrayType::Mangle() const { sprintf(buf, "%d", numElements); else buf[0] = '\0'; - return s + "[" + buf + "]"; + return s + "_5B_" + buf + "_5C_"; } @@ -2058,12 +2058,12 @@ lMangleStruct(Variability variability, bool isConst, const std::string &name) { Assert(variability != Variability::Unbound); std::string ret; - ret += "s["; + ret += "s_5B_"; if (isConst) ret += "_c_"; ret += variability.MangleString(); - ret += name + std::string("]"); + ret += name + std::string("_5C_"); return ret; } @@ -3009,7 +3009,7 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const { llvmArgTypes.push_back(LLVMTypes::MaskType); std::vector callTypes; - if (isTask) { + if (isTask && g->target->getISA() != Target::NVPTX) { // Tasks take three arguments: a pointer to a struct that holds the // actual task arguments, the thread index, and the total number of // threads the tasks system has running. (Task arguments are