diff --git a/builtins.cpp b/builtins.cpp
index e3141725..91e9a16a 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -611,6 +611,14 @@ lSetInternalFunctions(llvm::Module *module) {
         "__vec4_add_int32",
         "__vselect_float",
         "__vselect_i32",
+        "__tid_x",
+        "__ctaid_x",
+        "__ctaid_y",
+        "__ctaid_z",
+        "__nctaid_x",
+        "__nctaid_y",
+        "__nctaid_z",
+        "__warpsize"
     };
 
     int count = sizeof(names) / sizeof(names[0]);
diff --git a/builtins/target-nvptx64.ll b/builtins/target-nvptx64.ll
index a728803f..3da3f747 100644
--- a/builtins/target-nvptx64.ll
+++ b/builtins/target-nvptx64.ll
@@ -56,6 +56,66 @@ gen_scatter(i64)
 gen_scatter(double)
 
 
+;;;;;;;;;;;;;;;;;;;;;
+;; PTX special-register accessors: tid.x, ctaid.{x,y,z}, nctaid.{x,y,z}, warpsize
+;;;;;;;;;;;;;;;;;;;;;
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() nounwind readnone
+declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() nounwind readnone
+
+define i32 @__tid_x() nounwind readnone alwaysinline
+{
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  ret i32 %tid
+}
+define i32 @__warpsize() nounwind readnone alwaysinline
+{
+  %ws = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  ret i32 %ws
+}
+
+
+define i32 @__ctaid_x() nounwind readnone alwaysinline
+{
+  %bid = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+  ret i32 %bid
+}
+define i32 @__ctaid_y() nounwind readnone alwaysinline
+{
+  %bid = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+  ret i32 %bid
+}
+define i32 @__ctaid_z() nounwind readnone alwaysinline
+{
+  %bid = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
+  ret i32 %bid
+}
+
+define i32 @__nctaid_x() nounwind readnone alwaysinline
+{
+  %nb = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
+  ret i32 %nb
+}
+define i32 @__nctaid_y() nounwind readnone alwaysinline
+{
+  %nb = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
+  ret i32 %nb
+}
+define i32 @__nctaid_z() nounwind readnone alwaysinline
+{
+  %nb = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
+  ret i32 %nb
+}
+
+;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;
+;;;;;;;;;;;;;;;;;;;;;
+
 define <1 x i8> @__vselect_i8(<1 x i8>, <1 x i8> , <1 x i32> %mask) nounwind readnone alwaysinline
 {
 ;  %mv = trunc <1 x i32> %mask to <1 x i8>
diff --git a/func.cpp b/func.cpp
index af2cc05a..d5b1f3f9 100644
--- a/func.cpp
+++ b/func.cpp
@@ -281,18 +281,35 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
 
             taskCountSym->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount");
             ctx->StoreInst(taskCount, taskCountSym->storagePtr);
 
+
+            /* nvptx map:
+             * programCount : llvm.nvvm.read.ptx.sreg.warpsize
+             * programIndex : llvm.ptx.read.laneid _or_ llvm.nvvm.read.ptx.sreg.tid.x & (programCount-1)
+             * taskIndex0   : llvm.nvvm.read.ptx.sreg.ctaid.x
+             * taskIndex1   : llvm.nvvm.read.ptx.sreg.ctaid.y
+             * taskIndex2   : llvm.nvvm.read.ptx.sreg.ctaid.z
+             * taskCount0   : llvm.nvvm.read.ptx.sreg.nctaid.x
+             * taskCount1   : llvm.nvvm.read.ptx.sreg.nctaid.y
+             * taskCount2   : llvm.nvvm.read.ptx.sreg.nctaid.z
+             */
+            // llvm.nvvm.read.ptx.sreg.ctaid.x
             taskIndexSym0->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex0");
             ctx->StoreInst(taskIndex0, taskIndexSym0->storagePtr);
+            // llvm.nvvm.read.ptx.sreg.ctaid.y
             taskIndexSym1->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex1");
             ctx->StoreInst(taskIndex1, taskIndexSym1->storagePtr);
+            // llvm.nvvm.read.ptx.sreg.ctaid.z
             taskIndexSym2->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskIndex2");
             ctx->StoreInst(taskIndex2, taskIndexSym2->storagePtr);
 
+            // llvm.nvvm.read.ptx.sreg.nctaid.x
             taskCountSym0->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount0");
             ctx->StoreInst(taskCount0, taskCountSym0->storagePtr);
+            // llvm.nvvm.read.ptx.sreg.nctaid.y
             taskCountSym1->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount1");
             ctx->StoreInst(taskCount1, taskCountSym1->storagePtr);
+            // llvm.nvvm.read.ptx.sreg.nctaid.z
             taskCountSym2->storagePtr = ctx->AllocaInst(LLVMTypes::Int32Type, "taskCount2");
             ctx->StoreInst(taskCount2, taskCountSym2->storagePtr);
         }
diff --git a/stdlib.ispc b/stdlib.ispc
index 9b02d0ba..dc2a76db 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -57,6 +57,54 @@
     #error Unknown value of ISPC_MASK_BITS
 #endif
 
+
+
+///////////////////////////////////////////////////////////////////////////
+// CUDA Specific primitives
+// Thin wrappers over the __ctaid_*, __nctaid_*, __tid_x and __warpsize builtins.
+#define CUDABLOCKSIZE 128
+__declspec(safe,cost0)
+  static inline uniform int blockIndex0()
+{
+  return __ctaid_x();
+}
+__declspec(safe,cost0)
+  static inline uniform int blockIndex1()
+{
+  return __ctaid_y();
+}
+__declspec(safe,cost0)
+  static inline uniform int blockIndex2()
+{
+  return __ctaid_z();
+}
+
+__declspec(safe,cost0)
+  static inline uniform int blockCount0()
+{
+  return __nctaid_x();
+}
+__declspec(safe,cost0)
+  static inline uniform int blockCount1()
+{
+  return __nctaid_y();
+}
+__declspec(safe,cost0)
+  static inline uniform int blockCount2()
+{
+  return __nctaid_z();
+}
+__declspec(safe,cost0)
+  static inline uniform int warpSize()
+{
+  return __warpsize();
+}
+__declspec(safe,cost0)
+  static inline uniform int laneIndex()
+{
+  return __tid_x() & (warpSize()-1);
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // Low level primitives
 