merged with nvptx
This commit is contained in:
81
builtins.cpp
81
builtins.cpp
@@ -338,11 +338,13 @@ lSetInternalFunctions(llvm::Module *module) {
|
|||||||
"__all",
|
"__all",
|
||||||
"__any",
|
"__any",
|
||||||
"__aos_to_soa3_float",
|
"__aos_to_soa3_float",
|
||||||
|
"__aos_to_soa3_float1",
|
||||||
"__aos_to_soa3_float16",
|
"__aos_to_soa3_float16",
|
||||||
"__aos_to_soa3_float4",
|
"__aos_to_soa3_float4",
|
||||||
"__aos_to_soa3_float8",
|
"__aos_to_soa3_float8",
|
||||||
"__aos_to_soa3_int32",
|
"__aos_to_soa3_int32",
|
||||||
"__aos_to_soa4_float",
|
"__aos_to_soa4_float",
|
||||||
|
"__aos_to_soa4_float1",
|
||||||
"__aos_to_soa4_float16",
|
"__aos_to_soa4_float16",
|
||||||
"__aos_to_soa4_float4",
|
"__aos_to_soa4_float4",
|
||||||
"__aos_to_soa4_float8",
|
"__aos_to_soa4_float8",
|
||||||
@@ -351,10 +353,14 @@ lSetInternalFunctions(llvm::Module *module) {
|
|||||||
"__atomic_add_int64_global",
|
"__atomic_add_int64_global",
|
||||||
"__atomic_add_uniform_int32_global",
|
"__atomic_add_uniform_int32_global",
|
||||||
"__atomic_add_uniform_int64_global",
|
"__atomic_add_uniform_int64_global",
|
||||||
|
"__atomic_add_varying_int32_global",
|
||||||
|
"__atomic_add_varying_int64_global",
|
||||||
"__atomic_and_int32_global",
|
"__atomic_and_int32_global",
|
||||||
"__atomic_and_int64_global",
|
"__atomic_and_int64_global",
|
||||||
"__atomic_and_uniform_int32_global",
|
"__atomic_and_uniform_int32_global",
|
||||||
"__atomic_and_uniform_int64_global",
|
"__atomic_and_uniform_int64_global",
|
||||||
|
"__atomic_and_varying_int32_global",
|
||||||
|
"__atomic_and_varying_int64_global",
|
||||||
"__atomic_compare_exchange_double_global",
|
"__atomic_compare_exchange_double_global",
|
||||||
"__atomic_compare_exchange_float_global",
|
"__atomic_compare_exchange_float_global",
|
||||||
"__atomic_compare_exchange_int32_global",
|
"__atomic_compare_exchange_int32_global",
|
||||||
@@ -363,18 +369,30 @@ lSetInternalFunctions(llvm::Module *module) {
|
|||||||
"__atomic_compare_exchange_uniform_float_global",
|
"__atomic_compare_exchange_uniform_float_global",
|
||||||
"__atomic_compare_exchange_uniform_int32_global",
|
"__atomic_compare_exchange_uniform_int32_global",
|
||||||
"__atomic_compare_exchange_uniform_int64_global",
|
"__atomic_compare_exchange_uniform_int64_global",
|
||||||
|
"__atomic_compare_exchange_varying_double_global",
|
||||||
|
"__atomic_compare_exchange_varying_float_global",
|
||||||
|
"__atomic_compare_exchange_varying_int32_global",
|
||||||
|
"__atomic_compare_exchange_varying_int64_global",
|
||||||
"__atomic_max_uniform_int32_global",
|
"__atomic_max_uniform_int32_global",
|
||||||
"__atomic_max_uniform_int64_global",
|
"__atomic_max_uniform_int64_global",
|
||||||
"__atomic_min_uniform_int32_global",
|
"__atomic_min_uniform_int32_global",
|
||||||
"__atomic_min_uniform_int64_global",
|
"__atomic_min_uniform_int64_global",
|
||||||
|
"__atomic_max_varying_int32_global",
|
||||||
|
"__atomic_max_varying_int64_global",
|
||||||
|
"__atomic_min_varying_int32_global",
|
||||||
|
"__atomic_min_varying_int64_global",
|
||||||
"__atomic_or_int32_global",
|
"__atomic_or_int32_global",
|
||||||
"__atomic_or_int64_global",
|
"__atomic_or_int64_global",
|
||||||
"__atomic_or_uniform_int32_global",
|
"__atomic_or_uniform_int32_global",
|
||||||
"__atomic_or_uniform_int64_global",
|
"__atomic_or_uniform_int64_global",
|
||||||
|
"__atomic_or_varying_int32_global",
|
||||||
|
"__atomic_or_varying_int64_global",
|
||||||
"__atomic_sub_int32_global",
|
"__atomic_sub_int32_global",
|
||||||
"__atomic_sub_int64_global",
|
"__atomic_sub_int64_global",
|
||||||
"__atomic_sub_uniform_int32_global",
|
"__atomic_sub_uniform_int32_global",
|
||||||
"__atomic_sub_uniform_int64_global",
|
"__atomic_sub_uniform_int64_global",
|
||||||
|
"__atomic_sub_varying_int32_global",
|
||||||
|
"__atomic_sub_varying_int64_global",
|
||||||
"__atomic_swap_double_global",
|
"__atomic_swap_double_global",
|
||||||
"__atomic_swap_float_global",
|
"__atomic_swap_float_global",
|
||||||
"__atomic_swap_int32_global",
|
"__atomic_swap_int32_global",
|
||||||
@@ -383,14 +401,28 @@ lSetInternalFunctions(llvm::Module *module) {
|
|||||||
"__atomic_swap_uniform_float_global",
|
"__atomic_swap_uniform_float_global",
|
||||||
"__atomic_swap_uniform_int32_global",
|
"__atomic_swap_uniform_int32_global",
|
||||||
"__atomic_swap_uniform_int64_global",
|
"__atomic_swap_uniform_int64_global",
|
||||||
|
"__atomic_swap_varying_double_global",
|
||||||
|
"__atomic_swap_varying_float_global",
|
||||||
|
"__atomic_swap_varying_int32_global",
|
||||||
|
"__atomic_swap_varying_int64_global",
|
||||||
"__atomic_umax_uniform_uint32_global",
|
"__atomic_umax_uniform_uint32_global",
|
||||||
"__atomic_umax_uniform_uint64_global",
|
"__atomic_umax_uniform_uint64_global",
|
||||||
"__atomic_umin_uniform_uint32_global",
|
"__atomic_umin_uniform_uint32_global",
|
||||||
"__atomic_umin_uniform_uint64_global",
|
"__atomic_umin_uniform_uint64_global",
|
||||||
|
"__atomic_umax_varying_uint32_global",
|
||||||
|
"__atomic_umax_varying_uint64_global",
|
||||||
|
"__atomic_umin_varying_uint32_global",
|
||||||
|
"__atomic_umin_varying_uint64_global",
|
||||||
"__atomic_xor_int32_global",
|
"__atomic_xor_int32_global",
|
||||||
"__atomic_xor_int64_global",
|
"__atomic_xor_int64_global",
|
||||||
"__atomic_xor_uniform_int32_global",
|
"__atomic_xor_uniform_int32_global",
|
||||||
"__atomic_xor_uniform_int64_global",
|
"__atomic_xor_uniform_int64_global",
|
||||||
|
"__atomic_xor_uniform_int32_global",
|
||||||
|
"__atomic_xor_uniform_int64_global",
|
||||||
|
"__atomic_xor_varying_int32_global",
|
||||||
|
"__atomic_xor_varying_int64_global",
|
||||||
|
"__atomic_xor_varying_int32_global",
|
||||||
|
"__atomic_xor_varying_int64_global",
|
||||||
"__broadcast_double",
|
"__broadcast_double",
|
||||||
"__broadcast_float",
|
"__broadcast_float",
|
||||||
"__broadcast_i16",
|
"__broadcast_i16",
|
||||||
@@ -413,6 +445,7 @@ lSetInternalFunctions(llvm::Module *module) {
|
|||||||
"__do_assert_uniform",
|
"__do_assert_uniform",
|
||||||
"__do_assert_varying",
|
"__do_assert_varying",
|
||||||
"__do_print",
|
"__do_print",
|
||||||
|
"__do_print_nvptx",
|
||||||
"__doublebits_uniform_int64",
|
"__doublebits_uniform_int64",
|
||||||
"__doublebits_varying_int64",
|
"__doublebits_varying_int64",
|
||||||
"__exclusive_scan_add_double",
|
"__exclusive_scan_add_double",
|
||||||
@@ -427,6 +460,8 @@ lSetInternalFunctions(llvm::Module *module) {
|
|||||||
"__extract_int32",
|
"__extract_int32",
|
||||||
"__extract_int64",
|
"__extract_int64",
|
||||||
"__extract_int8",
|
"__extract_int8",
|
||||||
|
"__extract_float",
|
||||||
|
"__extract_double",
|
||||||
"__fastmath",
|
"__fastmath",
|
||||||
"__float_to_half_uniform",
|
"__float_to_half_uniform",
|
||||||
"__float_to_half_varying",
|
"__float_to_half_varying",
|
||||||
@@ -443,6 +478,8 @@ lSetInternalFunctions(llvm::Module *module) {
|
|||||||
"__insert_int32",
|
"__insert_int32",
|
||||||
"__insert_int64",
|
"__insert_int64",
|
||||||
"__insert_int8",
|
"__insert_int8",
|
||||||
|
"__insert_float",
|
||||||
|
"__insert_double",
|
||||||
"__intbits_uniform_double",
|
"__intbits_uniform_double",
|
||||||
"__intbits_uniform_float",
|
"__intbits_uniform_float",
|
||||||
"__intbits_varying_double",
|
"__intbits_varying_double",
|
||||||
@@ -479,6 +516,7 @@ lSetInternalFunctions(llvm::Module *module) {
|
|||||||
"__min_varying_uint32",
|
"__min_varying_uint32",
|
||||||
"__min_varying_uint64",
|
"__min_varying_uint64",
|
||||||
"__movmsk",
|
"__movmsk",
|
||||||
|
"__movmsk_ptx",
|
||||||
"__new_uniform_32rt",
|
"__new_uniform_32rt",
|
||||||
"__new_uniform_64rt",
|
"__new_uniform_64rt",
|
||||||
"__new_varying32_32rt",
|
"__new_varying32_32rt",
|
||||||
@@ -560,11 +598,13 @@ lSetInternalFunctions(llvm::Module *module) {
|
|||||||
"__shuffle_i64",
|
"__shuffle_i64",
|
||||||
"__shuffle_i8",
|
"__shuffle_i8",
|
||||||
"__soa_to_aos3_float",
|
"__soa_to_aos3_float",
|
||||||
|
"__soa_to_aos3_float1",
|
||||||
"__soa_to_aos3_float16",
|
"__soa_to_aos3_float16",
|
||||||
"__soa_to_aos3_float4",
|
"__soa_to_aos3_float4",
|
||||||
"__soa_to_aos3_float8",
|
"__soa_to_aos3_float8",
|
||||||
"__soa_to_aos3_int32",
|
"__soa_to_aos3_int32",
|
||||||
"__soa_to_aos4_float",
|
"__soa_to_aos4_float",
|
||||||
|
"__soa_to_aos4_float1",
|
||||||
"__soa_to_aos4_float16",
|
"__soa_to_aos4_float16",
|
||||||
"__soa_to_aos4_float4",
|
"__soa_to_aos4_float4",
|
||||||
"__soa_to_aos4_float8",
|
"__soa_to_aos4_float8",
|
||||||
@@ -622,6 +662,24 @@ lSetInternalFunctions(llvm::Module *module) {
|
|||||||
"__vec4_add_int32",
|
"__vec4_add_int32",
|
||||||
"__vselect_float",
|
"__vselect_float",
|
||||||
"__vselect_i32",
|
"__vselect_i32",
|
||||||
|
"__program_index",
|
||||||
|
"__program_count",
|
||||||
|
"__warp_index",
|
||||||
|
"__task_index0",
|
||||||
|
"__task_index1",
|
||||||
|
"__task_index2",
|
||||||
|
"__task_index",
|
||||||
|
"__task_count0",
|
||||||
|
"__task_count1",
|
||||||
|
"__task_count2",
|
||||||
|
"__task_count",
|
||||||
|
"__cvt_loc2gen",
|
||||||
|
"__cvt_loc2gen_var",
|
||||||
|
"__cvt_const2gen",
|
||||||
|
"__puts_nvptx",
|
||||||
|
"ISPCAlloc",
|
||||||
|
"ISPCLaunch",
|
||||||
|
"ISPCSync",
|
||||||
};
|
};
|
||||||
|
|
||||||
int count = sizeof(names) / sizeof(names[0]);
|
int count = sizeof(names) / sizeof(names[0]);
|
||||||
@@ -694,6 +752,7 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
|
|||||||
g->target->getISA() != Target::NEON16 &&
|
g->target->getISA() != Target::NEON16 &&
|
||||||
g->target->getISA() != Target::NEON8)
|
g->target->getISA() != Target::NEON8)
|
||||||
#endif // !__arm__
|
#endif // !__arm__
|
||||||
|
if (g->target->getISA() != Target::NVPTX)
|
||||||
{
|
{
|
||||||
Assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
|
Assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
|
||||||
mTriple.getArch() == bcTriple.getArch());
|
mTriple.getArch() == bcTriple.getArch());
|
||||||
@@ -855,7 +914,17 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
|||||||
// Next, add the target's custom implementations of the various needed
|
// Next, add the target's custom implementations of the various needed
|
||||||
// builtin functions (e.g. __masked_store_32(), etc).
|
// builtin functions (e.g. __masked_store_32(), etc).
|
||||||
switch (g->target->getISA()) {
|
switch (g->target->getISA()) {
|
||||||
|
case Target::NVPTX:
|
||||||
|
{
|
||||||
|
if (runtime32) {
|
||||||
|
fprintf(stderr, "Unforetunatly 32bit targets are supported at the moment .. \n");
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
EXPORT_MODULE(builtins_bitcode_nvptx_64bit);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
};
|
||||||
#ifdef ISPC_ARM_ENABLED
|
#ifdef ISPC_ARM_ENABLED
|
||||||
case Target::NEON8: {
|
case Target::NEON8: {
|
||||||
if (runtime32) {
|
if (runtime32) {
|
||||||
@@ -1125,7 +1194,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
|||||||
}
|
}
|
||||||
|
|
||||||
// define the 'programCount' builtin variable
|
// define the 'programCount' builtin variable
|
||||||
|
if (g->target->getISA() != Target::NVPTX)
|
||||||
|
{
|
||||||
lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable);
|
lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
lDefineConstantInt("programCount", 32, module, symbolTable);
|
||||||
|
}
|
||||||
|
|
||||||
// define the 'programIndex' builtin
|
// define the 'programIndex' builtin
|
||||||
lDefineProgramIndex(module, symbolTable);
|
lDefineProgramIndex(module, symbolTable);
|
||||||
@@ -1155,6 +1231,9 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
|||||||
lDefineConstantInt("__have_native_rcpd", g->target->hasRcpd(),
|
lDefineConstantInt("__have_native_rcpd", g->target->hasRcpd(),
|
||||||
module, symbolTable);
|
module, symbolTable);
|
||||||
|
|
||||||
|
lDefineConstantInt("__is_nvptx_target", (int)(g->target->getISA() == Target::NVPTX),
|
||||||
|
module, symbolTable);
|
||||||
|
|
||||||
if (g->forceAlignment != -1) {
|
if (g->forceAlignment != -1) {
|
||||||
llvm::GlobalVariable *alignment = module->getGlobalVariable("memory_alignment", true);
|
llvm::GlobalVariable *alignment = module->getGlobalVariable("memory_alignment", true);
|
||||||
alignment->setInitializer(LLVMInt32(g->forceAlignment));
|
alignment->setInitializer(LLVMInt32(g->forceAlignment));
|
||||||
|
|||||||
130
builtins/__do_print_nvptx.cu
Normal file
130
builtins/__do_print_nvptx.cu
Normal file
@@ -0,0 +1,130 @@
|
|||||||
|
#include <cstdio>
|
||||||
|
|
||||||
|
#define PRINT_BUF_SIZE 4096
|
||||||
|
#define uint64_t unsigned long long
|
||||||
|
|
||||||
|
static __device__ size_t d_strlen(const char *str)
|
||||||
|
{
|
||||||
|
const char *s;
|
||||||
|
|
||||||
|
for (s = str; *s; ++s)
|
||||||
|
;
|
||||||
|
return (s - str);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ char* d_strncat(char *dest, const char *src, size_t n)
|
||||||
|
{
|
||||||
|
size_t dest_len = d_strlen(dest);
|
||||||
|
size_t i;
|
||||||
|
|
||||||
|
for (i = 0 ; i < n && src[i] != '\0' ; i++)
|
||||||
|
dest[dest_len + i] = src[i];
|
||||||
|
dest[dest_len + i] = '\0';
|
||||||
|
|
||||||
|
return dest;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define APPEND(str) \
|
||||||
|
do { \
|
||||||
|
int offset = bufp - &printString[0]; \
|
||||||
|
*bufp = '\0'; \
|
||||||
|
d_strncat(bufp, str, PRINT_BUF_SIZE-offset); \
|
||||||
|
bufp += d_strlen(str); \
|
||||||
|
if (bufp >= &printString[PRINT_BUF_SIZE]) \
|
||||||
|
goto done; \
|
||||||
|
} while (0) /* eat semicolon */
|
||||||
|
|
||||||
|
|
||||||
|
#define PRINT_SCALAR(fmt, type) \
|
||||||
|
sprintf(tmpBuf, fmt, *((type *)ptr)); \
|
||||||
|
APPEND(tmpBuf); \
|
||||||
|
break
|
||||||
|
|
||||||
|
#define PRINT_VECTOR(fmt, type) \
|
||||||
|
*bufp++ = '['; \
|
||||||
|
if (bufp == &printString[PRINT_BUF_SIZE]) break; \
|
||||||
|
for (int i = 0; i < width; ++i) { \
|
||||||
|
/* only print the value if the current lane is executing */ \
|
||||||
|
type val0 = *((type*)ptr); \
|
||||||
|
type val = val0; \
|
||||||
|
if (mask & (1ull<<i)) \
|
||||||
|
sprintf(tmpBuf, fmt, val); \
|
||||||
|
else \
|
||||||
|
sprintf(tmpBuf, "(( * )) "); \
|
||||||
|
APPEND(tmpBuf); \
|
||||||
|
*bufp++ = (i != width-1 ? ',' : ']'); \
|
||||||
|
} \
|
||||||
|
break
|
||||||
|
|
||||||
|
extern "C"
|
||||||
|
__device__ void __do_print_nvptx(const char *format, const char *types, int width, uint64_t mask,
|
||||||
|
void **args) {
|
||||||
|
char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
|
||||||
|
char *bufp = &printString[0];
|
||||||
|
char tmpBuf[256];
|
||||||
|
const char trueBuf[] = "true";
|
||||||
|
const char falseBuf[] = "false";
|
||||||
|
|
||||||
|
int argCount = 0;
|
||||||
|
while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
|
||||||
|
// Format strings are just single percent signs.
|
||||||
|
if (*format != '%') {
|
||||||
|
*bufp++ = *format;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if (*types) {
|
||||||
|
void *ptr = args[argCount++];
|
||||||
|
// Based on the encoding in the types string, cast the
|
||||||
|
// value appropriately and print it with a reasonable
|
||||||
|
// printf() formatting string.
|
||||||
|
switch (*types) {
|
||||||
|
case 'b': {
|
||||||
|
const char *tmpBuf1 = *((bool *)ptr) ? trueBuf : falseBuf;
|
||||||
|
APPEND(tmpBuf1);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 'B': {
|
||||||
|
*bufp++ = '[';
|
||||||
|
if (bufp == &printString[PRINT_BUF_SIZE])
|
||||||
|
break;
|
||||||
|
for (int i = 0; i < width; ++i) {
|
||||||
|
bool val0 = *((bool*)ptr);
|
||||||
|
bool val = val0; \
|
||||||
|
if (mask & (1ull << i)) {
|
||||||
|
const char *tmpBuf1 = val ? trueBuf : falseBuf;
|
||||||
|
APPEND(tmpBuf1);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
APPEND("_________");
|
||||||
|
*bufp++ = (i != width-1) ? ',' : ']';
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 'i': PRINT_SCALAR("%d", int);
|
||||||
|
case 'I': PRINT_VECTOR("%d", int);
|
||||||
|
case 'u': PRINT_SCALAR("%u", unsigned int);
|
||||||
|
case 'U': PRINT_VECTOR("%u", unsigned int);
|
||||||
|
case 'f': PRINT_SCALAR("%f", float);
|
||||||
|
case 'F': PRINT_VECTOR("%f", float);
|
||||||
|
case 'l': PRINT_SCALAR("%lld", long long);
|
||||||
|
case 'L': PRINT_VECTOR("%lld", long long);
|
||||||
|
case 'v': PRINT_SCALAR("%llu", unsigned long long);
|
||||||
|
case 'V': PRINT_VECTOR("%llu", unsigned long long);
|
||||||
|
case 'd': PRINT_SCALAR("%f", double);
|
||||||
|
case 'D': PRINT_VECTOR("%f", double);
|
||||||
|
case 'p': PRINT_SCALAR("%p", void *);
|
||||||
|
case 'P': PRINT_VECTOR("%p", void *);
|
||||||
|
default:
|
||||||
|
APPEND("UNKNOWN TYPE ");
|
||||||
|
*bufp++ = *types;
|
||||||
|
}
|
||||||
|
++types;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
++format;
|
||||||
|
}
|
||||||
|
|
||||||
|
done:
|
||||||
|
*bufp = '\n'; bufp++;
|
||||||
|
*bufp = '\0';
|
||||||
|
}
|
||||||
@@ -185,6 +185,81 @@ void __do_print(const char *format, const char *types, int width, uint64_t mask,
|
|||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* this is print for PTX target only */
|
||||||
|
int __puts_nvptx(const char *);
|
||||||
|
void __do_print_nvptx(const char *format, const char *types, int width, uint64_t mask,
|
||||||
|
void **args) {
|
||||||
|
#if 0
|
||||||
|
char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
|
||||||
|
char *bufp = &printString[0];
|
||||||
|
char tmpBuf[256];
|
||||||
|
|
||||||
|
int argCount = 0;
|
||||||
|
while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
|
||||||
|
// Format strings are just single percent signs.
|
||||||
|
if (*format != '%') {
|
||||||
|
*bufp++ = *format;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if (*types) {
|
||||||
|
void *ptr = args[argCount++];
|
||||||
|
// Based on the encoding in the types string, cast the
|
||||||
|
// value appropriately and print it with a reasonable
|
||||||
|
// printf() formatting string.
|
||||||
|
switch (*types) {
|
||||||
|
case 'b': {
|
||||||
|
sprintf(tmpBuf, "%s", *((Bool *)ptr) ? "true" : "false");
|
||||||
|
APPEND(tmpBuf);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 'B': {
|
||||||
|
*bufp++ = '[';
|
||||||
|
if (bufp == &printString[PRINT_BUF_SIZE])
|
||||||
|
break;
|
||||||
|
for (int i = 0; i < width; ++i) {
|
||||||
|
if (mask & (1ull << i)) {
|
||||||
|
sprintf(tmpBuf, "%s", ((Bool *)ptr)[i] ? "true" : "false");
|
||||||
|
APPEND(tmpBuf);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
APPEND("_________");
|
||||||
|
*bufp++ = (i != width-1) ? ',' : ']';
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 'i': PRINT_SCALAR("%d", int);
|
||||||
|
case 'I': PRINT_VECTOR("%d", int);
|
||||||
|
case 'u': PRINT_SCALAR("%u", unsigned int);
|
||||||
|
case 'U': PRINT_VECTOR("%u", unsigned int);
|
||||||
|
case 'f': PRINT_SCALAR("%f", float);
|
||||||
|
case 'F': PRINT_VECTOR("%f", float);
|
||||||
|
case 'l': PRINT_SCALAR("%lld", long long);
|
||||||
|
case 'L': PRINT_VECTOR("%lld", long long);
|
||||||
|
case 'v': PRINT_SCALAR("%llu", unsigned long long);
|
||||||
|
case 'V': PRINT_VECTOR("%llu", unsigned long long);
|
||||||
|
case 'd': PRINT_SCALAR("%f", double);
|
||||||
|
case 'D': PRINT_VECTOR("%f", double);
|
||||||
|
case 'p': PRINT_SCALAR("%p", void *);
|
||||||
|
case 'P': PRINT_VECTOR("%p", void *);
|
||||||
|
default:
|
||||||
|
APPEND("UNKNOWN TYPE ");
|
||||||
|
*bufp++ = *types;
|
||||||
|
}
|
||||||
|
++types;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
++format;
|
||||||
|
}
|
||||||
|
|
||||||
|
done:
|
||||||
|
*bufp = '\n'; bufp++;
|
||||||
|
*bufp = '\0';
|
||||||
|
__puts_nvptx(printString);
|
||||||
|
#else
|
||||||
|
__puts_nvptx("---nvptx printing is not support---\n");
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
int __num_cores() {
|
int __num_cores() {
|
||||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||||
|
|||||||
@@ -288,4 +288,5 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
|||||||
;; int8/int16 builtins
|
;; int8/int16 builtins
|
||||||
|
|
||||||
define_avgs()
|
define_avgs()
|
||||||
|
declare_nvptx()
|
||||||
|
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ packed_load_and_store()
|
|||||||
scans()
|
scans()
|
||||||
int64minmax()
|
int64minmax()
|
||||||
aossoa()
|
aossoa()
|
||||||
|
declare_nvptx()
|
||||||
|
|
||||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||||
;; masked store
|
;; masked store
|
||||||
|
|||||||
@@ -392,4 +392,4 @@ declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind
|
|||||||
;; int8/int16 builtins
|
;; int8/int16 builtins
|
||||||
|
|
||||||
define_avgs()
|
define_avgs()
|
||||||
|
declare_nvptx()
|
||||||
|
|||||||
@@ -344,3 +344,4 @@ packed_load_and_store(4)
|
|||||||
;; prefetch
|
;; prefetch
|
||||||
|
|
||||||
define_prefetches()
|
define_prefetches()
|
||||||
|
declare_nvptx()
|
||||||
|
|||||||
2235
builtins/target-nvptx.ll
Normal file
2235
builtins/target-nvptx.ll
Normal file
File diff suppressed because it is too large
Load Diff
@@ -274,3 +274,4 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
|
|||||||
|
|
||||||
define_avgs()
|
define_avgs()
|
||||||
|
|
||||||
|
declare_nvptx()
|
||||||
|
|||||||
@@ -278,3 +278,5 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
|||||||
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
||||||
ret i64 %call
|
ret i64 %call
|
||||||
}
|
}
|
||||||
|
|
||||||
|
declare_nvptx()
|
||||||
|
|||||||
3417
builtins/util-nvptx.m4
Normal file
3417
builtins/util-nvptx.m4
Normal file
File diff suppressed because it is too large
Load Diff
@@ -4541,3 +4541,60 @@ define(`rcpd_decl', `
|
|||||||
declare double @__rcp_uniform_double(double)
|
declare double @__rcp_uniform_double(double)
|
||||||
declare <WIDTH x double> @__rcp_varying_double(<WIDTH x double>)
|
declare <WIDTH x double> @__rcp_varying_double(<WIDTH x double>)
|
||||||
')
|
')
|
||||||
|
|
||||||
|
define(`declare_nvptx',
|
||||||
|
`
|
||||||
|
declare i32 @__program_index() nounwind readnone alwaysinline
|
||||||
|
declare i32 @__program_count() nounwind readnone alwaysinline
|
||||||
|
declare i32 @__warp_index() nounwind readnone alwaysinline
|
||||||
|
declare i32 @__task_index0() nounwind readnone alwaysinline
|
||||||
|
declare i32 @__task_index1() nounwind readnone alwaysinline
|
||||||
|
declare i32 @__task_index2() nounwind readnone alwaysinline
|
||||||
|
declare i32 @__task_index() nounwind readnone alwaysinline
|
||||||
|
declare i32 @__task_count0() nounwind readnone alwaysinline
|
||||||
|
declare i32 @__task_count1() nounwind readnone alwaysinline
|
||||||
|
declare i32 @__task_count2() nounwind readnone alwaysinline
|
||||||
|
declare i32 @__task_count() nounwind readnone alwaysinline
|
||||||
|
declare i64* @__cvt_loc2gen(i64 addrspace(3)*) nounwind readnone alwaysinline
|
||||||
|
declare i64* @__cvt_const2gen(i64 addrspace(4)*) nounwind readnone alwaysinline
|
||||||
|
declare i64* @__cvt_loc2gen_var(i64 addrspace(3)*) nounwind readnone alwaysinline
|
||||||
|
declare i64 @__movmsk_ptx(<WIDTH x i1>) nounwind readnone alwaysinline;
|
||||||
|
')
|
||||||
|
|
||||||
|
define(`global_atomic_varying',`
|
||||||
|
declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline
|
||||||
|
')
|
||||||
|
|
||||||
|
define(`global_atomic_cas_varying',`
|
||||||
|
declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %cmp, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline
|
||||||
|
')
|
||||||
|
|
||||||
|
global_atomic_cas_varying(WIDTH, compare_exchange, i32, int32)
|
||||||
|
global_atomic_cas_varying(WIDTH, compare_exchange, i64, int64)
|
||||||
|
global_atomic_cas_varying(WIDTH, compare_exchange, float, float)
|
||||||
|
global_atomic_cas_varying(WIDTH, compare_exchange, double, double)
|
||||||
|
|
||||||
|
global_atomic_varying(WIDTH, swap, i32, int32)
|
||||||
|
global_atomic_varying(WIDTH, swap, i64, int64)
|
||||||
|
global_atomic_varying(WIDTH, swap, float, float)
|
||||||
|
global_atomic_varying(WIDTH, swap, double, double)
|
||||||
|
|
||||||
|
global_atomic_varying(WIDTH, add, i32, int32)
|
||||||
|
global_atomic_varying(WIDTH, sub, i32, int32)
|
||||||
|
global_atomic_varying(WIDTH, and, i32, int32)
|
||||||
|
global_atomic_varying(WIDTH, or, i32, int32)
|
||||||
|
global_atomic_varying(WIDTH, xor, i32, int32)
|
||||||
|
global_atomic_varying(WIDTH, min, i32, int32)
|
||||||
|
global_atomic_varying(WIDTH, max, i32, int32)
|
||||||
|
global_atomic_varying(WIDTH, umin, i32, uint32)
|
||||||
|
global_atomic_varying(WIDTH, umax, i32, uint32)
|
||||||
|
|
||||||
|
global_atomic_varying(WIDTH, add, i64, int64)
|
||||||
|
global_atomic_varying(WIDTH, sub, i64, int64)
|
||||||
|
global_atomic_varying(WIDTH, and, i64, int64)
|
||||||
|
global_atomic_varying(WIDTH, or, i64, int64)
|
||||||
|
global_atomic_varying(WIDTH, xor, i64, int64)
|
||||||
|
global_atomic_varying(WIDTH, min, i64, int64)
|
||||||
|
global_atomic_varying(WIDTH, max, i64, int64)
|
||||||
|
global_atomic_varying(WIDTH, umin, i64, uint64)
|
||||||
|
global_atomic_varying(WIDTH, umax, i64, uint64)
|
||||||
|
|||||||
188
ctx.cpp
188
ctx.cpp
@@ -57,6 +57,8 @@
|
|||||||
#include <llvm/IR/Instructions.h>
|
#include <llvm/IR/Instructions.h>
|
||||||
#include <llvm/IR/DerivedTypes.h>
|
#include <llvm/IR/DerivedTypes.h>
|
||||||
#endif
|
#endif
|
||||||
|
#include <llvm/Support/raw_ostream.h>
|
||||||
|
#include <llvm/Support/FormattedStream.h>
|
||||||
|
|
||||||
/** This is a small utility structure that records information related to one
|
/** This is a small utility structure that records information related to one
|
||||||
level of nested control flow. It's mostly used in correctly restoring
|
level of nested control flow. It's mostly used in correctly restoring
|
||||||
@@ -1371,11 +1373,17 @@ FunctionEmitContext::None(llvm::Value *mask) {
|
|||||||
|
|
||||||
|
|
||||||
llvm::Value *
|
llvm::Value *
|
||||||
FunctionEmitContext::LaneMask(llvm::Value *v) {
|
FunctionEmitContext::LaneMask(llvm::Value *v)
|
||||||
|
{
|
||||||
|
#if 1 /* this makes mandelbrot example slower, why ?!? */
|
||||||
|
const char *__movmsk = g->target->getISA() == Target::NVPTX ? "__movmsk_ptx" : "__movmsk";
|
||||||
|
#else
|
||||||
|
const char *__movmsk = "__movmsk";
|
||||||
|
#endif
|
||||||
// Call the target-dependent movmsk function to turn the vector mask
|
// Call the target-dependent movmsk function to turn the vector mask
|
||||||
// into an i64 value
|
// into an i64 value
|
||||||
std::vector<Symbol *> mm;
|
std::vector<Symbol *> mm;
|
||||||
m->symbolTable->LookupFunction("__movmsk", &mm);
|
m->symbolTable->LookupFunction(__movmsk, &mm);
|
||||||
if (g->target->getMaskBitCount() == 1)
|
if (g->target->getMaskBitCount() == 1)
|
||||||
AssertPos(currentPos, mm.size() == 1);
|
AssertPos(currentPos, mm.size() == 1);
|
||||||
else
|
else
|
||||||
@@ -1387,9 +1395,71 @@ FunctionEmitContext::LaneMask(llvm::Value *v) {
|
|||||||
return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk"));
|
return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool lAppendInsertExtractName(llvm::Value *vector, std::string &funcName)
|
||||||
|
{
|
||||||
|
llvm::Type *type = vector->getType();
|
||||||
|
if (type == LLVMTypes::Int8VectorType)
|
||||||
|
funcName += "_int8";
|
||||||
|
else if (type == LLVMTypes::Int16VectorType)
|
||||||
|
funcName += "_int16";
|
||||||
|
else if (type == LLVMTypes::Int32VectorType)
|
||||||
|
funcName += "_int32";
|
||||||
|
else if (type == LLVMTypes::Int64VectorType)
|
||||||
|
funcName += "_int64";
|
||||||
|
else if (type == LLVMTypes::FloatVectorType)
|
||||||
|
funcName += "_float";
|
||||||
|
else if (type == LLVMTypes::DoubleVectorType)
|
||||||
|
funcName += "_double";
|
||||||
|
else
|
||||||
|
return false;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
llvm::Value*
|
||||||
|
FunctionEmitContext::Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar)
|
||||||
|
{
|
||||||
|
std::string funcName = "__insert";
|
||||||
|
assert(lAppendInsertExtractName(vector, funcName));
|
||||||
|
assert(lane->getType() == LLVMTypes::Int32Type);
|
||||||
|
|
||||||
|
llvm::Function *func = m->module->getFunction(funcName.c_str());
|
||||||
|
assert(func != NULL);
|
||||||
|
std::vector<llvm::Value *> args;
|
||||||
|
args.push_back(vector);
|
||||||
|
args.push_back(lane);
|
||||||
|
args.push_back(scalar);
|
||||||
|
llvm::Value *ret = llvm::CallInst::Create(func, args, LLVMGetName(vector, funcName.c_str()), GetCurrentBasicBlock());
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
llvm::Value*
|
||||||
|
FunctionEmitContext::Extract(llvm::Value *vector, llvm::Value *lane)
|
||||||
|
{
|
||||||
|
std::string funcName = "__extract";
|
||||||
|
assert(lAppendInsertExtractName(vector, funcName));
|
||||||
|
assert(lane->getType() == LLVMTypes::Int32Type);
|
||||||
|
|
||||||
|
llvm::Function *func = m->module->getFunction(funcName.c_str());
|
||||||
|
assert(func != NULL);
|
||||||
|
std::vector<llvm::Value *> args;
|
||||||
|
args.push_back(vector);
|
||||||
|
args.push_back(lane);
|
||||||
|
llvm::Value *ret = llvm::CallInst::Create(func, args, LLVMGetName(vector, funcName.c_str()), GetCurrentBasicBlock());
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
llvm::Value *
|
llvm::Value *
|
||||||
FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
|
FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
|
||||||
|
if (g->target->getISA() == Target::NVPTX)
|
||||||
|
{
|
||||||
|
// Compare the two masks to get a vector of i1s
|
||||||
|
llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
|
||||||
|
v1, v2, "v1==v2");
|
||||||
|
return ExtractInst(cmp, 0); /* this works without calling All(..) in PTX. Why ?!? */
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
#if 0
|
#if 0
|
||||||
// Compare the two masks to get a vector of i1s
|
// Compare the two masks to get a vector of i1s
|
||||||
llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
|
llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
|
||||||
@@ -1405,6 +1475,7 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
|
|||||||
LLVMGetName("equal", v1, v2));
|
LLVMGetName("equal", v1, v2));
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
llvm::Value *
|
llvm::Value *
|
||||||
FunctionEmitContext::ProgramIndexVector(bool is32bits) {
|
FunctionEmitContext::ProgramIndexVector(bool is32bits) {
|
||||||
@@ -1418,6 +1489,17 @@ FunctionEmitContext::ProgramIndexVector(bool is32bits) {
|
|||||||
|
|
||||||
return index;
|
return index;
|
||||||
}
|
}
|
||||||
|
llvm::Value *
|
||||||
|
FunctionEmitContext::ProgramIndexVectorPTX(bool is32bits) {
|
||||||
|
llvm::Function *func_program_index = m->module->getFunction("__program_index");
|
||||||
|
llvm::Value *__program_index = CallInst(func_program_index, NULL, std::vector<llvm::Value*>(), "foreach__program_indexS");
|
||||||
|
llvm::Value *index = InsertInst(llvm::UndefValue::get(LLVMTypes::Int32VectorType), __program_index, 0, "foreach__program_indexV");
|
||||||
|
#if 0
|
||||||
|
if (!is32bits)
|
||||||
|
index = ZExtInst(index, LLVMTypes::Int64VectandType);
|
||||||
|
#endif
|
||||||
|
return index;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
llvm::Value *
|
llvm::Value *
|
||||||
@@ -1830,6 +1912,7 @@ FunctionEmitContext::PtrToIntInst(llvm::Value *value, const char *name) {
|
|||||||
|
|
||||||
if (name == NULL)
|
if (name == NULL)
|
||||||
name = LLVMGetName(value, "_ptr2int");
|
name = LLVMGetName(value, "_ptr2int");
|
||||||
|
|
||||||
llvm::Type *type = LLVMTypes::PointerIntType;
|
llvm::Type *type = LLVMTypes::PointerIntType;
|
||||||
llvm::Instruction *inst = new llvm::PtrToIntInst(value, type, name, bblock);
|
llvm::Instruction *inst = new llvm::PtrToIntInst(value, type, name, bblock);
|
||||||
AddDebugPos(inst);
|
AddDebugPos(inst);
|
||||||
@@ -3523,6 +3606,9 @@ llvm::Value *
|
|||||||
FunctionEmitContext::LaunchInst(llvm::Value *callee,
|
FunctionEmitContext::LaunchInst(llvm::Value *callee,
|
||||||
std::vector<llvm::Value *> &argVals,
|
std::vector<llvm::Value *> &argVals,
|
||||||
llvm::Value *launchCount[3]){
|
llvm::Value *launchCount[3]){
|
||||||
|
|
||||||
|
if (g->target->getISA() != Target::NVPTX)
|
||||||
|
{
|
||||||
if (callee == NULL) {
|
if (callee == NULL) {
|
||||||
AssertPos(currentPos, m->errorCount > 0);
|
AssertPos(currentPos, m->errorCount > 0);
|
||||||
return NULL;
|
return NULL;
|
||||||
@@ -3588,10 +3674,96 @@ FunctionEmitContext::LaunchInst(llvm::Value *callee,
|
|||||||
args.push_back(launchCount[2]);
|
args.push_back(launchCount[2]);
|
||||||
return CallInst(flaunch, NULL, args, "");
|
return CallInst(flaunch, NULL, args, "");
|
||||||
}
|
}
|
||||||
|
else /* NVPTX */
|
||||||
|
{
|
||||||
|
if (callee == NULL) {
|
||||||
|
AssertPos(currentPos, m->errorCount > 0);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
launchedTasks = true;
|
||||||
|
|
||||||
|
AssertPos(currentPos, llvm::isa<llvm::Function>(callee));
|
||||||
|
std::vector<llvm::Type*> argTypes;
|
||||||
|
|
||||||
|
llvm::Function *F = llvm::dyn_cast<llvm::Function>(callee);
|
||||||
|
const unsigned int nArgs = F->arg_size();
|
||||||
|
llvm::Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
|
||||||
|
for (; I != E; ++I)
|
||||||
|
argTypes.push_back(I->getType());
|
||||||
|
llvm::Type *st = llvm::StructType::get(*g->ctx, argTypes);
|
||||||
|
llvm::StructType *argStructType = static_cast<llvm::StructType *>(st);
|
||||||
|
llvm::Value *structSize = g->target->SizeOf(argStructType, bblock);
|
||||||
|
if (structSize->getType() != LLVMTypes::Int64Type)
|
||||||
|
structSize = ZExtInst(structSize, LLVMTypes::Int64Type,
|
||||||
|
"struct_size_to_64");
|
||||||
|
|
||||||
|
const int align = 8;
|
||||||
|
llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
|
||||||
|
AssertPos(currentPos, falloc != NULL);
|
||||||
|
std::vector<llvm::Value *> allocArgs;
|
||||||
|
allocArgs.push_back(launchGroupHandlePtr);
|
||||||
|
allocArgs.push_back(structSize);
|
||||||
|
allocArgs.push_back(LLVMInt32(align));
|
||||||
|
llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr");
|
||||||
|
llvm::Value *voidi64 = PtrToIntInst(voidmem, "args_i64");
|
||||||
|
llvm::BasicBlock* if_true = CreateBasicBlock("if_true");
|
||||||
|
llvm::BasicBlock* if_false = CreateBasicBlock("if_false");
|
||||||
|
|
||||||
|
/* check if the pointer returned by ISPCAlloc is not NULL
|
||||||
|
* --------------
|
||||||
|
* this is a workaround for not checking the value of programIndex
|
||||||
|
* because ISPCAlloc will return NULL pointer for all programIndex > 0
|
||||||
|
* of course, if ISPAlloc fails to get parameter buffer, the pointer for programIndex = 0
|
||||||
|
* will also be NULL
|
||||||
|
* This check must be added, and also rewrite the code to make it less opaque
|
||||||
|
*/
|
||||||
|
llvm::Value* cmp1 = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, voidi64, LLVMInt64(0), "cmp1");
|
||||||
|
BranchInst(if_true, if_false, cmp1);
|
||||||
|
|
||||||
|
/**********************/
|
||||||
|
bblock = if_true;
|
||||||
|
|
||||||
|
// label_if_then block:
|
||||||
|
llvm::Type *pt = llvm::PointerType::getUnqual(st);
|
||||||
|
llvm::Value *argmem = BitCastInst(voidmem, pt);
|
||||||
|
for (unsigned int i = 0; i < argVals.size(); ++i)
|
||||||
|
{
|
||||||
|
llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg");
|
||||||
|
// don't need to do masked store here, I think
|
||||||
|
StoreInst(argVals[i], ptr);
|
||||||
|
}
|
||||||
|
if (nArgs == argVals.size() + 1) {
|
||||||
|
// copy in the mask
|
||||||
|
llvm::Value *mask = GetFullMask();
|
||||||
|
llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL,
|
||||||
|
"funarg_mask");
|
||||||
|
StoreInst(mask, ptr);
|
||||||
|
}
|
||||||
|
BranchInst(if_false);
|
||||||
|
|
||||||
|
/**********************/
|
||||||
|
bblock = if_false;
|
||||||
|
|
||||||
|
llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
|
||||||
|
llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
|
||||||
|
AssertPos(currentPos, flaunch != NULL);
|
||||||
|
std::vector<llvm::Value *> args;
|
||||||
|
args.push_back(launchGroupHandlePtr);
|
||||||
|
args.push_back(fptr);
|
||||||
|
args.push_back(voidmem);
|
||||||
|
args.push_back(launchCount[0]);
|
||||||
|
args.push_back(launchCount[1]);
|
||||||
|
args.push_back(launchCount[2]);
|
||||||
|
llvm::Value *ret = CallInst(flaunch, NULL, args, "");
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void
|
void
|
||||||
FunctionEmitContext::SyncInst() {
|
FunctionEmitContext::SyncInst() {
|
||||||
|
if (g->target->getISA() != Target::NVPTX)
|
||||||
|
{
|
||||||
llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr);
|
llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr);
|
||||||
llvm::Value *nullPtrValue =
|
llvm::Value *nullPtrValue =
|
||||||
llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
|
llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
|
||||||
@@ -3616,6 +3788,18 @@ FunctionEmitContext::SyncInst() {
|
|||||||
|
|
||||||
SetCurrentBasicBlock(bPostSync);
|
SetCurrentBasicBlock(bPostSync);
|
||||||
}
|
}
|
||||||
|
else /* NVPTX: don't do test, just call sync */
|
||||||
|
{
|
||||||
|
llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr);
|
||||||
|
llvm::Value *nullPtrValue =
|
||||||
|
llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
|
||||||
|
llvm::Function *fsync = m->module->getFunction("ISPCSync");
|
||||||
|
if (fsync == NULL)
|
||||||
|
FATAL("Couldn't find ISPCSync declaration?!");
|
||||||
|
CallInst(fsync, NULL, launchGroupHandle, "");
|
||||||
|
StoreInst(nullPtrValue, launchGroupHandlePtr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/** When we gathering from or scattering to a varying atomic type, we need
|
/** When we gathering from or scattering to a varying atomic type, we need
|
||||||
|
|||||||
8
ctx.h
8
ctx.h
@@ -291,6 +291,13 @@ public:
|
|||||||
of the mask is on. */
|
of the mask is on. */
|
||||||
llvm::Value *LaneMask(llvm::Value *mask);
|
llvm::Value *LaneMask(llvm::Value *mask);
|
||||||
|
|
||||||
|
|
||||||
|
/** Issues a call to __insert_int8/int16/int32/int64/float/double */
|
||||||
|
llvm::Value* Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar);
|
||||||
|
/** Issues a call to __extract_int8/int16/int32/int64/float/double */
|
||||||
|
llvm::Value* Extract(llvm::Value *vector, llvm::Value *lane);
|
||||||
|
|
||||||
|
|
||||||
/** Given two masks of type LLVMTypes::MaskType, return an i1 value
|
/** Given two masks of type LLVMTypes::MaskType, return an i1 value
|
||||||
that indicates whether the two masks are equal. */
|
that indicates whether the two masks are equal. */
|
||||||
llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2);
|
llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2);
|
||||||
@@ -298,6 +305,7 @@ public:
|
|||||||
/** Generate ConstantVector, which contains ProgramIndex, i.e.
|
/** Generate ConstantVector, which contains ProgramIndex, i.e.
|
||||||
< i32 0, i32 1, i32 2, i32 3> */
|
< i32 0, i32 1, i32 2, i32 3> */
|
||||||
llvm::Value *ProgramIndexVector(bool is32bits = true);
|
llvm::Value *ProgramIndexVector(bool is32bits = true);
|
||||||
|
llvm::Value *ProgramIndexVectorPTX(bool is32bits = true);
|
||||||
|
|
||||||
/** Given a string, create an anonymous global variable to hold its
|
/** Given a string, create an anonymous global variable to hold its
|
||||||
value and return the pointer to the string. */
|
value and return the pointer to the string. */
|
||||||
|
|||||||
16
decl.cpp
16
decl.cpp
@@ -168,6 +168,13 @@ DeclSpecs::GetBaseType(SourcePos pos) const {
|
|||||||
retType = lApplyTypeQualifiers(typeQualifiers, retType, pos);
|
retType = lApplyTypeQualifiers(typeQualifiers, retType, pos);
|
||||||
|
|
||||||
if (soaWidth > 0) {
|
if (soaWidth > 0) {
|
||||||
|
#if 0 /* see stmt.cpp in DeclStmt::EmitCode for work-around of SOAType Declaration */
|
||||||
|
if (g->target->getISA() == Target::NVPTX)
|
||||||
|
{
|
||||||
|
Error(pos, "\"soa\" data types are currently not supported with \"nvptx\" target.");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
const StructType *st = CastType<StructType>(retType);
|
const StructType *st = CastType<StructType>(retType);
|
||||||
|
|
||||||
if (st == NULL) {
|
if (st == NULL) {
|
||||||
@@ -402,6 +409,13 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if 0 /* NVPTX */
|
||||||
|
if (baseType->IsUniformType())
|
||||||
|
{
|
||||||
|
fprintf(stderr, " detected uniform array of size= %d array= %s\n" ,arraySize,
|
||||||
|
baseType->IsArrayType() ? " true " : " false ");
|
||||||
|
}
|
||||||
|
#endif
|
||||||
const Type *arrayType = new ArrayType(baseType, arraySize);
|
const Type *arrayType = new ArrayType(baseType, arraySize);
|
||||||
if (child != NULL) {
|
if (child != NULL) {
|
||||||
child->InitFromType(arrayType, ds);
|
child->InitFromType(arrayType, ds);
|
||||||
@@ -530,9 +544,9 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
|
|||||||
|
|
||||||
returnType = returnType->ResolveUnboundVariability(Variability::Varying);
|
returnType = returnType->ResolveUnboundVariability(Variability::Varying);
|
||||||
|
|
||||||
|
bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
|
||||||
bool isExternC = ds && (ds->storageClass == SC_EXTERN_C);
|
bool isExternC = ds && (ds->storageClass == SC_EXTERN_C);
|
||||||
bool isExported = ds && ((ds->typeQualifiers & TYPEQUAL_EXPORT) != 0);
|
bool isExported = ds && ((ds->typeQualifiers & TYPEQUAL_EXPORT) != 0);
|
||||||
bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
|
|
||||||
bool isUnmasked = ds && ((ds->typeQualifiers & TYPEQUAL_UNMASKED) != 0);
|
bool isUnmasked = ds && ((ds->typeQualifiers & TYPEQUAL_UNMASKED) != 0);
|
||||||
|
|
||||||
if (isExported && isTask) {
|
if (isExported && isTask) {
|
||||||
|
|||||||
11
expr.cpp
11
expr.cpp
@@ -7867,6 +7867,12 @@ SizeOfExpr::TypeCheck() {
|
|||||||
"struct type \"%s\".", type->GetString().c_str());
|
"struct type \"%s\".", type->GetString().c_str());
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
if (type != NULL)
|
||||||
|
if (g->target->getISA() == Target::NVPTX && type->IsVaryingType())
|
||||||
|
{
|
||||||
|
Error(pos, "\"sizeof\" with varying data types is not yet supported with \"nvptx\" target.");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
@@ -8661,6 +8667,11 @@ NewExpr::TypeCheck() {
|
|||||||
AssertPos(pos, m->errorCount > 0);
|
AssertPos(pos, m->errorCount > 0);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
if (g->target->getISA() == Target::NVPTX && allocType->IsVaryingType())
|
||||||
|
{
|
||||||
|
Error(pos, "\"new\" with varying data types is not yet supported with \"nvptx\" target.");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
if (CastType<UndefinedStructType>(allocType) != NULL) {
|
if (CastType<UndefinedStructType>(allocType) != NULL) {
|
||||||
Error(pos, "Can't dynamically allocate storage for declared "
|
Error(pos, "Can't dynamically allocate storage for declared "
|
||||||
"but not defined type \"%s\".", allocType->GetString().c_str());
|
"but not defined type \"%s\".", allocType->GetString().c_str());
|
||||||
|
|||||||
41
func.cpp
41
func.cpp
@@ -47,6 +47,7 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
#if defined(LLVM_3_1) || defined(LLVM_3_2)
|
#if defined(LLVM_3_1) || defined(LLVM_3_2)
|
||||||
|
#include <llvm/Metadata.h>
|
||||||
#include <llvm/LLVMContext.h>
|
#include <llvm/LLVMContext.h>
|
||||||
#include <llvm/Module.h>
|
#include <llvm/Module.h>
|
||||||
#include <llvm/Type.h>
|
#include <llvm/Type.h>
|
||||||
@@ -54,6 +55,7 @@
|
|||||||
#include <llvm/Intrinsics.h>
|
#include <llvm/Intrinsics.h>
|
||||||
#include <llvm/DerivedTypes.h>
|
#include <llvm/DerivedTypes.h>
|
||||||
#else
|
#else
|
||||||
|
#include <llvm/IR/Metadata.h>
|
||||||
#include <llvm/IR/LLVMContext.h>
|
#include <llvm/IR/LLVMContext.h>
|
||||||
#include <llvm/IR/Module.h>
|
#include <llvm/IR/Module.h>
|
||||||
#include <llvm/IR/Type.h>
|
#include <llvm/IR/Type.h>
|
||||||
@@ -128,7 +130,7 @@ Function::Function(Symbol *s, Stmt *c) {
|
|||||||
sym->parentFunction = this;
|
sym->parentFunction = this;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (type->isTask) {
|
if (type->isTask && g->target->getISA() != Target::NVPTX) {
|
||||||
threadIndexSym = m->symbolTable->LookupVariable("threadIndex");
|
threadIndexSym = m->symbolTable->LookupVariable("threadIndex");
|
||||||
Assert(threadIndexSym);
|
Assert(threadIndexSym);
|
||||||
threadCountSym = m->symbolTable->LookupVariable("threadCount");
|
threadCountSym = m->symbolTable->LookupVariable("threadCount");
|
||||||
@@ -239,7 +241,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
|
|||||||
#endif
|
#endif
|
||||||
const FunctionType *type = CastType<FunctionType>(sym->type);
|
const FunctionType *type = CastType<FunctionType>(sym->type);
|
||||||
Assert(type != NULL);
|
Assert(type != NULL);
|
||||||
if (type->isTask == true) {
|
if (type->isTask == true && g->target->getISA() != Target::NVPTX) {
|
||||||
// For tasks, there should always be three parameters: the
|
// For tasks, there should always be three parameters: the
|
||||||
// pointer to the structure that holds all of the arguments, the
|
// pointer to the structure that holds all of the arguments, the
|
||||||
// thread index, and the thread count variables.
|
// thread index, and the thread count variables.
|
||||||
@@ -337,6 +339,16 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
|
|||||||
ctx->SetFunctionMask(argIter);
|
ctx->SetFunctionMask(argIter);
|
||||||
Assert(++argIter == function->arg_end());
|
Assert(++argIter == function->arg_end());
|
||||||
}
|
}
|
||||||
|
if (type->isTask == true && g->target->getISA() == Target::NVPTX)
|
||||||
|
{
|
||||||
|
llvm::NamedMDNode* annotations =
|
||||||
|
m->module->getOrInsertNamedMetadata("nvvm.annotations");
|
||||||
|
llvm::SmallVector<llvm::Value*, 3> av;
|
||||||
|
av.push_back(function);
|
||||||
|
av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
|
||||||
|
av.push_back(LLVMInt32(1));
|
||||||
|
annotations->addOperand(llvm::MDNode::get(*g->ctx, av));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Finally, we can generate code for the function
|
// Finally, we can generate code for the function
|
||||||
@@ -497,8 +509,23 @@ Function::GenerateIR() {
|
|||||||
llvm::FunctionType *ftype = type->LLVMFunctionType(g->ctx, true);
|
llvm::FunctionType *ftype = type->LLVMFunctionType(g->ctx, true);
|
||||||
llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::ExternalLinkage;
|
llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::ExternalLinkage;
|
||||||
std::string functionName = sym->name;
|
std::string functionName = sym->name;
|
||||||
|
|
||||||
if (g->mangleFunctionsWithTarget)
|
if (g->mangleFunctionsWithTarget)
|
||||||
functionName += std::string("_") + g->target->GetISAString();
|
functionName += std::string("_") + g->target->GetISAString();
|
||||||
|
|
||||||
|
if (g->target->getISA() == Target::NVPTX)
|
||||||
|
{
|
||||||
|
functionName += std::string("___export"); /* add ___export to the end, for ptxcc to recognize it is exported */
|
||||||
|
#if 0
|
||||||
|
llvm::NamedMDNode* annotations =
|
||||||
|
m->module->getOrInsertNamedMetadata("nvvm.annotations");
|
||||||
|
llvm::SmallVector<llvm::Value*, 3> av;
|
||||||
|
av.push_back(function);
|
||||||
|
av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
|
||||||
|
av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1));
|
||||||
|
annotations->addOperand(llvm::MDNode::get(*g->ctx, av));
|
||||||
|
#endif
|
||||||
|
}
|
||||||
llvm::Function *appFunction =
|
llvm::Function *appFunction =
|
||||||
llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module);
|
llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module);
|
||||||
#if defined(LLVM_3_1)
|
#if defined(LLVM_3_1)
|
||||||
@@ -538,6 +565,16 @@ Function::GenerateIR() {
|
|||||||
FATAL("Function verificication failed");
|
FATAL("Function verificication failed");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (g->target->getISA() == Target::NVPTX)
|
||||||
|
{
|
||||||
|
llvm::NamedMDNode* annotations =
|
||||||
|
m->module->getOrInsertNamedMetadata("nvvm.annotations");
|
||||||
|
llvm::SmallVector<llvm::Value*, 3> av;
|
||||||
|
av.push_back(appFunction);
|
||||||
|
av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
|
||||||
|
av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1));
|
||||||
|
annotations->addOperand(llvm::MDNode::get(*g->ctx, av));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
25
ispc.cpp
25
ispc.cpp
@@ -280,6 +280,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
arch = "arm";
|
arch = "arm";
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
|
if(!strncmp(isa, "nvptx", 5))
|
||||||
|
arch = "nvptx64";
|
||||||
|
else
|
||||||
arch = "x86-64";
|
arch = "x86-64";
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -707,6 +710,19 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
this->m_maskBitCount = 32;
|
this->m_maskBitCount = 32;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
else if (!strcasecmp(isa, "nvptx"))
|
||||||
|
{
|
||||||
|
this->m_isa = Target::NVPTX;
|
||||||
|
this->m_cpu = "sm_35";
|
||||||
|
this->m_nativeVectorWidth = 32;
|
||||||
|
this->m_nativeVectorAlignment = 32;
|
||||||
|
this->m_vectorWidth = 1;
|
||||||
|
this->m_hasHalf = true;
|
||||||
|
this->m_maskingIsFree = true;
|
||||||
|
this->m_maskBitCount = 1;
|
||||||
|
this->m_hasTranscendentals = false;
|
||||||
|
this->m_hasGather = this->m_hasScatter = false;
|
||||||
|
}
|
||||||
else {
|
else {
|
||||||
Error(SourcePos(), "Target \"%s\" is unknown. Choices are: %s.",
|
Error(SourcePos(), "Target \"%s\" is unknown. Choices are: %s.",
|
||||||
isa, SupportedTargets());
|
isa, SupportedTargets());
|
||||||
@@ -784,6 +800,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
|||||||
// Initialize target-specific "target-feature" attribute.
|
// Initialize target-specific "target-feature" attribute.
|
||||||
if (!m_attributes.empty()) {
|
if (!m_attributes.empty()) {
|
||||||
llvm::AttrBuilder attrBuilder;
|
llvm::AttrBuilder attrBuilder;
|
||||||
|
if (m_isa != Target::NVPTX)
|
||||||
attrBuilder.addAttribute("target-cpu", this->m_cpu);
|
attrBuilder.addAttribute("target-cpu", this->m_cpu);
|
||||||
attrBuilder.addAttribute("target-features", this->m_attributes);
|
attrBuilder.addAttribute("target-features", this->m_attributes);
|
||||||
this->m_tf_attributes = new llvm::AttributeSet(
|
this->m_tf_attributes = new llvm::AttributeSet(
|
||||||
@@ -839,7 +856,7 @@ Target::SupportedTargets() {
|
|||||||
"avx1.1-i32x8, avx1.1-i32x16, avx1.1-i64x4 "
|
"avx1.1-i32x8, avx1.1-i32x16, avx1.1-i64x4 "
|
||||||
"avx2-i32x8, avx2-i32x16, avx2-i64x4, "
|
"avx2-i32x8, avx2-i32x16, avx2-i64x4, "
|
||||||
"generic-x1, generic-x4, generic-x8, generic-x16, "
|
"generic-x1, generic-x4, generic-x8, generic-x16, "
|
||||||
"generic-x32, generic-x64";
|
"generic-x32, generic-x64, nvptx";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -866,6 +883,8 @@ Target::GetTripleString() const {
|
|||||||
triple.setArchName("i386");
|
triple.setArchName("i386");
|
||||||
else if (m_arch == "x86-64")
|
else if (m_arch == "x86-64")
|
||||||
triple.setArchName("x86_64");
|
triple.setArchName("x86_64");
|
||||||
|
else if (m_arch == "nvptx64")
|
||||||
|
triple = llvm::Triple("nvptx64", "nvidia", "cuda");
|
||||||
else
|
else
|
||||||
triple.setArchName(m_arch);
|
triple.setArchName(m_arch);
|
||||||
}
|
}
|
||||||
@@ -898,6 +917,8 @@ Target::ISAToString(ISA isa) {
|
|||||||
return "avx2";
|
return "avx2";
|
||||||
case Target::GENERIC:
|
case Target::GENERIC:
|
||||||
return "generic";
|
return "generic";
|
||||||
|
case Target::NVPTX:
|
||||||
|
return "nvptx";
|
||||||
default:
|
default:
|
||||||
FATAL("Unhandled target in ISAToString()");
|
FATAL("Unhandled target in ISAToString()");
|
||||||
}
|
}
|
||||||
@@ -936,6 +957,8 @@ Target::ISAToTargetString(ISA isa) {
|
|||||||
return "avx2-i32x8";
|
return "avx2-i32x8";
|
||||||
case Target::GENERIC:
|
case Target::GENERIC:
|
||||||
return "generic-4";
|
return "generic-4";
|
||||||
|
case Target::NVPTX:
|
||||||
|
return "nvptx";
|
||||||
default:
|
default:
|
||||||
FATAL("Unhandled target in ISAToTargetString()");
|
FATAL("Unhandled target in ISAToTargetString()");
|
||||||
}
|
}
|
||||||
|
|||||||
3
ispc.h
3
ispc.h
@@ -179,7 +179,7 @@ public:
|
|||||||
flexible/performant of them will apear last in the enumerant. Note
|
flexible/performant of them will apear last in the enumerant. Note
|
||||||
also that __best_available_isa() needs to be updated if ISAs are
|
also that __best_available_isa() needs to be updated if ISAs are
|
||||||
added or the enumerant values are reordered. */
|
added or the enumerant values are reordered. */
|
||||||
enum ISA {
|
enum ISA { NVPTX,
|
||||||
#ifdef ISPC_ARM_ENABLED
|
#ifdef ISPC_ARM_ENABLED
|
||||||
NEON32, NEON16, NEON8,
|
NEON32, NEON16, NEON8,
|
||||||
#endif
|
#endif
|
||||||
@@ -606,6 +606,7 @@ struct Globals {
|
|||||||
/** Indicates that alignment in memory allocation routines should be
|
/** Indicates that alignment in memory allocation routines should be
|
||||||
forced to have given value. -1 value means natural alignment for the platforms. */
|
forced to have given value. -1 value means natural alignment for the platforms. */
|
||||||
int forceAlignment;
|
int forceAlignment;
|
||||||
|
std::string PtxString;
|
||||||
};
|
};
|
||||||
|
|
||||||
enum {
|
enum {
|
||||||
|
|||||||
5
main.cpp
5
main.cpp
@@ -320,6 +320,11 @@ int main(int Argc, char *Argv[]) {
|
|||||||
LLVMInitializeARMTargetMC();
|
LLVMInitializeARMTargetMC();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
LLVMInitializeNVPTXTargetInfo();
|
||||||
|
LLVMInitializeNVPTXTarget();
|
||||||
|
LLVMInitializeNVPTXAsmPrinter();
|
||||||
|
LLVMInitializeNVPTXTargetMC();
|
||||||
|
|
||||||
char *file = NULL;
|
char *file = NULL;
|
||||||
const char *headerFileName = NULL;
|
const char *headerFileName = NULL;
|
||||||
const char *outFileName = NULL;
|
const char *outFileName = NULL;
|
||||||
|
|||||||
139
module.cpp
139
module.cpp
@@ -444,6 +444,38 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (g->target->getISA() == Target::NVPTX &&
|
||||||
|
#if 0
|
||||||
|
!type->IsConstType() &&
|
||||||
|
#endif
|
||||||
|
#if 1
|
||||||
|
at != NULL &&
|
||||||
|
#endif
|
||||||
|
type->IsVaryingType())
|
||||||
|
{
|
||||||
|
Error(pos, "Global \"varying\" variables are not yet supported in \"nvptx\" target.");
|
||||||
|
return;
|
||||||
|
#if 0
|
||||||
|
int nel = 32; /* warp-size */
|
||||||
|
if (type->IsArrayType())
|
||||||
|
{
|
||||||
|
const ArrayType *at = CastType<ArrayType>(type);
|
||||||
|
/* we must scale # elements by 4, because a thread-block will run 4 warps
|
||||||
|
* or 128 threads.
|
||||||
|
* ***note-to-me***:please define these value (128threads/4warps)
|
||||||
|
* in nvptx-target definition
|
||||||
|
* instead of compile-time constants
|
||||||
|
*/
|
||||||
|
nel *= at->GetElementCount();
|
||||||
|
assert (!type->IsSOAType());
|
||||||
|
type = new ArrayType(at->GetElementType()->GetAsUniformType(), nel);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
type = new ArrayType(type->GetAsUniformType(), nel);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
llvm::Type *llvmType = type->LLVMType(g->ctx);
|
llvm::Type *llvmType = type->LLVMType(g->ctx);
|
||||||
if (llvmType == NULL)
|
if (llvmType == NULL)
|
||||||
return;
|
return;
|
||||||
@@ -643,6 +675,21 @@ lCheckExportedParameterTypes(const Type *type, const std::string &name,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
lCheckTaskParameterTypes(const Type *type, const std::string &name,
|
||||||
|
SourcePos pos) {
|
||||||
|
if (g->target->getISA() != Target::NVPTX)
|
||||||
|
return;
|
||||||
|
if (lRecursiveCheckValidParamType(type, false) == false) {
|
||||||
|
if (CastType<VectorType>(type))
|
||||||
|
Error(pos, "Vector-typed parameter \"%s\" is illegal in a task "
|
||||||
|
"function with \"nvptx\" target.", name.c_str());
|
||||||
|
else
|
||||||
|
Error(pos, "Varying parameter \"%s\" is illegal in a task function with \"nvptx\" target.",
|
||||||
|
name.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/** Given a function type, loop through the function parameters and see if
|
/** Given a function type, loop through the function parameters and see if
|
||||||
any are StructTypes. If so, issue an error; this is currently broken
|
any are StructTypes. If so, issue an error; this is currently broken
|
||||||
@@ -801,7 +848,8 @@ Module::AddFunctionDeclaration(const std::string &name,
|
|||||||
#else // LLVM 3.1 and 3.3+
|
#else // LLVM 3.1 and 3.3+
|
||||||
function->addFnAttr(llvm::Attribute::AlwaysInline);
|
function->addFnAttr(llvm::Attribute::AlwaysInline);
|
||||||
#endif
|
#endif
|
||||||
if (functionType->isTask)
|
/* evghenii: fails function verification when "if" executed in nvptx target */
|
||||||
|
if (functionType->isTask && g->target->getISA() != Target::NVPTX)
|
||||||
// This also applies transitively to members I think?
|
// This also applies transitively to members I think?
|
||||||
#if defined(LLVM_3_1)
|
#if defined(LLVM_3_1)
|
||||||
function->setDoesNotAlias(1, true);
|
function->setDoesNotAlias(1, true);
|
||||||
@@ -822,6 +870,13 @@ Module::AddFunctionDeclaration(const std::string &name,
|
|||||||
Type::Equal(functionType->GetReturnType(), AtomicType::Void) == false)
|
Type::Equal(functionType->GetReturnType(), AtomicType::Void) == false)
|
||||||
Error(pos, "Task-qualified functions must have void return type.");
|
Error(pos, "Task-qualified functions must have void return type.");
|
||||||
|
|
||||||
|
if (g->target->getISA() == Target::NVPTX &&
|
||||||
|
Type::Equal(functionType->GetReturnType(), AtomicType::Void) == false &&
|
||||||
|
functionType->isExported)
|
||||||
|
{
|
||||||
|
Error(pos, "Export-qualified functions must have void return type with \"nvptx\" target.");
|
||||||
|
}
|
||||||
|
|
||||||
if (functionType->isExported || functionType->isExternC)
|
if (functionType->isExported || functionType->isExternC)
|
||||||
lCheckForStructParameters(functionType, pos);
|
lCheckForStructParameters(functionType, pos);
|
||||||
|
|
||||||
@@ -841,6 +896,9 @@ Module::AddFunctionDeclaration(const std::string &name,
|
|||||||
if (functionType->isExported) {
|
if (functionType->isExported) {
|
||||||
lCheckExportedParameterTypes(argType, argName, argPos);
|
lCheckExportedParameterTypes(argType, argName, argPos);
|
||||||
}
|
}
|
||||||
|
if (functionType->isTask) {
|
||||||
|
lCheckTaskParameterTypes(argType, argName, argPos);
|
||||||
|
}
|
||||||
|
|
||||||
// ISPC assumes that no pointers alias. (It should be possible to
|
// ISPC assumes that no pointers alias. (It should be possible to
|
||||||
// specify when this is not the case, but this should be the
|
// specify when this is not the case, but this should be the
|
||||||
@@ -959,8 +1017,14 @@ Module::writeOutput(OutputType outputType, const char *outFileName,
|
|||||||
const char *fileType = NULL;
|
const char *fileType = NULL;
|
||||||
switch (outputType) {
|
switch (outputType) {
|
||||||
case Asm:
|
case Asm:
|
||||||
|
if (g->target->getISA() != Target::NVPTX)
|
||||||
|
{
|
||||||
if (strcasecmp(suffix, "s"))
|
if (strcasecmp(suffix, "s"))
|
||||||
fileType = "assembly";
|
fileType = "assembly";
|
||||||
|
}
|
||||||
|
else
|
||||||
|
if (strcasecmp(suffix, "ptx"))
|
||||||
|
fileType = "assembly";
|
||||||
break;
|
break;
|
||||||
case Bitcode:
|
case Bitcode:
|
||||||
if (strcasecmp(suffix, "bc"))
|
if (strcasecmp(suffix, "bc"))
|
||||||
@@ -1057,6 +1121,11 @@ Module::writeBitcode(llvm::Module *module, const char *outFileName) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
llvm::raw_fd_ostream fos(fd, (fd != 1), false);
|
llvm::raw_fd_ostream fos(fd, (fd != 1), false);
|
||||||
|
if (g->target->getISA() == Target::NVPTX)
|
||||||
|
{
|
||||||
|
const std::string dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64";
|
||||||
|
module->setDataLayout(dl_string);
|
||||||
|
}
|
||||||
llvm::WriteBitcodeToFile(module, fos);
|
llvm::WriteBitcodeToFile(module, fos);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
@@ -2095,6 +2164,24 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre
|
|||||||
opts.addMacroDef(g->cppArgs[i].substr(2));
|
opts.addMacroDef(g->cppArgs[i].substr(2));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (g->target->getISA() == Target::NVPTX)
|
||||||
|
{
|
||||||
|
opts.addMacroDef("__NVPTX__");
|
||||||
|
opts.addMacroDef("programIndex=__programIndex()");
|
||||||
|
opts.addMacroDef("cif=if");
|
||||||
|
opts.addMacroDef("cfor=for");
|
||||||
|
opts.addMacroDef("cwhile=while");
|
||||||
|
opts.addMacroDef("ccontinue=continue");
|
||||||
|
opts.addMacroDef("cdo=do");
|
||||||
|
opts.addMacroDef("taskIndex0=__taskIndex0()");
|
||||||
|
opts.addMacroDef("taskIndex1=__taskIndex1()");
|
||||||
|
opts.addMacroDef("taskIndex2=__taskIndex2()");
|
||||||
|
opts.addMacroDef("taskIndex=__taskIndex()");
|
||||||
|
opts.addMacroDef("taskCount0=__taskCount0()");
|
||||||
|
opts.addMacroDef("taskCount1=__taskCount1()");
|
||||||
|
opts.addMacroDef("taskCount2=__taskCount2()");
|
||||||
|
opts.addMacroDef("taskCount=__taskCount()");
|
||||||
|
}
|
||||||
|
|
||||||
#if defined(LLVM_3_1)
|
#if defined(LLVM_3_1)
|
||||||
inst.getLangOpts().BCPLComment = 1;
|
inst.getLangOpts().BCPLComment = 1;
|
||||||
@@ -2540,6 +2627,29 @@ lCreateDispatchModule(std::map<std::string, FunctionTargetVariants> &functions)
|
|||||||
return module;
|
return module;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static std::string lCBEMangle(const std::string &S) {
|
||||||
|
std::string Result;
|
||||||
|
|
||||||
|
for (unsigned i = 0, e = S.size(); i != e; ++i) {
|
||||||
|
if (i+1 != e && ((S[i] == '>' && S[i+1] == '>') ||
|
||||||
|
(S[i] == '<' && S[i+1] == '<'))) {
|
||||||
|
Result += '_';
|
||||||
|
Result += 'A'+(S[i]&15);
|
||||||
|
Result += 'A'+((S[i]>>4)&15);
|
||||||
|
Result += '_';
|
||||||
|
i++;
|
||||||
|
} else if (isalnum(S[i]) || S[i] == '_' || S[i] == '<' || S[i] == '>') {
|
||||||
|
Result += S[i];
|
||||||
|
} else {
|
||||||
|
Result += '_';
|
||||||
|
Result += 'A'+(S[i]&15);
|
||||||
|
Result += 'A'+((S[i]>>4)&15);
|
||||||
|
Result += '_';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return Result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
int
|
int
|
||||||
Module::CompileAndOutput(const char *srcFile,
|
Module::CompileAndOutput(const char *srcFile,
|
||||||
@@ -2563,6 +2673,32 @@ Module::CompileAndOutput(const char *srcFile,
|
|||||||
|
|
||||||
m = new Module(srcFile);
|
m = new Module(srcFile);
|
||||||
if (m->CompileFile() == 0) {
|
if (m->CompileFile() == 0) {
|
||||||
|
|
||||||
|
/* NVPTX:
|
||||||
|
* for PTX target replace '.' with '_' in all global variables
|
||||||
|
* a PTX identifier name must match [a-zA-Z$_][a-zA-Z$_0-9]*
|
||||||
|
*/
|
||||||
|
if (g->target->getISA() == Target::NVPTX)
|
||||||
|
{
|
||||||
|
/* mangle global variables names */
|
||||||
|
{
|
||||||
|
llvm::Module::global_iterator I = m->module->global_begin(), E = m->module->global_end();
|
||||||
|
for (; I != E; I++)
|
||||||
|
I->setName(lCBEMangle(I->getName()));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* mangle functions names */
|
||||||
|
{
|
||||||
|
llvm::Module::iterator I = m->module->begin(), E = m->module->end();
|
||||||
|
for (; I != E; I++)
|
||||||
|
{
|
||||||
|
std::string str = I->getName();
|
||||||
|
if (str.find("operator") != std::string::npos)
|
||||||
|
I->setName(lCBEMangle(str));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (outputType == CXX) {
|
if (outputType == CXX) {
|
||||||
if (target == NULL || strncmp(target, "generic-", 8) != 0) {
|
if (target == NULL || strncmp(target, "generic-", 8) != 0) {
|
||||||
Error(SourcePos(), "When generating C++ output, one of the \"generic-*\" "
|
Error(SourcePos(), "When generating C++ output, one of the \"generic-*\" "
|
||||||
@@ -2765,4 +2901,5 @@ Module::CompileAndOutput(const char *srcFile,
|
|||||||
|
|
||||||
return errorCount > 0;
|
return errorCount > 0;
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|||||||
20
nvptxcc
Executable file
20
nvptxcc
Executable file
@@ -0,0 +1,20 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
PATH=$ISPC_HOME/examples_ptx/ptxcc:$ISPC_HOME/examples_ptx/ptxgen:$PATH
|
||||||
|
PTXCC=ptxcc
|
||||||
|
ARGS=${@:2}
|
||||||
|
if [ "$NVVM" == "1" ];
|
||||||
|
then
|
||||||
|
LLVM32=$HOME/usr/local/llvm/bin-3.2
|
||||||
|
LLVMDIS=$LLVM32/bin/llvm-dis
|
||||||
|
PTXGEN=$ISPC_HOME/examples_ptx/ptxgen/ptxgen
|
||||||
|
$($LLVMDIS $1 -o $1.ll) && $($PTXGEN $1.ll > $1.ptx) && \
|
||||||
|
$($PTXCC $1.ptx -o $1.o -Xnvcc="-G") && \
|
||||||
|
$(nvcc test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $1.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt $ARGS);
|
||||||
|
else
|
||||||
|
$($PTXCC $1 -o $1.o -Xnvcc="-G") && \
|
||||||
|
$(nvcc test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $1.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt $ARGS);
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
174
opt.cpp
174
opt.cpp
@@ -133,6 +133,7 @@ static llvm::Pass *CreateDebugPass(char * output);
|
|||||||
static llvm::Pass *CreateReplaceStdlibShiftPass();
|
static llvm::Pass *CreateReplaceStdlibShiftPass();
|
||||||
|
|
||||||
static llvm::Pass *CreateFixBooleanSelectPass();
|
static llvm::Pass *CreateFixBooleanSelectPass();
|
||||||
|
static llvm::Pass *CreatePromoteLocalToPrivatePass();
|
||||||
|
|
||||||
#define DEBUG_START_PASS(NAME) \
|
#define DEBUG_START_PASS(NAME) \
|
||||||
if (g->debugPrint && \
|
if (g->debugPrint && \
|
||||||
@@ -496,7 +497,11 @@ Optimize(llvm::Module *module, int optLevel) {
|
|||||||
// run absolutely no optimizations, since the front-end needs us to
|
// run absolutely no optimizations, since the front-end needs us to
|
||||||
// take the various __pseudo_* functions it has emitted and turn
|
// take the various __pseudo_* functions it has emitted and turn
|
||||||
// them into something that can actually execute.
|
// them into something that can actually execute.
|
||||||
|
|
||||||
|
if (g->opt.disableGatherScatterOptimizations == false &&
|
||||||
|
g->target->getVectorWidth() > 1)
|
||||||
optPM.add(CreateImproveMemoryOpsPass(), 100);
|
optPM.add(CreateImproveMemoryOpsPass(), 100);
|
||||||
|
|
||||||
if (g->opt.disableHandlePseudoMemoryOps == false)
|
if (g->opt.disableHandlePseudoMemoryOps == false)
|
||||||
optPM.add(CreateReplacePseudoMemoryOpsPass());
|
optPM.add(CreateReplacePseudoMemoryOpsPass());
|
||||||
|
|
||||||
@@ -519,6 +524,8 @@ Optimize(llvm::Module *module, int optLevel) {
|
|||||||
llvm::initializeInstrumentation(*registry);
|
llvm::initializeInstrumentation(*registry);
|
||||||
llvm::initializeTarget(*registry);
|
llvm::initializeTarget(*registry);
|
||||||
|
|
||||||
|
if (g->target->getISA() == Target::NVPTX)
|
||||||
|
optPM.add(CreatePromoteLocalToPrivatePass());
|
||||||
optPM.add(llvm::createGlobalDCEPass(), 185);
|
optPM.add(llvm::createGlobalDCEPass(), 185);
|
||||||
|
|
||||||
// Setup to use LLVM default AliasAnalysis
|
// Setup to use LLVM default AliasAnalysis
|
||||||
@@ -577,7 +584,10 @@ Optimize(llvm::Module *module, int optLevel) {
|
|||||||
optPM.add(llvm::createGlobalOptimizerPass());
|
optPM.add(llvm::createGlobalOptimizerPass());
|
||||||
optPM.add(llvm::createReassociatePass());
|
optPM.add(llvm::createReassociatePass());
|
||||||
optPM.add(llvm::createIPConstantPropagationPass());
|
optPM.add(llvm::createIPConstantPropagationPass());
|
||||||
|
|
||||||
|
if (g->target->getISA() != Target::NVPTX)
|
||||||
optPM.add(CreateReplaceStdlibShiftPass(),229);
|
optPM.add(CreateReplaceStdlibShiftPass(),229);
|
||||||
|
|
||||||
optPM.add(llvm::createDeadArgEliminationPass(),230);
|
optPM.add(llvm::createDeadArgEliminationPass(),230);
|
||||||
optPM.add(llvm::createInstructionCombiningPass());
|
optPM.add(llvm::createInstructionCombiningPass());
|
||||||
optPM.add(llvm::createCFGSimplificationPass());
|
optPM.add(llvm::createCFGSimplificationPass());
|
||||||
@@ -689,6 +699,111 @@ Optimize(llvm::Module *module, int optLevel) {
|
|||||||
|
|
||||||
// Should be the last
|
// Should be the last
|
||||||
optPM.add(CreateFixBooleanSelectPass(), 400);
|
optPM.add(CreateFixBooleanSelectPass(), 400);
|
||||||
|
|
||||||
|
if (g->target->getISA() == Target::NVPTX)
|
||||||
|
{
|
||||||
|
optPM.add(llvm::createGlobalDCEPass());
|
||||||
|
|
||||||
|
optPM.add(llvm::createTypeBasedAliasAnalysisPass());
|
||||||
|
optPM.add(llvm::createBasicAliasAnalysisPass());
|
||||||
|
optPM.add(llvm::createCFGSimplificationPass());
|
||||||
|
// Here clang has an experimental pass SROAPass instead of
|
||||||
|
// ScalarReplAggregatesPass. We should add it in the future.
|
||||||
|
optPM.add(llvm::createScalarReplAggregatesPass());
|
||||||
|
optPM.add(llvm::createEarlyCSEPass());
|
||||||
|
optPM.add(llvm::createLowerExpectIntrinsicPass());
|
||||||
|
optPM.add(llvm::createTypeBasedAliasAnalysisPass());
|
||||||
|
optPM.add(llvm::createBasicAliasAnalysisPass());
|
||||||
|
|
||||||
|
// Early optimizations to try to reduce the total amount of code to
|
||||||
|
// work with if we can
|
||||||
|
optPM.add(llvm::createReassociatePass());
|
||||||
|
optPM.add(llvm::createConstantPropagationPass());
|
||||||
|
optPM.add(llvm::createDeadInstEliminationPass());
|
||||||
|
optPM.add(llvm::createCFGSimplificationPass());
|
||||||
|
|
||||||
|
optPM.add(llvm::createPromoteMemoryToRegisterPass());
|
||||||
|
optPM.add(llvm::createAggressiveDCEPass());
|
||||||
|
|
||||||
|
|
||||||
|
optPM.add(llvm::createInstructionCombiningPass());
|
||||||
|
optPM.add(llvm::createDeadInstEliminationPass());
|
||||||
|
|
||||||
|
// On to more serious optimizations
|
||||||
|
optPM.add(llvm::createInstructionCombiningPass());
|
||||||
|
optPM.add(llvm::createCFGSimplificationPass());
|
||||||
|
optPM.add(llvm::createPromoteMemoryToRegisterPass());
|
||||||
|
optPM.add(llvm::createGlobalOptimizerPass());
|
||||||
|
optPM.add(llvm::createReassociatePass());
|
||||||
|
optPM.add(llvm::createIPConstantPropagationPass());
|
||||||
|
|
||||||
|
optPM.add(llvm::createDeadArgEliminationPass());
|
||||||
|
optPM.add(llvm::createInstructionCombiningPass());
|
||||||
|
optPM.add(llvm::createCFGSimplificationPass());
|
||||||
|
optPM.add(llvm::createPruneEHPass());
|
||||||
|
optPM.add(llvm::createFunctionAttrsPass());
|
||||||
|
optPM.add(llvm::createFunctionInliningPass());
|
||||||
|
optPM.add(llvm::createConstantPropagationPass());
|
||||||
|
optPM.add(llvm::createDeadInstEliminationPass());
|
||||||
|
optPM.add(llvm::createCFGSimplificationPass());
|
||||||
|
|
||||||
|
optPM.add(llvm::createArgumentPromotionPass());
|
||||||
|
#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3)
|
||||||
|
// Starting from 3.4 this functionality was moved to
|
||||||
|
// InstructionCombiningPass. See r184459 for details.
|
||||||
|
optPM.add(llvm::createSimplifyLibCallsPass());
|
||||||
|
#endif
|
||||||
|
optPM.add(llvm::createAggressiveDCEPass());
|
||||||
|
optPM.add(llvm::createInstructionCombiningPass());
|
||||||
|
optPM.add(llvm::createJumpThreadingPass());
|
||||||
|
optPM.add(llvm::createCFGSimplificationPass());
|
||||||
|
optPM.add(llvm::createInstructionCombiningPass());
|
||||||
|
optPM.add(llvm::createTailCallEliminationPass());
|
||||||
|
|
||||||
|
optPM.add(llvm::createInstructionCombiningPass());
|
||||||
|
|
||||||
|
optPM.add(llvm::createFunctionInliningPass());
|
||||||
|
optPM.add(llvm::createConstantPropagationPass());
|
||||||
|
|
||||||
|
optPM.add(llvm::createInstructionCombiningPass());
|
||||||
|
|
||||||
|
optPM.add(llvm::createIPSCCPPass());
|
||||||
|
optPM.add(llvm::createDeadArgEliminationPass());
|
||||||
|
optPM.add(llvm::createAggressiveDCEPass());
|
||||||
|
optPM.add(llvm::createInstructionCombiningPass());
|
||||||
|
optPM.add(llvm::createCFGSimplificationPass());
|
||||||
|
|
||||||
|
optPM.add(llvm::createFunctionInliningPass());
|
||||||
|
optPM.add(llvm::createArgumentPromotionPass());
|
||||||
|
optPM.add(llvm::createInstructionCombiningPass());
|
||||||
|
optPM.add(llvm::createCFGSimplificationPass());
|
||||||
|
optPM.add(llvm::createReassociatePass());
|
||||||
|
optPM.add(llvm::createLoopRotatePass());
|
||||||
|
optPM.add(llvm::createLICMPass());
|
||||||
|
// optPM.add(llvm::createLoopUnswitchPass(false));
|
||||||
|
#if 1
|
||||||
|
optPM.add(llvm::createInstructionCombiningPass());
|
||||||
|
optPM.add(llvm::createIndVarSimplifyPass());
|
||||||
|
optPM.add(llvm::createLoopIdiomPass());
|
||||||
|
optPM.add(llvm::createLoopDeletionPass());
|
||||||
|
optPM.add(llvm::createLoopUnrollPass());
|
||||||
|
optPM.add(llvm::createGVNPass());
|
||||||
|
optPM.add(llvm::createMemCpyOptPass());
|
||||||
|
optPM.add(llvm::createSCCPPass());
|
||||||
|
optPM.add(llvm::createInstructionCombiningPass());
|
||||||
|
optPM.add(llvm::createJumpThreadingPass());
|
||||||
|
optPM.add(llvm::createCorrelatedValuePropagationPass());
|
||||||
|
optPM.add(llvm::createDeadStoreEliminationPass());
|
||||||
|
optPM.add(llvm::createAggressiveDCEPass());
|
||||||
|
optPM.add(llvm::createCFGSimplificationPass());
|
||||||
|
optPM.add(llvm::createInstructionCombiningPass());
|
||||||
|
optPM.add(llvm::createFunctionInliningPass());
|
||||||
|
optPM.add(llvm::createAggressiveDCEPass());
|
||||||
|
optPM.add(llvm::createStripDeadPrototypesPass());
|
||||||
|
optPM.add(llvm::createGlobalDCEPass());
|
||||||
|
optPM.add(llvm::createConstantMergePass());
|
||||||
|
#endif
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Finish up by making sure we didn't mess anything up in the IR along
|
// Finish up by making sure we didn't mess anything up in the IR along
|
||||||
@@ -5267,4 +5382,63 @@ CreateFixBooleanSelectPass() {
|
|||||||
return new FixBooleanSelectPass();
|
return new FixBooleanSelectPass();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Detect addrspace(3)
|
||||||
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
class PromoteLocalToPrivatePass: public llvm::BasicBlockPass
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
static char ID; // Pass identification, replacement for typeid
|
||||||
|
PromoteLocalToPrivatePass() : BasicBlockPass(ID) {}
|
||||||
|
|
||||||
|
bool runOnBasicBlock(llvm::BasicBlock &BB);
|
||||||
|
};
|
||||||
|
|
||||||
|
char PromoteLocalToPrivatePass::ID = 0;
|
||||||
|
|
||||||
|
bool
|
||||||
|
PromoteLocalToPrivatePass::runOnBasicBlock(llvm::BasicBlock &BB)
|
||||||
|
{
|
||||||
|
std::vector<llvm::AllocaInst*> Allocas;
|
||||||
|
|
||||||
|
bool modifiedAny = false;
|
||||||
|
|
||||||
|
llvm::Function *cvtFunc = m->module->getFunction("__cvt_loc2gen_var");
|
||||||
|
|
||||||
|
// Find allocas that are safe to promote, by looking at all instructions in
|
||||||
|
// the entry node
|
||||||
|
for (llvm::BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
|
||||||
|
{
|
||||||
|
llvm::Instruction *inst = &*I;
|
||||||
|
if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst))
|
||||||
|
{
|
||||||
|
llvm::Function *func = ci->getCalledFunction();
|
||||||
|
if (cvtFunc && (cvtFunc == func))
|
||||||
|
{
|
||||||
|
#if 0
|
||||||
|
fprintf(stderr , "--found cvt-- name= %s \n",
|
||||||
|
I->getName().str().c_str());
|
||||||
|
#endif
|
||||||
|
llvm::AllocaInst *alloca = new llvm::AllocaInst(LLVMTypes::Int64Type, "opt_loc2var", ci);
|
||||||
|
assert(alloca != NULL);
|
||||||
|
#if 0
|
||||||
|
const int align = 8; // g->target->getNativeVectorAlignment();
|
||||||
|
alloca->setAlignment(align);
|
||||||
|
#endif
|
||||||
|
ci->replaceAllUsesWith(alloca);
|
||||||
|
modifiedAny = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return modifiedAny;
|
||||||
|
}
|
||||||
|
|
||||||
|
static llvm::Pass *
|
||||||
|
CreatePromoteLocalToPrivatePass() {
|
||||||
|
return new PromoteLocalToPrivatePass();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
14
ptxtestcc.sh
Executable file
14
ptxtestcc.sh
Executable file
@@ -0,0 +1,14 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
LLC=$HOME/usr/local/llvm/bin-trunk/bin/llc
|
||||||
|
DIS=$HOME/usr/local/llvm/bin-3.2/bin/llvm-dis
|
||||||
|
|
||||||
|
ISPC=ispc
|
||||||
|
PTXCC=ptxcc
|
||||||
|
PTXGEN=~/ptxgen
|
||||||
|
$(cat $1 |grep -v 'width'|$ISPC --target=nvptx --emit-llvm -o -|$LLC -march=nvptx64 -mcpu=sm_35 -o $1.ptx) && \
|
||||||
|
#$(cat $1 |grep -v 'width'|$ISPC --target=nvptx --emit-llvm -o -|$DIS -o $1_32_ptx.ll && $PTXGEN $1_32_ptx.ll > $1.ptx) && \
|
||||||
|
$($PTXCC $1.ptx -Xptxas=-v -o $1.ptx.o) && \
|
||||||
|
nvcc -o test_nvptx test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $1.ptx.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt -DTEST_SIG=$2
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
37
run_tests.py
37
run_tests.py
@@ -204,6 +204,8 @@ def run_test(testname):
|
|||||||
return (1, 0)
|
return (1, 0)
|
||||||
else:
|
else:
|
||||||
global is_generic_target
|
global is_generic_target
|
||||||
|
global is_nvptx_target
|
||||||
|
global is_nvptx_nvvm
|
||||||
if is_windows:
|
if is_windows:
|
||||||
if is_generic_target:
|
if is_generic_target:
|
||||||
obj_name = "%s.cpp" % os.path.basename(filename)
|
obj_name = "%s.cpp" % os.path.basename(filename)
|
||||||
@@ -218,6 +220,13 @@ def run_test(testname):
|
|||||||
else:
|
else:
|
||||||
if is_generic_target:
|
if is_generic_target:
|
||||||
obj_name = "%s.cpp" % testname
|
obj_name = "%s.cpp" % testname
|
||||||
|
elif is_nvptx_target:
|
||||||
|
if os.environ.get("NVVM") == "1":
|
||||||
|
is_nvptx_nvvm = True
|
||||||
|
obj_name = "%s.bc" % testname
|
||||||
|
else:
|
||||||
|
obj_name = "%s.ptx" % testname
|
||||||
|
is_nvptx_nvvm = False
|
||||||
else:
|
else:
|
||||||
obj_name = "%s.o" % testname
|
obj_name = "%s.o" % testname
|
||||||
exe_name = "%s.run" % testname
|
exe_name = "%s.run" % testname
|
||||||
@@ -248,13 +257,32 @@ def run_test(testname):
|
|||||||
cc_cmd += ' -Wl,-no_pie'
|
cc_cmd += ' -Wl,-no_pie'
|
||||||
if should_fail:
|
if should_fail:
|
||||||
cc_cmd += " -DEXPECT_FAILURE"
|
cc_cmd += " -DEXPECT_FAILURE"
|
||||||
|
if is_nvptx_target:
|
||||||
|
nvptxcc_exe = "nvptxcc"
|
||||||
|
nvptxcc_exe_rel = add_prefix(nvptxcc_exe)
|
||||||
|
cc_cmd = "%s %s -DTEST_SIG=%d -o %s" % \
|
||||||
|
(nvptxcc_exe_rel, obj_name, match, exe_name)
|
||||||
|
|
||||||
ispc_cmd = ispc_exe_rel + " --woff %s -o %s --arch=%s --target=%s" % \
|
ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --arch=%s --target=%s" % \
|
||||||
(filename, obj_name, options.arch, options.target)
|
(filename, obj_name, options.arch, options.target)
|
||||||
if options.no_opt:
|
if options.no_opt:
|
||||||
ispc_cmd += " -O0"
|
ispc_cmd += " -O0"
|
||||||
if is_generic_target:
|
if is_generic_target:
|
||||||
ispc_cmd += " --emit-c++ --c++-include-file=%s" % add_prefix(options.include_file)
|
ispc_cmd += " --emit-c++ --c++-include-file=%s" % add_prefix(options.include_file)
|
||||||
|
if is_nvptx_target:
|
||||||
|
filename4ptx = filename+".ptx.parsed_ispc"
|
||||||
|
grep_cmd = "grep -v 'export uniform int width' %s > %s " % \
|
||||||
|
(filename, filename4ptx)
|
||||||
|
if options.verbose:
|
||||||
|
print "Grepping: %s" % grep_cmd
|
||||||
|
sp = subprocess.Popen(grep_cmd, shell=True)
|
||||||
|
sp.communicate()
|
||||||
|
if is_nvptx_nvvm:
|
||||||
|
ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --emit-llvm --target=%s" % \
|
||||||
|
(filename4ptx, obj_name, options.target)
|
||||||
|
else:
|
||||||
|
ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --emit-asm --target=%s" % \
|
||||||
|
(filename4ptx, obj_name, options.target)
|
||||||
|
|
||||||
# compile the ispc code, make the executable, and run it...
|
# compile the ispc code, make the executable, and run it...
|
||||||
(compile_error, run_error) = run_cmds([ispc_cmd, cc_cmd],
|
(compile_error, run_error) = run_cmds([ispc_cmd, cc_cmd],
|
||||||
@@ -269,7 +297,7 @@ def run_test(testname):
|
|||||||
basename = os.path.basename(filename)
|
basename = os.path.basename(filename)
|
||||||
os.unlink("%s.pdb" % basename)
|
os.unlink("%s.pdb" % basename)
|
||||||
os.unlink("%s.ilk" % basename)
|
os.unlink("%s.ilk" % basename)
|
||||||
os.unlink(obj_name)
|
# os.unlink(obj_name)
|
||||||
except:
|
except:
|
||||||
None
|
None
|
||||||
|
|
||||||
@@ -290,6 +318,7 @@ def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test
|
|||||||
ispc_exe = glob_var[3]
|
ispc_exe = glob_var[3]
|
||||||
global is_generic_target
|
global is_generic_target
|
||||||
is_generic_target = glob_var[4]
|
is_generic_target = glob_var[4]
|
||||||
|
global is_nvptx_target
|
||||||
global run_tests_log
|
global run_tests_log
|
||||||
run_tests_log = glob_var[5]
|
run_tests_log = glob_var[5]
|
||||||
|
|
||||||
@@ -505,6 +534,8 @@ def run_tests(options1, args, print_version):
|
|||||||
|
|
||||||
if options.target == 'neon':
|
if options.target == 'neon':
|
||||||
options.arch = 'arm'
|
options.arch = 'arm'
|
||||||
|
if options.target == "nvptx":
|
||||||
|
options.arch = "nvptx64"
|
||||||
|
|
||||||
# use relative path to not depend on host directory, which may possibly
|
# use relative path to not depend on host directory, which may possibly
|
||||||
# have white spaces and unicode characters.
|
# have white spaces and unicode characters.
|
||||||
@@ -531,8 +562,10 @@ def run_tests(options1, args, print_version):
|
|||||||
ispc_exe += " " + options.ispc_flags
|
ispc_exe += " " + options.ispc_flags
|
||||||
|
|
||||||
global is_generic_target
|
global is_generic_target
|
||||||
|
global is_nvptx_target
|
||||||
is_generic_target = (options.target.find("generic-") != -1 and
|
is_generic_target = (options.target.find("generic-") != -1 and
|
||||||
options.target != "generic-1" and options.target != "generic-x1")
|
options.target != "generic-1" and options.target != "generic-x1")
|
||||||
|
is_nvptx_target = (options.target.find("nvptx") != -1)
|
||||||
if is_generic_target and options.include_file == None:
|
if is_generic_target and options.include_file == None:
|
||||||
if options.target == "generic-4" or options.target == "generic-x4":
|
if options.target == "generic-4" or options.target == "generic-x4":
|
||||||
error("No generics #include specified; using examples/intrinsics/sse4.h\n", 2)
|
error("No generics #include specified; using examples/intrinsics/sse4.h\n", 2)
|
||||||
|
|||||||
157
stdlib.ispc
157
stdlib.ispc
@@ -57,6 +57,31 @@
|
|||||||
#error Unknown value of ISPC_MASK_BITS
|
#error Unknown value of ISPC_MASK_BITS
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
///////////////////////////////////////////////////////////////////////////
|
||||||
|
// CUDA Specific primitives
|
||||||
|
//
|
||||||
|
/***************/
|
||||||
|
|
||||||
|
__declspec(safe,cost0) static inline varying int __programIndex() { return __program_index(); }
|
||||||
|
__declspec(safe,cost0) static inline uniform int __programCount() { return __program_count(); }
|
||||||
|
__declspec(safe,cost0) static inline uniform int __warpIndex() { return __warp_index(); }
|
||||||
|
|
||||||
|
/***************/
|
||||||
|
|
||||||
|
__declspec(safe,cost0) static inline uniform int __taskIndex0() { return __task_index0(); }
|
||||||
|
__declspec(safe,cost0) static inline uniform int __taskIndex1() { return __task_index1(); }
|
||||||
|
__declspec(safe,cost0) static inline uniform int __taskIndex2() { return __task_index2(); }
|
||||||
|
__declspec(safe,cost0) static inline uniform int __taskIndex () { return __task_index (); }
|
||||||
|
|
||||||
|
/***************/
|
||||||
|
|
||||||
|
__declspec(safe,cost0) static inline uniform int __taskCount0() { return __task_count0(); }
|
||||||
|
__declspec(safe,cost0) static inline uniform int __taskCount1() { return __task_count1(); }
|
||||||
|
__declspec(safe,cost0) static inline uniform int __taskCount2() { return __task_count2(); }
|
||||||
|
__declspec(safe,cost0) static inline uniform int __taskCount () { return __task_count (); }
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////
|
||||||
// Low level primitives
|
// Low level primitives
|
||||||
|
|
||||||
@@ -464,6 +489,9 @@ __declspec(safe)
|
|||||||
static inline uniform int popcnt(bool v) {
|
static inline uniform int popcnt(bool v) {
|
||||||
// As with any() and all(), only count across the active lanes
|
// As with any() and all(), only count across the active lanes
|
||||||
#if (ISPC_MASK_BITS == 1)
|
#if (ISPC_MASK_BITS == 1)
|
||||||
|
if (__is_nvptx_target)
|
||||||
|
return __popcnt_int64(__movmsk_ptx(v & __mask));
|
||||||
|
else
|
||||||
return __popcnt_int64(__movmsk(v & __mask));
|
return __popcnt_int64(__movmsk(v & __mask));
|
||||||
#else
|
#else
|
||||||
return __popcnt_int64(__movmsk((UIntMaskType)__sext_varying_bool(v) & __mask));
|
return __popcnt_int64(__movmsk((UIntMaskType)__sext_varying_bool(v) & __mask));
|
||||||
@@ -1226,6 +1254,11 @@ packed_store_active(uniform int a[], int vals) {
|
|||||||
return __packed_store_active(a, vals, (IntMaskType)__mask);
|
return __packed_store_active(a, vals, (IntMaskType)__mask);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline uniform int
|
||||||
|
packed_store_active(bool active, uniform int a[], int vals) {
|
||||||
|
return __packed_store_active(a, vals, (IntMaskType)(-(int)active));
|
||||||
|
}
|
||||||
|
|
||||||
static inline uniform int
|
static inline uniform int
|
||||||
packed_store_active2(uniform int a[], int vals) {
|
packed_store_active2(uniform int a[], int vals) {
|
||||||
return __packed_store_active2(a, vals, (IntMaskType)__mask);
|
return __packed_store_active2(a, vals, (IntMaskType)__mask);
|
||||||
@@ -1236,6 +1269,9 @@ packed_store_active2(uniform int a[], int vals) {
|
|||||||
// System information
|
// System information
|
||||||
|
|
||||||
static inline uniform int num_cores() {
|
static inline uniform int num_cores() {
|
||||||
|
if (__is_nvptx_target)
|
||||||
|
return 15*32; // K20/K20X/K40 - 15SMX x 32 warps/smx (max is 64 warps/smx)
|
||||||
|
else
|
||||||
return __num_cores();
|
return __num_cores();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1783,7 +1819,7 @@ static inline void memory_barrier() {
|
|||||||
__memory_barrier();
|
__memory_barrier();
|
||||||
}
|
}
|
||||||
|
|
||||||
#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \
|
#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \
|
||||||
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
|
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
|
||||||
TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
|
TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
|
||||||
return ret; \
|
return ret; \
|
||||||
@@ -1794,6 +1830,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
|
|||||||
return ret; \
|
return ret; \
|
||||||
} \
|
} \
|
||||||
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
|
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
|
||||||
|
if (__is_nvptx_target) { \
|
||||||
|
TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
|
||||||
|
return ret; \
|
||||||
|
} else { \
|
||||||
uniform TA * uniform ptrArray[programCount]; \
|
uniform TA * uniform ptrArray[programCount]; \
|
||||||
ptrArray[programIndex] = ptr; \
|
ptrArray[programIndex] = ptr; \
|
||||||
TA ret; \
|
TA ret; \
|
||||||
@@ -1805,9 +1845,14 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
|
|||||||
} \
|
} \
|
||||||
return ret; \
|
return ret; \
|
||||||
} \
|
} \
|
||||||
|
} \
|
||||||
|
|
||||||
#define DEFINE_ATOMIC_SWAP(TA,TB) \
|
#define DEFINE_ATOMIC_SWAP(TA,TB,MASKTYPE,TC) \
|
||||||
static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
|
static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
|
||||||
|
if (__is_nvptx_target) { \
|
||||||
|
TA ret = __atomic_swap_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
|
||||||
|
return ret; \
|
||||||
|
} else { \
|
||||||
uniform int i = 0; \
|
uniform int i = 0; \
|
||||||
TA ret[programCount]; \
|
TA ret[programCount]; \
|
||||||
TA memVal; \
|
TA memVal; \
|
||||||
@@ -1839,12 +1884,17 @@ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
|
|||||||
ret[lastSwap] = memVal; \
|
ret[lastSwap] = memVal; \
|
||||||
return ret[programIndex]; \
|
return ret[programIndex]; \
|
||||||
}\
|
}\
|
||||||
|
} \
|
||||||
static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \
|
static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \
|
||||||
uniform TA value) { \
|
uniform TA value) { \
|
||||||
uniform TA ret = __atomic_swap_uniform_##TB##_global(ptr, value); \
|
uniform TA ret = __atomic_swap_uniform_##TB##_global(ptr, value); \
|
||||||
return ret; \
|
return ret; \
|
||||||
} \
|
} \
|
||||||
static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
|
static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
|
||||||
|
if (__is_nvptx_target) { \
|
||||||
|
TA ret = __atomic_swap_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
|
||||||
|
return ret; \
|
||||||
|
} else { \
|
||||||
uniform TA * uniform ptrArray[programCount]; \
|
uniform TA * uniform ptrArray[programCount]; \
|
||||||
ptrArray[programIndex] = ptr; \
|
ptrArray[programIndex] = ptr; \
|
||||||
TA ret; \
|
TA ret; \
|
||||||
@@ -1856,8 +1906,9 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
|
|||||||
} \
|
} \
|
||||||
return ret; \
|
return ret; \
|
||||||
}\
|
}\
|
||||||
|
} \
|
||||||
|
|
||||||
#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \
|
#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \
|
||||||
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
|
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
|
||||||
uniform TA oneval = reduce_##OPA(value); \
|
uniform TA oneval = reduce_##OPA(value); \
|
||||||
TA ret; \
|
TA ret; \
|
||||||
@@ -1872,6 +1923,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
|
|||||||
} \
|
} \
|
||||||
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
|
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
|
||||||
TA value) { \
|
TA value) { \
|
||||||
|
if (__is_nvptx_target) { \
|
||||||
|
TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
|
||||||
|
return ret; \
|
||||||
|
} else { \
|
||||||
uniform TA * uniform ptrArray[programCount]; \
|
uniform TA * uniform ptrArray[programCount]; \
|
||||||
ptrArray[programIndex] = ptr; \
|
ptrArray[programIndex] = ptr; \
|
||||||
TA ret; \
|
TA ret; \
|
||||||
@@ -1882,57 +1937,58 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
|
|||||||
ret = insert(ret, i, r); \
|
ret = insert(ret, i, r); \
|
||||||
} \
|
} \
|
||||||
return ret; \
|
return ret; \
|
||||||
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType)
|
DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType,int64)
|
||||||
DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType)
|
DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType,int64)
|
||||||
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
|
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType,int64)
|
||||||
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
|
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType,int64)
|
||||||
DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType)
|
DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType,int64)
|
||||||
DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType)
|
DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType,int64)
|
||||||
DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType)
|
DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType,int64)
|
||||||
DEFINE_ATOMIC_SWAP(int32,int32)
|
DEFINE_ATOMIC_SWAP(int32,int32,IntMaskType,int64)
|
||||||
|
|
||||||
// For everything but atomic min and max, we can use the same
|
// For everything but atomic min and max, we can use the same
|
||||||
// implementations for unsigned as for signed.
|
// implementations for unsigned as for signed.
|
||||||
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType)
|
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType, unsigned int64)
|
||||||
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType)
|
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType, unsigned int64)
|
||||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
|
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType,unsigned int64)
|
||||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
|
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType,unsigned int64)
|
||||||
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType)
|
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType, unsigned int64)
|
||||||
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType)
|
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType, unsigned int64)
|
||||||
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType)
|
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType, unsigned int64)
|
||||||
DEFINE_ATOMIC_SWAP(unsigned int32,int32)
|
DEFINE_ATOMIC_SWAP(unsigned int32,int32,UIntMaskType, unsigned int64)
|
||||||
|
|
||||||
DEFINE_ATOMIC_SWAP(float,float)
|
DEFINE_ATOMIC_SWAP(float,float,IntMaskType,int64)
|
||||||
|
|
||||||
DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType)
|
DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType,int64)
|
||||||
DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType)
|
DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType,int64)
|
||||||
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
|
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType,int64)
|
||||||
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
|
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType,int64)
|
||||||
DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType)
|
DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType,int64)
|
||||||
DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType)
|
DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType,int64)
|
||||||
DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType)
|
DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType,int64)
|
||||||
DEFINE_ATOMIC_SWAP(int64,int64)
|
DEFINE_ATOMIC_SWAP(int64,int64,IntMaskType, int64)
|
||||||
|
|
||||||
// For everything but atomic min and max, we can use the same
|
// For everything but atomic min and max, we can use the same
|
||||||
// implementations for unsigned as for signed.
|
// implementations for unsigned as for signed.
|
||||||
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType)
|
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType,unsigned int64)
|
||||||
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType)
|
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType,unsigned int64)
|
||||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
|
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType,unsigned int64)
|
||||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
|
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType,unsigned int64)
|
||||||
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType)
|
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType,unsigned int64)
|
||||||
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType)
|
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType,unsigned int64)
|
||||||
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType)
|
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType,unsigned int64)
|
||||||
DEFINE_ATOMIC_SWAP(unsigned int64,int64)
|
DEFINE_ATOMIC_SWAP(unsigned int64,int64,UIntMaskType, unsigned int64)
|
||||||
|
|
||||||
DEFINE_ATOMIC_SWAP(double,double)
|
DEFINE_ATOMIC_SWAP(double,double,IntMaskType, int64)
|
||||||
|
|
||||||
#undef DEFINE_ATOMIC_OP
|
#undef DEFINE_ATOMIC_OP
|
||||||
#undef DEFINE_ATOMIC_MINMAX_OP
|
#undef DEFINE_ATOMIC_MINMAX_OP
|
||||||
#undef DEFINE_ATOMIC_SWAP
|
#undef DEFINE_ATOMIC_SWAP
|
||||||
|
|
||||||
#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \
|
#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE, TC) \
|
||||||
static inline uniform TA atomic_compare_exchange_global( \
|
static inline uniform TA atomic_compare_exchange_global( \
|
||||||
uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
|
uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
|
||||||
uniform TA ret = \
|
uniform TA ret = \
|
||||||
@@ -1947,6 +2003,10 @@ static inline TA atomic_compare_exchange_global( \
|
|||||||
} \
|
} \
|
||||||
static inline TA atomic_compare_exchange_global( \
|
static inline TA atomic_compare_exchange_global( \
|
||||||
uniform TA * varying ptr, TA oldval, TA newval) { \
|
uniform TA * varying ptr, TA oldval, TA newval) { \
|
||||||
|
if (__is_nvptx_target) { \
|
||||||
|
TA ret = __atomic_compare_exchange_varying_##TB##_global((TC)ptr, oldval, newval, (MASKTYPE)__mask); \
|
||||||
|
return ret; \
|
||||||
|
} else { \
|
||||||
uniform TA * uniform ptrArray[programCount]; \
|
uniform TA * uniform ptrArray[programCount]; \
|
||||||
ptrArray[programIndex] = ptr; \
|
ptrArray[programIndex] = ptr; \
|
||||||
TA ret; \
|
TA ret; \
|
||||||
@@ -1958,14 +2018,15 @@ static inline TA atomic_compare_exchange_global( \
|
|||||||
ret = insert(ret, i, r); \
|
ret = insert(ret, i, r); \
|
||||||
} \
|
} \
|
||||||
return ret; \
|
return ret; \
|
||||||
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType)
|
ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType,int64)
|
||||||
ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType)
|
ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType,unsigned int64)
|
||||||
ATOMIC_DECL_CMPXCHG(float, float, IntMaskType)
|
ATOMIC_DECL_CMPXCHG(float, float, IntMaskType,int64)
|
||||||
ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType)
|
ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType,int64)
|
||||||
ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType)
|
ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType,unsigned int64)
|
||||||
ATOMIC_DECL_CMPXCHG(double, double, IntMaskType)
|
ATOMIC_DECL_CMPXCHG(double, double, IntMaskType,int64)
|
||||||
|
|
||||||
#undef ATOMIC_DECL_CMPXCHG
|
#undef ATOMIC_DECL_CMPXCHG
|
||||||
|
|
||||||
@@ -2032,12 +2093,20 @@ static inline TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, TYPE value)
|
|||||||
} \
|
} \
|
||||||
static inline TYPE atomic_##NAME##_local(uniform TYPE * p, TYPE value) { \
|
static inline TYPE atomic_##NAME##_local(uniform TYPE * p, TYPE value) { \
|
||||||
TYPE ret; \
|
TYPE ret; \
|
||||||
|
if (__is_nvptx_target) { \
|
||||||
|
foreach_active (i) { \
|
||||||
|
uniform TYPE * uniform ptr = (uniform TYPE * uniform)extract((int64)p, i); \
|
||||||
|
ret = insert(ret, i, *ptr); \
|
||||||
|
*ptr = OPFUNC(*ptr, extract(value, i)); \
|
||||||
|
} \
|
||||||
|
} else { \
|
||||||
uniform TYPE * uniform ptrs[programCount]; \
|
uniform TYPE * uniform ptrs[programCount]; \
|
||||||
ptrs[programIndex] = p; \
|
ptrs[programIndex] = p; \
|
||||||
foreach_active (i) { \
|
foreach_active (i) { \
|
||||||
ret = insert(ret, i, *ptrs[i]); \
|
ret = insert(ret, i, *ptrs[i]); \
|
||||||
*ptrs[i] = OPFUNC(*ptrs[i], extract(value, i)); \
|
*ptrs[i] = OPFUNC(*ptrs[i], extract(value, i)); \
|
||||||
} \
|
} \
|
||||||
|
} \
|
||||||
return ret; \
|
return ret; \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
293
stmt.cpp
293
stmt.cpp
@@ -142,6 +142,62 @@ lHasUnsizedArrays(const Type *type) {
|
|||||||
return lHasUnsizedArrays(at->GetElementType());
|
return lHasUnsizedArrays(at->GetElementType());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static llvm::Value* lConvertToGenericPtr(FunctionEmitContext *ctx, llvm::Value *value, const SourcePos ¤tPos, const bool variable = false)
|
||||||
|
{
|
||||||
|
if (!value->getType()->isPointerTy() || g->target->getISA() != Target::NVPTX)
|
||||||
|
return value;
|
||||||
|
llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(value->getType());
|
||||||
|
const int addressSpace = pt->getAddressSpace();
|
||||||
|
if (addressSpace != 3 && addressSpace != 4)
|
||||||
|
return value;
|
||||||
|
|
||||||
|
llvm::Type *elTy = pt->getElementType();
|
||||||
|
|
||||||
|
/* convert elTy addrspace(3)* to i64* addrspace(3)* */
|
||||||
|
llvm::PointerType *Int64Ptr3 = llvm::PointerType::get(LLVMTypes::Int64Type, addressSpace);
|
||||||
|
value = ctx->BitCastInst(value, Int64Ptr3, "gep2gen_cast1");
|
||||||
|
|
||||||
|
/* convert i64* addrspace(3) to i64* */
|
||||||
|
llvm::Function *__cvt2gen = m->module->getFunction(
|
||||||
|
addressSpace == 3 ? (variable ? "__cvt_loc2gen_var" : "__cvt_loc2gen") : "__cvt_const2gen");
|
||||||
|
|
||||||
|
std::vector<llvm::Value *> __cvt2gen_args;
|
||||||
|
__cvt2gen_args.push_back(value);
|
||||||
|
value = llvm::CallInst::Create(__cvt2gen, __cvt2gen_args, variable ? "gep2gen_cvt_var" : "gep2gen_cvt", ctx->GetCurrentBasicBlock());
|
||||||
|
|
||||||
|
/* compute offset */
|
||||||
|
if (addressSpace == 3)
|
||||||
|
{
|
||||||
|
assert(elTy->isArrayTy());
|
||||||
|
const int numElTot = elTy->getArrayNumElements();
|
||||||
|
const int numEl = numElTot/4;
|
||||||
|
#if 0
|
||||||
|
fprintf(stderr, " --- detected addrspace(3) sz= %d --- \n", numEl);
|
||||||
|
#endif
|
||||||
|
llvm::ArrayType *arrTy = llvm::dyn_cast<llvm::ArrayType>(pt->getArrayElementType());
|
||||||
|
assert(arrTy != NULL);
|
||||||
|
llvm::Type *arrElTy = arrTy->getElementType();
|
||||||
|
#if 0
|
||||||
|
if (arrElTy->isArrayTy())
|
||||||
|
Error(currentPos, "Currently \"nvptx\" target doesn't support array-of-array");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* convert i64* to errElTy* */
|
||||||
|
llvm::PointerType *arrElTyPt0 = llvm::PointerType::get(arrElTy, 0);
|
||||||
|
value = ctx->BitCastInst(value, arrElTyPt0, "gep2gen_cast2");
|
||||||
|
|
||||||
|
llvm::Function *func_warp_index = m->module->getFunction("__warp_index");
|
||||||
|
llvm::Value *warpId = ctx->CallInst(func_warp_index, NULL, std::vector<llvm::Value*>(), "gep2gen_warp_index");
|
||||||
|
llvm::Value *offset = ctx->BinaryOperator(llvm::Instruction::Mul, warpId, LLVMInt32(numEl), "gep2gen_offset");
|
||||||
|
value = llvm::GetElementPtrInst::Create(value, offset, "gep2gen_offset", ctx->GetCurrentBasicBlock());
|
||||||
|
}
|
||||||
|
|
||||||
|
/* convert arrElTy* to elTy* */
|
||||||
|
llvm::PointerType *elTyPt0 = llvm::PointerType::get(elTy, 0);
|
||||||
|
value = ctx->BitCastInst(value, elTyPt0, "gep2gen_cast3");
|
||||||
|
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
|
DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||||
@@ -205,7 +261,22 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if (sym->storageClass == SC_STATIC) {
|
if (sym->storageClass == SC_STATIC) {
|
||||||
|
|
||||||
|
if (g->target->getISA() == Target::NVPTX && !sym->type->IsConstType())
|
||||||
|
PerformanceWarning(sym->pos,
|
||||||
|
"Non-constant static variable ""\"%s\" is stored in __global address sace with ""\"nvptx\" target.",
|
||||||
|
sym->name.c_str());
|
||||||
|
if (g->target->getISA() == Target::NVPTX && sym->type->IsVaryingType())
|
||||||
|
PerformanceWarning(sym->pos,
|
||||||
|
"\"const static varying\" variable ""\"%s\" is stored in __global address space with ""\"nvptx\" target.",
|
||||||
|
sym->name.c_str());
|
||||||
|
if (g->target->getISA() == Target::NVPTX && sym->type->IsUniformType())
|
||||||
|
PerformanceWarning(sym->pos,
|
||||||
|
"\"const static uniform\" variable ""\"%s\" is stored in __constant address space with ""\"nvptx\" target.",
|
||||||
|
sym->name.c_str());
|
||||||
|
|
||||||
// For static variables, we need a compile-time constant value
|
// For static variables, we need a compile-time constant value
|
||||||
// for its initializer; if there's no initializer, we use a
|
// for its initializer; if there's no initializer, we use a
|
||||||
// zero value.
|
// zero value.
|
||||||
@@ -233,19 +304,97 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
|
|||||||
if (cinit == NULL)
|
if (cinit == NULL)
|
||||||
cinit = llvm::Constant::getNullValue(llvmType);
|
cinit = llvm::Constant::getNullValue(llvmType);
|
||||||
|
|
||||||
|
int addressSpace = 0;
|
||||||
|
if (g->target->getISA() == Target::NVPTX &&
|
||||||
|
sym->type->IsConstType() &&
|
||||||
|
sym->type->IsUniformType())
|
||||||
|
addressSpace = 4;
|
||||||
|
|
||||||
// Allocate space for the static variable in global scope, so
|
// Allocate space for the static variable in global scope, so
|
||||||
// that it persists across function calls
|
// that it persists across function calls
|
||||||
sym->storagePtr =
|
sym->storagePtr =
|
||||||
new llvm::GlobalVariable(*m->module, llvmType,
|
new llvm::GlobalVariable(*m->module, llvmType,
|
||||||
sym->type->IsConstType(),
|
sym->type->IsConstType(),
|
||||||
llvm::GlobalValue::InternalLinkage, cinit,
|
llvm::GlobalValue::InternalLinkage, cinit,
|
||||||
llvm::Twine("static.") +
|
llvm::Twine("static_") +
|
||||||
llvm::Twine(sym->pos.first_line) +
|
llvm::Twine(sym->pos.first_line) +
|
||||||
llvm::Twine(".") + sym->name.c_str());
|
llvm::Twine("_") + sym->name.c_str(),
|
||||||
|
NULL,
|
||||||
|
llvm::GlobalVariable::NotThreadLocal,
|
||||||
|
addressSpace);
|
||||||
|
sym->storagePtr = lConvertToGenericPtr(ctx, sym->storagePtr, sym->pos);
|
||||||
// Tell the FunctionEmitContext about the variable
|
// Tell the FunctionEmitContext about the variable
|
||||||
ctx->EmitVariableDebugInfo(sym);
|
ctx->EmitVariableDebugInfo(sym);
|
||||||
}
|
}
|
||||||
else {
|
else if ((sym->type->IsUniformType() || sym->type->IsSOAType()) &&
|
||||||
|
/* NVPTX:
|
||||||
|
* only non-constant uniform data types are stored in shared memory
|
||||||
|
* constant uniform are automatically promoted to varying
|
||||||
|
*/
|
||||||
|
!sym->type->IsConstType() &&
|
||||||
|
#if 1
|
||||||
|
sym->type->IsArrayType() &&
|
||||||
|
#endif
|
||||||
|
g->target->getISA() == Target::NVPTX)
|
||||||
|
{
|
||||||
|
PerformanceWarning(sym->pos,
|
||||||
|
"Non-constant \"uniform\" data types might be slow with \"nvptx\" target. "
|
||||||
|
"Unless data sharing between program instances is desired, try \"const [static] uniform\", \"varying\" or \"uniform new uniform \"+\"delete\" if possible.");
|
||||||
|
|
||||||
|
/* with __shared__ memory everything must be an array */
|
||||||
|
int nel = 4;
|
||||||
|
ArrayType *nat;
|
||||||
|
bool variable = true;
|
||||||
|
if (sym->type->IsArrayType())
|
||||||
|
{
|
||||||
|
const ArrayType *at = CastType<ArrayType>(sym->type);
|
||||||
|
/* we must scale # elements by 4, because a thread-block will run 4 warps
|
||||||
|
* or 128 threads.
|
||||||
|
* ***note-to-me***:please define these value (128threads/4warps)
|
||||||
|
* in nvptx-target definition
|
||||||
|
* instead of compile-time constants
|
||||||
|
*/
|
||||||
|
nel *= at->GetElementCount();
|
||||||
|
if (sym->type->IsSOAType())
|
||||||
|
nel *= sym->type->GetSOAWidth();
|
||||||
|
nat = new ArrayType(at->GetElementType(), nel);
|
||||||
|
variable = false;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
nat = new ArrayType(sym->type, nel);
|
||||||
|
|
||||||
|
llvm::Type *llvmTypeUn = nat->LLVMType(g->ctx);
|
||||||
|
llvm::Constant *cinit = llvm::UndefValue::get(llvmTypeUn);
|
||||||
|
|
||||||
|
sym->storagePtr =
|
||||||
|
new llvm::GlobalVariable(*m->module, llvmTypeUn,
|
||||||
|
sym->type->IsConstType(),
|
||||||
|
llvm::GlobalValue::InternalLinkage,
|
||||||
|
cinit,
|
||||||
|
llvm::Twine("local_") +
|
||||||
|
llvm::Twine(sym->pos.first_line) +
|
||||||
|
llvm::Twine("_") + sym->name.c_str(),
|
||||||
|
NULL,
|
||||||
|
llvm::GlobalVariable::NotThreadLocal,
|
||||||
|
/*AddressSpace=*/3);
|
||||||
|
sym->storagePtr = lConvertToGenericPtr(ctx, sym->storagePtr, sym->pos, variable);
|
||||||
|
llvm::PointerType *ptrTy = llvm::PointerType::get(sym->type->LLVMType(g->ctx),0);
|
||||||
|
sym->storagePtr = ctx->BitCastInst(sym->storagePtr, ptrTy, "uniform_decl");
|
||||||
|
|
||||||
|
// Tell the FunctionEmitContext about the variable; must do
|
||||||
|
// this before the initializer stuff.
|
||||||
|
ctx->EmitVariableDebugInfo(sym);
|
||||||
|
|
||||||
|
if (initExpr == 0 && sym->type->IsConstType())
|
||||||
|
Error(sym->pos, "Missing initializer for const variable "
|
||||||
|
"\"%s\".", sym->name.c_str());
|
||||||
|
|
||||||
|
// And then get it initialized...
|
||||||
|
sym->parentFunction = ctx->GetFunction();
|
||||||
|
InitSymbol(sym->storagePtr, sym->type, initExpr, ctx, sym->pos);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
// For non-static variables, allocate storage on the stack
|
// For non-static variables, allocate storage on the stack
|
||||||
sym->storagePtr = ctx->AllocaInst(llvmType, sym->name.c_str());
|
sym->storagePtr = ctx->AllocaInst(llvmType, sym->name.c_str());
|
||||||
|
|
||||||
@@ -253,6 +402,10 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
|
|||||||
// this before the initializer stuff.
|
// this before the initializer stuff.
|
||||||
ctx->EmitVariableDebugInfo(sym);
|
ctx->EmitVariableDebugInfo(sym);
|
||||||
|
|
||||||
|
if (initExpr == 0 && sym->type->IsConstType())
|
||||||
|
Error(sym->pos, "Missing initializer for const variable "
|
||||||
|
"\"%s\".", sym->name.c_str());
|
||||||
|
|
||||||
// And then get it initialized...
|
// And then get it initialized...
|
||||||
sym->parentFunction = ctx->GetFunction();
|
sym->parentFunction = ctx->GetFunction();
|
||||||
InitSymbol(sym->storagePtr, sym->type, initExpr, ctx, sym->pos);
|
InitSymbol(sym->storagePtr, sym->type, initExpr, ctx, sym->pos);
|
||||||
@@ -415,6 +568,19 @@ IfStmt::EmitCode(FunctionEmitContext *ctx) const {
|
|||||||
if (testValue == NULL)
|
if (testValue == NULL)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
if (!isUniform && g->target->getISA() == Target::NVPTX)
|
||||||
|
{
|
||||||
|
/* With "nvptx" target, SIMT hardware takes care of non-uniform
|
||||||
|
* control flow. We trick ISPC to generate uniform control flow.
|
||||||
|
*/
|
||||||
|
testValue = ctx->ExtractInst(testValue, 0);
|
||||||
|
isUniform = true;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
if (isUniform) {
|
if (isUniform) {
|
||||||
ctx->StartUniformIf();
|
ctx->StartUniformIf();
|
||||||
if (doAllCheck)
|
if (doAllCheck)
|
||||||
@@ -696,7 +862,11 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask,
|
|||||||
|
|
||||||
// Do any of the program instances want to run the 'true'
|
// Do any of the program instances want to run the 'true'
|
||||||
// block? If not, jump ahead to bNext.
|
// block? If not, jump ahead to bNext.
|
||||||
|
#if 1
|
||||||
llvm::Value *maskAnyTrueQ = ctx->Any(ctx->GetFullMask());
|
llvm::Value *maskAnyTrueQ = ctx->Any(ctx->GetFullMask());
|
||||||
|
#else
|
||||||
|
llvm::Value *maskAnyTrueQ = ctx->ExtractInst(ctx->GetFullMask(),0);
|
||||||
|
#endif
|
||||||
ctx->BranchInst(bRunTrue, bNext, maskAnyTrueQ);
|
ctx->BranchInst(bRunTrue, bNext, maskAnyTrueQ);
|
||||||
|
|
||||||
// Emit statements for true
|
// Emit statements for true
|
||||||
@@ -713,7 +883,11 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask,
|
|||||||
|
|
||||||
// Similarly, check to see if any of the instances want to
|
// Similarly, check to see if any of the instances want to
|
||||||
// run the 'false' block...
|
// run the 'false' block...
|
||||||
|
#if 1
|
||||||
llvm::Value *maskAnyFalseQ = ctx->Any(ctx->GetFullMask());
|
llvm::Value *maskAnyFalseQ = ctx->Any(ctx->GetFullMask());
|
||||||
|
#else
|
||||||
|
llvm::Value *maskAnyFalseQ = ctx->ExtractInst(ctx->GetFullMask(),0);
|
||||||
|
#endif
|
||||||
ctx->BranchInst(bRunFalse, bDone, maskAnyFalseQ);
|
ctx->BranchInst(bRunFalse, bDone, maskAnyFalseQ);
|
||||||
|
|
||||||
// Emit code for false
|
// Emit code for false
|
||||||
@@ -1273,7 +1447,10 @@ static llvm::Value *
|
|||||||
lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx,
|
lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx,
|
||||||
llvm::Value *uniformCounterPtr,
|
llvm::Value *uniformCounterPtr,
|
||||||
llvm::Value *varyingCounterPtr,
|
llvm::Value *varyingCounterPtr,
|
||||||
const std::vector<int> &spans) {
|
const std::vector<int> &spans)
|
||||||
|
{
|
||||||
|
if (g->target->getISA() != Target::NVPTX)
|
||||||
|
{
|
||||||
// Smear the uniform counter value out to be varying
|
// Smear the uniform counter value out to be varying
|
||||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtr);
|
llvm::Value *counter = ctx->LoadInst(uniformCounterPtr);
|
||||||
llvm::Value *smearCounter = ctx->BroadcastValue(
|
llvm::Value *smearCounter = ctx->BroadcastValue(
|
||||||
@@ -1307,6 +1484,93 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx,
|
|||||||
ctx->StoreInst(varyingCounter, varyingCounterPtr);
|
ctx->StoreInst(varyingCounter, varyingCounterPtr);
|
||||||
return varyingCounter;
|
return varyingCounter;
|
||||||
}
|
}
|
||||||
|
else /* NVPTX == true */
|
||||||
|
{
|
||||||
|
// Smear the uniform counter value out to be varying
|
||||||
|
llvm::Value *counter = ctx->LoadInst(uniformCounterPtr);
|
||||||
|
llvm::Value *smearCounter = ctx->BroadcastValue(
|
||||||
|
counter, LLVMTypes::Int32VectorType, "smear_counter");
|
||||||
|
|
||||||
|
// Figure out the offsets; this is a little bit tricky. As an example,
|
||||||
|
// consider a 2D tiled foreach loop, where we're running 8-wide and
|
||||||
|
// where the inner dimension has a stride of 4 and the outer dimension
|
||||||
|
// has a stride of 2. For the inner dimension, we want the offsets
|
||||||
|
// (0,1,2,3,0,1,2,3), and for the outer dimension we want
|
||||||
|
// (0,0,0,0,1,1,1,1).
|
||||||
|
int32_t delta[ISPC_MAX_NVEC];
|
||||||
|
const int vecWidth = 32;
|
||||||
|
std::vector<llvm::Constant*> constDeltaList;
|
||||||
|
for (int i = 0; i < vecWidth; ++i)
|
||||||
|
{
|
||||||
|
int d = i;
|
||||||
|
// First, account for the effect of any dimensions at deeper
|
||||||
|
// nesting levels than the current one.
|
||||||
|
int prevDimSpanCount = 1;
|
||||||
|
for (int j = dim; j < nDims-1; ++j)
|
||||||
|
prevDimSpanCount *= spans[j+1];
|
||||||
|
d /= prevDimSpanCount;
|
||||||
|
|
||||||
|
// And now with what's left, figure out our own offset
|
||||||
|
delta[i] = d % spans[dim];
|
||||||
|
constDeltaList.push_back(LLVMInt8(delta[i]));
|
||||||
|
}
|
||||||
|
|
||||||
|
llvm::ArrayType* ArrayDelta = llvm::ArrayType::get(LLVMTypes::Int8Type, 32);
|
||||||
|
// llvm::PointerType::get(ArrayDelta, 4); /* constant memory */
|
||||||
|
|
||||||
|
|
||||||
|
llvm::GlobalVariable* globalDelta = new llvm::GlobalVariable(
|
||||||
|
/*Module=*/*m->module,
|
||||||
|
/*Type=*/ArrayDelta,
|
||||||
|
/*isConstant=*/true,
|
||||||
|
/*Linkage=*/llvm::GlobalValue::PrivateLinkage,
|
||||||
|
/*Initializer=*/0, // has initializer, specified below
|
||||||
|
/*Name=*/"constDeltaForeach");
|
||||||
|
#if 0
|
||||||
|
/*ThreadLocalMode=*/llvm::GlobalVariable::NotThreadLocal,
|
||||||
|
/*unsigned AddressSpace=*/4 /*constant*/);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
llvm::Constant* constDelta = llvm::ConstantArray::get(ArrayDelta, constDeltaList);
|
||||||
|
|
||||||
|
globalDelta->setInitializer(constDelta);
|
||||||
|
llvm::Function *func_program_index = m->module->getFunction("__program_index");
|
||||||
|
llvm::Value *laneIdx = ctx->CallInst(func_program_index, NULL, std::vector<llvm::Value*>(), "foreach__programIndex");
|
||||||
|
|
||||||
|
std::vector<llvm::Value*> ptr_arrayidx_indices;
|
||||||
|
ptr_arrayidx_indices.push_back(LLVMInt32(0));
|
||||||
|
ptr_arrayidx_indices.push_back(laneIdx);
|
||||||
|
#if 1
|
||||||
|
llvm::Instruction* ptr_arrayidx = llvm::GetElementPtrInst::Create(globalDelta, ptr_arrayidx_indices, "arrayidx", ctx->GetCurrentBasicBlock());
|
||||||
|
llvm::LoadInst* int8_39 = new llvm::LoadInst(ptr_arrayidx, "", false, ctx->GetCurrentBasicBlock());
|
||||||
|
llvm::Value * int32_39 = ctx->ZExtInst(int8_39, LLVMTypes::Int32Type);
|
||||||
|
|
||||||
|
llvm::VectorType* VectorTy_2 = llvm::VectorType::get(llvm::IntegerType::get(*g->ctx, 32), 1);
|
||||||
|
llvm::UndefValue* const_packed_41 = llvm::UndefValue::get(VectorTy_2);
|
||||||
|
|
||||||
|
llvm::InsertElementInst* packed_43 = llvm::InsertElementInst::Create(
|
||||||
|
// llvm::UndefValue(LLVMInt32Vector),
|
||||||
|
const_packed_41,
|
||||||
|
int32_39, LLVMInt32(0), "", ctx->GetCurrentBasicBlock());
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
// Add the deltas to compute the varying counter values; store the
|
||||||
|
// result to memory and then return it directly as well.
|
||||||
|
#if 0
|
||||||
|
llvm::Value *varyingCounter =
|
||||||
|
ctx->BinaryOperator(llvm::Instruction::Add, smearCounter,
|
||||||
|
LLVMInt32Vector(delta), "iter_val");
|
||||||
|
#else
|
||||||
|
llvm::Value *varyingCounter =
|
||||||
|
ctx->BinaryOperator(llvm::Instruction::Add, smearCounter,
|
||||||
|
packed_43, "iter_val");
|
||||||
|
#endif
|
||||||
|
ctx->StoreInst(varyingCounter, varyingCounterPtr);
|
||||||
|
return varyingCounter;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/** Returns the integer log2 of the given integer. */
|
/** Returns the integer log2 of the given integer. */
|
||||||
@@ -1394,7 +1658,9 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
|
|||||||
std::vector<llvm::Value *> nExtras, alignedEnd, extrasMaskPtrs;
|
std::vector<llvm::Value *> nExtras, alignedEnd, extrasMaskPtrs;
|
||||||
|
|
||||||
std::vector<int> span(nDims, 0);
|
std::vector<int> span(nDims, 0);
|
||||||
lGetSpans(nDims-1, nDims, g->target->getVectorWidth(), isTiled, &span[0]);
|
const int vectorWidth =
|
||||||
|
g->target->getISA() == Target::NVPTX ? 32 : g->target->getVectorWidth();
|
||||||
|
lGetSpans(nDims-1, nDims, vectorWidth, isTiled, &span[0]);
|
||||||
|
|
||||||
for (int i = 0; i < nDims; ++i) {
|
for (int i = 0; i < nDims; ++i) {
|
||||||
// Basic blocks that we'll fill in later with the looping logic for
|
// Basic blocks that we'll fill in later with the looping logic for
|
||||||
@@ -1993,7 +2259,8 @@ ForeachActiveStmt::EmitCode(FunctionEmitContext *ctx) const {
|
|||||||
// math...)
|
// math...)
|
||||||
|
|
||||||
// Get the "program index" vector value
|
// Get the "program index" vector value
|
||||||
llvm::Value *programIndex = ctx->ProgramIndexVector();
|
llvm::Value *programIndex = g->target->getISA() == Target::NVPTX ?
|
||||||
|
ctx->ProgramIndexVectorPTX() : ctx->ProgramIndexVector();
|
||||||
|
|
||||||
// And smear the current lane out to a vector
|
// And smear the current lane out to a vector
|
||||||
llvm::Value *firstSet32 =
|
llvm::Value *firstSet32 =
|
||||||
@@ -2189,10 +2456,19 @@ ForeachUniqueStmt::EmitCode(FunctionEmitContext *ctx) const {
|
|||||||
|
|
||||||
// And load the corresponding element value from the temporary
|
// And load the corresponding element value from the temporary
|
||||||
// memory storing the value of the varying expr.
|
// memory storing the value of the varying expr.
|
||||||
|
llvm::Value *uniqueValue;
|
||||||
|
if (g->target->getISA() != Target::NVPTX)
|
||||||
|
{
|
||||||
llvm::Value *uniqueValuePtr =
|
llvm::Value *uniqueValuePtr =
|
||||||
ctx->GetElementPtrInst(exprMem, LLVMInt64(0), firstSet, exprPtrType,
|
ctx->GetElementPtrInst(exprMem, LLVMInt64(0), firstSet, exprPtrType,
|
||||||
"unique_index_ptr");
|
"unique_index_ptr");
|
||||||
llvm::Value *uniqueValue = ctx->LoadInst(uniqueValuePtr, "unique_value");
|
uniqueValue = ctx->LoadInst(uniqueValuePtr, "unique_value");
|
||||||
|
}
|
||||||
|
else /* in case of PTX target, use __shfl PTX intrinsics via __insert/__extract function */
|
||||||
|
{
|
||||||
|
llvm::Value *firstSet32 = ctx->TruncInst(firstSet, LLVMTypes::Int32Type);
|
||||||
|
uniqueValue = ctx->Extract(exprValue, firstSet32);
|
||||||
|
}
|
||||||
|
|
||||||
// If it's a varying pointer type, need to convert from the int
|
// If it's a varying pointer type, need to convert from the int
|
||||||
// type we store in the vector to the actual pointer type
|
// type we store in the vector to the actual pointer type
|
||||||
@@ -3100,7 +3376,8 @@ PrintStmt::EmitCode(FunctionEmitContext *ctx) const {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Now we can emit code to call __do_print()
|
// Now we can emit code to call __do_print()
|
||||||
llvm::Function *printFunc = m->module->getFunction("__do_print");
|
llvm::Function *printFunc = g->target->getISA() != Target::NVPTX ?
|
||||||
|
m->module->getFunction("__do_print") : m->module->getFunction("__do_print_nvptx");
|
||||||
AssertPos(pos, printFunc);
|
AssertPos(pos, printFunc);
|
||||||
|
|
||||||
llvm::Value *mask = ctx->GetFullMask();
|
llvm::Value *mask = ctx->GetFullMask();
|
||||||
|
|||||||
440
test_static_cuda.cpp
Normal file
440
test_static_cuda.cpp
Normal file
@@ -0,0 +1,440 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2010-2011, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||||
|
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||||
|
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||||
|
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#if defined(_WIN32) || defined(_WIN64)
|
||||||
|
#define ISPC_IS_WINDOWS
|
||||||
|
#elif defined(__linux__)
|
||||||
|
#define ISPC_IS_LINUX
|
||||||
|
#elif defined(__APPLE__)
|
||||||
|
#define ISPC_IS_APPLE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef ISPC_IS_WINDOWS
|
||||||
|
#include <windows.h>
|
||||||
|
#endif // ISPC_IS_WINDOWS
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <cstring>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cstdint>
|
||||||
|
#ifdef ISPC_IS_LINUX
|
||||||
|
#include <malloc.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/******************************/
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <iostream>
|
||||||
|
#include <cuda.h>
|
||||||
|
#include "drvapi_error_string.h"
|
||||||
|
#include "ispc_malloc.h"
|
||||||
|
|
||||||
|
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
|
||||||
|
// These are the inline versions for all of the SDK helper functions
|
||||||
|
void __checkCudaErrors(CUresult err, const char *file, const int line) {
|
||||||
|
if(CUDA_SUCCESS != err) {
|
||||||
|
std::cerr << "checkCudeErrors() Driver API error = " << err << "\""
|
||||||
|
<< getCudaDrvErrorString(err) << "\" from file <" << file
|
||||||
|
<< ", line " << line << "\n";
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/******************************/
|
||||||
|
/**** Basic CUDriver API ****/
|
||||||
|
/******************************/
|
||||||
|
|
||||||
|
CUcontext context;
|
||||||
|
|
||||||
|
static void createContext(const int deviceId = 0, const bool verbose = true)
|
||||||
|
{
|
||||||
|
CUdevice device;
|
||||||
|
int devCount;
|
||||||
|
checkCudaErrors(cuInit(0));
|
||||||
|
checkCudaErrors(cuDeviceGetCount(&devCount));
|
||||||
|
assert(devCount > 0);
|
||||||
|
checkCudaErrors(cuDeviceGet(&device, deviceId < devCount ? deviceId : 0));
|
||||||
|
|
||||||
|
char name[128];
|
||||||
|
checkCudaErrors(cuDeviceGetName(name, 128, device));
|
||||||
|
if (verbose)
|
||||||
|
std::cout << "Using CUDA Device [0]: " << name << "\n";
|
||||||
|
|
||||||
|
int devMajor, devMinor;
|
||||||
|
checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device));
|
||||||
|
if (verbose)
|
||||||
|
std::cout << "Device Compute Capability: "
|
||||||
|
<< devMajor << "." << devMinor << "\n";
|
||||||
|
if (devMajor < 2) {
|
||||||
|
if (verbose)
|
||||||
|
std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create driver context
|
||||||
|
checkCudaErrors(cuCtxCreate(&context, 0, device));
|
||||||
|
}
|
||||||
|
static void destroyContext()
|
||||||
|
{
|
||||||
|
checkCudaErrors(cuCtxDestroy(context));
|
||||||
|
}
|
||||||
|
|
||||||
|
static CUmodule loadModule(
|
||||||
|
const char * module,
|
||||||
|
const int maxrregcount = 64,
|
||||||
|
const char cudadevrt_lib[] = "libcudadevrt.a",
|
||||||
|
const size_t log_size = 32768,
|
||||||
|
const bool print_log = true
|
||||||
|
)
|
||||||
|
{
|
||||||
|
CUmodule cudaModule;
|
||||||
|
// in this branch we use compilation with parameters
|
||||||
|
|
||||||
|
CUlinkState CUState;
|
||||||
|
CUlinkState *lState = &CUState;
|
||||||
|
const int nOptions = 8;
|
||||||
|
CUjit_option options[nOptions];
|
||||||
|
void* optionVals[nOptions];
|
||||||
|
float walltime;
|
||||||
|
size_t logSize = log_size;
|
||||||
|
char error_log[logSize],
|
||||||
|
info_log[logSize];
|
||||||
|
void *cuOut;
|
||||||
|
size_t outSize;
|
||||||
|
int myErr = 0;
|
||||||
|
|
||||||
|
// Setup linker options
|
||||||
|
// Return walltime from JIT compilation
|
||||||
|
options[0] = CU_JIT_WALL_TIME;
|
||||||
|
optionVals[0] = (void*) &walltime;
|
||||||
|
// Pass a buffer for info messages
|
||||||
|
options[1] = CU_JIT_INFO_LOG_BUFFER;
|
||||||
|
optionVals[1] = (void*) info_log;
|
||||||
|
// Pass the size of the info buffer
|
||||||
|
options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
|
||||||
|
optionVals[2] = (void*) logSize;
|
||||||
|
// Pass a buffer for error message
|
||||||
|
options[3] = CU_JIT_ERROR_LOG_BUFFER;
|
||||||
|
optionVals[3] = (void*) error_log;
|
||||||
|
// Pass the size of the error buffer
|
||||||
|
options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
|
||||||
|
optionVals[4] = (void*) logSize;
|
||||||
|
// Make the linker verbose
|
||||||
|
options[5] = CU_JIT_LOG_VERBOSE;
|
||||||
|
optionVals[5] = (void*) 1;
|
||||||
|
// Max # of registers/pthread
|
||||||
|
options[6] = CU_JIT_MAX_REGISTERS;
|
||||||
|
int jitRegCount = maxrregcount;
|
||||||
|
optionVals[6] = (void *)(size_t)jitRegCount;
|
||||||
|
// Caching
|
||||||
|
options[7] = CU_JIT_CACHE_MODE;
|
||||||
|
optionVals[7] = (void *)CU_JIT_CACHE_OPTION_CA;
|
||||||
|
// Create a pending linker invocation
|
||||||
|
|
||||||
|
// Create a pending linker invocation
|
||||||
|
checkCudaErrors(cuLinkCreate(nOptions,options, optionVals, lState));
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
if (sizeof(void *)==4)
|
||||||
|
{
|
||||||
|
// Load the PTX from the string myPtx32
|
||||||
|
printf("Loading myPtx32[] program\n");
|
||||||
|
// PTX May also be loaded from file, as per below.
|
||||||
|
myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)myPtx32, strlen(myPtx32)+1, 0, 0, 0, 0);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
// Load the PTX from the string myPtx (64-bit)
|
||||||
|
if (print_log)
|
||||||
|
fprintf(stderr, "Loading ptx..\n");
|
||||||
|
myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)module, strlen(module)+1, 0, 0, 0, 0);
|
||||||
|
myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, cudadevrt_lib, 0,0,0);
|
||||||
|
// PTX May also be loaded from file, as per below.
|
||||||
|
// myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_PTX, "myPtx64.ptx",0,0,0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Complete the linker step
|
||||||
|
myErr = cuLinkComplete(*lState, &cuOut, &outSize);
|
||||||
|
|
||||||
|
if ( myErr != CUDA_SUCCESS )
|
||||||
|
{
|
||||||
|
// Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above.
|
||||||
|
fprintf(stderr,"PTX Linker Error:\n%s\n",error_log);
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Linker walltime and info_log were requested in options above.
|
||||||
|
if (print_log)
|
||||||
|
fprintf(stderr, "CUDA Link Completed in %fms. Linker Output:\n%s\n",walltime,info_log);
|
||||||
|
|
||||||
|
// Load resulting cuBin into module
|
||||||
|
checkCudaErrors(cuModuleLoadData(&cudaModule, cuOut));
|
||||||
|
|
||||||
|
// Destroy the linker invocation
|
||||||
|
checkCudaErrors(cuLinkDestroy(*lState));
|
||||||
|
return cudaModule;
|
||||||
|
}
|
||||||
|
static void unloadModule(CUmodule &cudaModule)
|
||||||
|
{
|
||||||
|
checkCudaErrors(cuModuleUnload(cudaModule));
|
||||||
|
}
|
||||||
|
|
||||||
|
static CUfunction getFunction(CUmodule &cudaModule, const char * function)
|
||||||
|
{
|
||||||
|
CUfunction cudaFunction;
|
||||||
|
checkCudaErrors(cuModuleGetFunction(&cudaFunction, cudaModule, function));
|
||||||
|
return cudaFunction;
|
||||||
|
}
|
||||||
|
|
||||||
|
static CUdeviceptr deviceMalloc(const size_t size)
|
||||||
|
{
|
||||||
|
CUdeviceptr d_buf;
|
||||||
|
checkCudaErrors(cuMemAlloc(&d_buf, size));
|
||||||
|
return d_buf;
|
||||||
|
}
|
||||||
|
static void deviceFree(CUdeviceptr d_buf)
|
||||||
|
{
|
||||||
|
checkCudaErrors(cuMemFree(d_buf));
|
||||||
|
}
|
||||||
|
static void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size)
|
||||||
|
{
|
||||||
|
checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size));
|
||||||
|
}
|
||||||
|
static void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size)
|
||||||
|
{
|
||||||
|
checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
|
||||||
|
}
|
||||||
|
#define deviceLaunch(func,params) \
|
||||||
|
checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_L1)); \
|
||||||
|
checkCudaErrors( \
|
||||||
|
cuLaunchKernel( \
|
||||||
|
(func), \
|
||||||
|
1,1,1, \
|
||||||
|
32, 1, 1, \
|
||||||
|
0, NULL, (params), NULL \
|
||||||
|
));
|
||||||
|
|
||||||
|
typedef CUdeviceptr devicePtr;
|
||||||
|
|
||||||
|
|
||||||
|
/**************/
|
||||||
|
#include <vector>
|
||||||
|
static std::vector<char> readBinary(const char * filename, const bool print_size = false)
|
||||||
|
{
|
||||||
|
std::vector<char> buffer;
|
||||||
|
FILE *fp = fopen(filename, "rb");
|
||||||
|
if (!fp )
|
||||||
|
{
|
||||||
|
fprintf(stderr, "file %s not found\n", filename);
|
||||||
|
assert(0);
|
||||||
|
}
|
||||||
|
fseek(fp, 0, SEEK_END);
|
||||||
|
const unsigned long long size = ftell(fp); /*calc the size needed*/
|
||||||
|
fseek(fp, 0, SEEK_SET);
|
||||||
|
buffer.resize(size);
|
||||||
|
|
||||||
|
if (fp == NULL){ /*ERROR detection if file == empty*/
|
||||||
|
fprintf(stderr, "Error: There was an Error reading the file %s \n",filename);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
else if (fread(&buffer[0], sizeof(char), size, fp) != size){ /* if count of read bytes != calculated size of .bin file -> ERROR*/
|
||||||
|
fprintf(stderr, "Error: There was an Error reading the file %s \n", filename);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
if (print_size)
|
||||||
|
fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size());
|
||||||
|
return buffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
static double CUDALaunch(
|
||||||
|
void **handlePtr,
|
||||||
|
const char * func_name,
|
||||||
|
void **func_args,
|
||||||
|
const bool print_log = true,
|
||||||
|
const int maxrregcount = 64,
|
||||||
|
const char kernel_file[] = "__kernels.ptx",
|
||||||
|
const char cudadevrt_lib[] = "libcudadevrt.a",
|
||||||
|
const int log_size = 32768)
|
||||||
|
{
|
||||||
|
fprintf(stderr, " launching kernel: %s \n", func_name);
|
||||||
|
const std::vector<char> module_str = readBinary(kernel_file, print_log);
|
||||||
|
const char * module = &module_str[0];
|
||||||
|
CUmodule cudaModule = loadModule(module, maxrregcount, cudadevrt_lib, log_size, print_log);
|
||||||
|
CUfunction cudaFunction = getFunction(cudaModule, func_name);
|
||||||
|
deviceLaunch(cudaFunction, func_args);
|
||||||
|
checkCudaErrors(cuStreamSynchronize(0));
|
||||||
|
unloadModule(cudaModule);
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
/******************************/
|
||||||
|
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
// extern int width();
|
||||||
|
int width() { return 32; }
|
||||||
|
extern void f_v(float *result);
|
||||||
|
extern void f_f(float *result, float *a);
|
||||||
|
extern void f_fu(float *result, float *a, float b);
|
||||||
|
extern void f_fi(float *result, float *a, int *b);
|
||||||
|
extern void f_du(float *result, double *a, double b);
|
||||||
|
extern void f_duf(float *result, double *a, float b);
|
||||||
|
extern void f_di(float *result, double *a, int *b);
|
||||||
|
extern void result(float *val);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(_WIN32) || defined(_WIN64)
|
||||||
|
#define ALIGN
|
||||||
|
#else
|
||||||
|
#define ALIGN __attribute__((aligned(64)))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
int w = width();
|
||||||
|
assert(w <= 64);
|
||||||
|
|
||||||
|
float returned_result[64] ALIGN;
|
||||||
|
float vfloat[64] ALIGN;
|
||||||
|
double vdouble[64] ALIGN;
|
||||||
|
int vint[64] ALIGN;
|
||||||
|
int vint2[64] ALIGN;
|
||||||
|
|
||||||
|
const int device = 0;
|
||||||
|
#if 0
|
||||||
|
const bool verbose = true;
|
||||||
|
#else
|
||||||
|
const bool verbose = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*******************/
|
||||||
|
createContext(device, verbose);
|
||||||
|
/*******************/
|
||||||
|
|
||||||
|
devicePtr d_returned_result = deviceMalloc(64*sizeof(float));
|
||||||
|
devicePtr d_vfloat = deviceMalloc(64*sizeof(float));
|
||||||
|
devicePtr d_vdouble = deviceMalloc(64*sizeof(double));
|
||||||
|
devicePtr d_vint = deviceMalloc(64*sizeof(int));
|
||||||
|
devicePtr d_vint2 = deviceMalloc(64*sizeof(int));
|
||||||
|
|
||||||
|
|
||||||
|
for (int i = 0; i < 64; ++i) {
|
||||||
|
returned_result[i] = -1e20;
|
||||||
|
vfloat[i] = i+1;
|
||||||
|
vdouble[i] = i+1;
|
||||||
|
vint[i] = 2*(i+1);
|
||||||
|
vint2[i] = i+5;
|
||||||
|
}
|
||||||
|
|
||||||
|
memcpyH2D(d_returned_result, returned_result, 64*sizeof(float));
|
||||||
|
memcpyH2D(d_vfloat , vfloat, 64*sizeof(float));
|
||||||
|
memcpyH2D(d_vdouble , vdouble, 64*sizeof(double));
|
||||||
|
memcpyH2D(d_vint , vint, 64*sizeof(int));
|
||||||
|
memcpyH2D(d_vint2 , vint2, 64*sizeof(int));
|
||||||
|
|
||||||
|
|
||||||
|
float b = 5.;
|
||||||
|
|
||||||
|
const bool print_log = false;
|
||||||
|
const int nreg = 64;
|
||||||
|
#if (TEST_SIG == 0)
|
||||||
|
void *args[] = {&d_returned_result};
|
||||||
|
CUDALaunch(NULL, "f_v", args, print_log, nreg);
|
||||||
|
#elif (TEST_SIG == 1)
|
||||||
|
void *args[] = {&d_returned_result, &d_vfloat};
|
||||||
|
CUDALaunch(NULL, "f_f", args, print_log, nreg);
|
||||||
|
#elif (TEST_SIG == 2)
|
||||||
|
void *args[] = {&d_returned_result, &d_vfloat, &b};
|
||||||
|
CUDALaunch(NULL, "f_fu", args, print_log, nreg);
|
||||||
|
#elif (TEST_SIG == 3)
|
||||||
|
void *args[] = {&d_returned_result, &d_vfloat, &vint};
|
||||||
|
CUDALaunch(NULL, "f_fi", args, print_log, nreg);
|
||||||
|
#elif (TEST_SIG == 4)
|
||||||
|
int num = 5;
|
||||||
|
void *args[] = {&d_returned_result, &d_vdouble, &num};
|
||||||
|
CUDALaunch(NULL, "f_du", args, print_log, nreg);
|
||||||
|
#elif (TEST_SIG == 5)
|
||||||
|
float num = 5.0f;
|
||||||
|
void *args[] = {&d_returned_result, &d_vdouble, &num};
|
||||||
|
CUDALaunch(NULL, "f_duf", args, print_log, nreg);
|
||||||
|
#elif (TEST_SIG == 6)
|
||||||
|
void *args[] = {&d_returned_result, &d_vdouble, &v_int2};
|
||||||
|
CUDALaunch(NULL, "f_di", args, print_log, nreg);
|
||||||
|
#else
|
||||||
|
#error "Unknown or unset TEST_SIG value"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
float expected_result[64];
|
||||||
|
|
||||||
|
memset(expected_result, 0, 64*sizeof(float));
|
||||||
|
devicePtr d_expected_result = deviceMalloc(64*sizeof(float));
|
||||||
|
memcpyH2D(d_expected_result, expected_result, 64*sizeof(float));
|
||||||
|
void *res_args[] = {&d_expected_result};
|
||||||
|
CUDALaunch(NULL, "result", res_args, print_log, nreg);
|
||||||
|
memcpyD2H(expected_result, d_expected_result, 64*sizeof(float));
|
||||||
|
memcpyD2H(returned_result, d_returned_result, 64*sizeof(float));
|
||||||
|
|
||||||
|
deviceFree(d_returned_result);
|
||||||
|
deviceFree(d_vfloat);
|
||||||
|
deviceFree(d_vdouble);
|
||||||
|
deviceFree(d_vint);
|
||||||
|
deviceFree(d_vint2);
|
||||||
|
deviceFree(d_expected_result);
|
||||||
|
|
||||||
|
/*******************/
|
||||||
|
destroyContext();
|
||||||
|
/*******************/
|
||||||
|
|
||||||
|
int errors = 0;
|
||||||
|
for (int i = 0; i < w; ++i) {
|
||||||
|
if (returned_result[i] != expected_result[i]) {
|
||||||
|
#ifdef EXPECT_FAILURE
|
||||||
|
// bingo, failed
|
||||||
|
return 1;
|
||||||
|
#else
|
||||||
|
printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n",
|
||||||
|
argv[0], i, returned_result[i], returned_result[i],
|
||||||
|
expected_result[i], expected_result[i]);
|
||||||
|
++errors;
|
||||||
|
#endif // EXPECT_FAILURE
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef EXPECT_FAILURE
|
||||||
|
// Don't expect to get here
|
||||||
|
return 0;
|
||||||
|
#else
|
||||||
|
return errors > 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
133
test_static_nvptx.cpp
Normal file
133
test_static_nvptx.cpp
Normal file
@@ -0,0 +1,133 @@
|
|||||||
|
/*
|
||||||
|
Copyright (c) 2010-2011, Intel Corporation
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
* Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
* Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in the
|
||||||
|
documentation and/or other materials provided with the distribution.
|
||||||
|
|
||||||
|
* Neither the name of Intel Corporation nor the names of its
|
||||||
|
contributors may be used to endorse or promote products derived from
|
||||||
|
this software without specific prior written permission.
|
||||||
|
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||||
|
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||||
|
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||||
|
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||||
|
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#if defined(_WIN32) || defined(_WIN64)
|
||||||
|
#define ISPC_IS_WINDOWS
|
||||||
|
#elif defined(__linux__)
|
||||||
|
#define ISPC_IS_LINUX
|
||||||
|
#elif defined(__APPLE__)
|
||||||
|
#define ISPC_IS_APPLE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef ISPC_IS_WINDOWS
|
||||||
|
#include <windows.h>
|
||||||
|
#endif // ISPC_IS_WINDOWS
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <cstring>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <stdint.h>
|
||||||
|
#ifdef ISPC_IS_LINUX
|
||||||
|
#include <malloc.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "ispc_malloc.h"
|
||||||
|
|
||||||
|
#define N 32
|
||||||
|
extern "C" {
|
||||||
|
int width() { return N; }
|
||||||
|
extern void f_v(float *result);
|
||||||
|
extern void f_f(float *result, float *a);
|
||||||
|
extern void f_fu(float *result, float *a, float b);
|
||||||
|
extern void f_fi(float *result, float *a, int *b);
|
||||||
|
extern void f_du(float *result, double *a, double b);
|
||||||
|
extern void f_duf(float *result, double *a, float b);
|
||||||
|
extern void f_di(float *result, double *a, int *b);
|
||||||
|
extern void result(float *val);
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
int w = width();
|
||||||
|
assert(w <= N);
|
||||||
|
|
||||||
|
float *returned_result = new float[N*4];
|
||||||
|
float *vfloat = new float[N*4];
|
||||||
|
double *vdouble = new double[N*4];
|
||||||
|
int *vint = new int[N*4];
|
||||||
|
int *vint2 = new int[N*4];
|
||||||
|
|
||||||
|
for (int i = 0; i < N*4; ++i) {
|
||||||
|
returned_result[i] = -1e20;
|
||||||
|
vfloat[i] = i+1;
|
||||||
|
vdouble[i] = i+1;
|
||||||
|
vint[i] = 2*(i+1);
|
||||||
|
vint2[i] = i+5;
|
||||||
|
}
|
||||||
|
|
||||||
|
float b = 5.;
|
||||||
|
|
||||||
|
#if (TEST_SIG == 0)
|
||||||
|
f_v(returned_result);
|
||||||
|
#elif (TEST_SIG == 1)
|
||||||
|
f_f(returned_result, vfloat);
|
||||||
|
#elif (TEST_SIG == 2)
|
||||||
|
f_fu(returned_result, vfloat, b);
|
||||||
|
#elif (TEST_SIG == 3)
|
||||||
|
f_fi(returned_result, vfloat, vint);
|
||||||
|
#elif (TEST_SIG == 4)
|
||||||
|
f_du(returned_result, vdouble, 5.);
|
||||||
|
#elif (TEST_SIG == 5)
|
||||||
|
f_duf(returned_result, vdouble, 5.f);
|
||||||
|
#elif (TEST_SIG == 6)
|
||||||
|
f_di(returned_result, vdouble, vint2);
|
||||||
|
#else
|
||||||
|
#error "Unknown or unset TEST_SIG value"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
float *expected_result = new float[N];
|
||||||
|
memset(expected_result, 0, N*sizeof(float));
|
||||||
|
result(expected_result);
|
||||||
|
|
||||||
|
int errors = 0;
|
||||||
|
for (int i = 0; i < w; ++i) {
|
||||||
|
if (returned_result[i] != expected_result[i])
|
||||||
|
{
|
||||||
|
#ifdef EXPECT_FAILURE
|
||||||
|
// bingo, failed
|
||||||
|
return 1;
|
||||||
|
#else
|
||||||
|
printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n",
|
||||||
|
argv[0], i, returned_result[i], returned_result[i],
|
||||||
|
expected_result[i], expected_result[i]);
|
||||||
|
++errors;
|
||||||
|
#endif // EXPECT_FAILURE
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef EXPECT_FAILURE
|
||||||
|
// Don't expect to get here
|
||||||
|
return 0;
|
||||||
|
#else
|
||||||
|
return errors > 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
@@ -5,7 +5,13 @@ export uniform int width() { return programCount; }
|
|||||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||||
float a = aFOO[programIndex];
|
float a = aFOO[programIndex];
|
||||||
assert(programCount <= 64);
|
assert(programCount <= 64);
|
||||||
|
#ifdef __NVPTX__
|
||||||
|
uniform float * uniform xarr = uniform new uniform float[70*70];
|
||||||
|
uniform float (* uniform x)[70] = (uniform float (* uniform)[70])xarr;
|
||||||
|
#define _SHMALLOC
|
||||||
|
#else
|
||||||
uniform float x[70][70];
|
uniform float x[70][70];
|
||||||
|
#endif
|
||||||
for (uniform int i = 0; i < 70; ++i)
|
for (uniform int i = 0; i < 70; ++i)
|
||||||
for (uniform int j = 0; j < 70; ++j)
|
for (uniform int j = 0; j < 70; ++j)
|
||||||
x[i][j] = 2+b-5;
|
x[i][j] = 2+b-5;
|
||||||
@@ -16,6 +22,10 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
|||||||
else
|
else
|
||||||
x[b-1][a-1] = 1;
|
x[b-1][a-1] = 1;
|
||||||
RET[programIndex] = x[4][a];
|
RET[programIndex] = x[4][a];
|
||||||
|
|
||||||
|
#ifdef _SHMALLOC
|
||||||
|
delete xarr;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
export void result(uniform float RET[]) {
|
export void result(uniform float RET[]) {
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }
|
|||||||
|
|
||||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||||
float a = aFOO[programIndex];
|
float a = aFOO[programIndex];
|
||||||
float b = (programCount == 1) ? 3 : broadcast(a, 2);
|
float b = (programCount == 1) ? 4 : broadcast(a, 2);
|
||||||
RET[programIndex] = b;
|
RET[programIndex] = b;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -19,8 +19,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
|||||||
|
|
||||||
|
|
||||||
export void result(uniform float RET[]) {
|
export void result(uniform float RET[]) {
|
||||||
RET[0] = RET[4] = RET[8] = RET[12] = 2;
|
for (int i = 0; i < programCount; i += 4)
|
||||||
RET[1] = RET[5] = RET[9] = RET[13] = 3;
|
{
|
||||||
RET[2] = RET[6] = RET[10] = RET[14] = 5;
|
RET[i+0] = 2;
|
||||||
RET[3] = RET[7] = RET[11] = RET[15] = 6;
|
RET[i+1] = 3;
|
||||||
|
RET[i+2] = 5;
|
||||||
|
RET[i+3] = 6;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,6 +18,9 @@ export void f_fu(uniform float RET[4], uniform float aFOO[4], uniform float b) {
|
|||||||
|
|
||||||
export void result(uniform float RET[]) {
|
export void result(uniform float RET[]) {
|
||||||
RET[programIndex] = 3;
|
RET[programIndex] = 3;
|
||||||
RET[0] = RET[4] = RET[8] = RET[12] = 1;
|
for (int i = 0; i < programCount; i += 4)
|
||||||
RET[3] = RET[7] = RET[11] = RET[15] = 29;
|
{
|
||||||
|
RET[i+0] = 1;
|
||||||
|
RET[i+3] = 29;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -19,6 +19,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
|||||||
|
|
||||||
export void result(uniform float RET[]) {
|
export void result(uniform float RET[]) {
|
||||||
RET[programIndex] = 32;
|
RET[programIndex] = 32;
|
||||||
RET[2] = RET[6] = RET[10] = RET[14] = 38;
|
for (int i = 0; i < programCount; i += 4)
|
||||||
RET[3] = RET[7] = RET[11] = RET[15] = 39;
|
{
|
||||||
|
RET[i+2] = 38;
|
||||||
|
RET[i+3] = 39;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,14 +4,14 @@ export uniform int width() { return programCount; }
|
|||||||
|
|
||||||
|
|
||||||
struct Foo {
|
struct Foo {
|
||||||
uniform float x[17];
|
uniform float x[programCount+1];
|
||||||
};
|
};
|
||||||
|
|
||||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||||
float a = aFOO[programIndex];
|
float a = aFOO[programIndex];
|
||||||
uniform Foo foo;
|
uniform Foo foo;
|
||||||
uniform int i;
|
uniform int i;
|
||||||
cfor (i = 0; i < 17; ++i)
|
cfor (i = 0; i < programCount+1; ++i)
|
||||||
foo.x[i] = i;
|
foo.x[i] = i;
|
||||||
|
|
||||||
if ((int)a & 1)
|
if ((int)a & 1)
|
||||||
|
|||||||
@@ -4,9 +4,9 @@ export uniform int width() { return programCount; }
|
|||||||
|
|
||||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||||
float a = aFOO[programIndex];
|
float a = aFOO[programIndex];
|
||||||
uniform double udx[25][25];
|
uniform double udx[programCount+1][programCount+1];
|
||||||
cfor (uniform int i = 0; i < 25; ++i)
|
cfor (uniform int i = 0; i < programCount+1; ++i)
|
||||||
cfor (uniform int j = 0; j < 25; ++j)
|
cfor (uniform int j = 0; j < programCount+1; ++j)
|
||||||
udx[i][j] = 10*i+j;
|
udx[i][j] = 10*i+j;
|
||||||
|
|
||||||
int x = 1;
|
int x = 1;
|
||||||
|
|||||||
@@ -5,9 +5,9 @@ export uniform int width() { return programCount; }
|
|||||||
|
|
||||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||||
float a = aFOO[programIndex];
|
float a = aFOO[programIndex];
|
||||||
uniform float udx[20][20];
|
uniform float udx[programCount+1][programCount+1];
|
||||||
cfor (uniform int i = 0; i < 20; ++i)
|
cfor (uniform int i = 0; i < programCount+1; ++i)
|
||||||
cfor (uniform int j = 0; j < 20; ++j)
|
cfor (uniform int j = 0; j < programCount+1; ++j)
|
||||||
udx[i][j] = 100*i+j;
|
udx[i][j] = 100*i+j;
|
||||||
|
|
||||||
int x = 1;
|
int x = 1;
|
||||||
|
|||||||
@@ -4,19 +4,27 @@ export uniform int width() { return programCount; }
|
|||||||
|
|
||||||
|
|
||||||
struct Foo {
|
struct Foo {
|
||||||
uniform float udx[25][25];
|
uniform float udx[32][32];
|
||||||
};
|
};
|
||||||
|
|
||||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||||
float a = aFOO[programIndex];
|
float a = aFOO[programIndex];
|
||||||
|
#ifndef __NVPTX__
|
||||||
uniform Foo f[5];
|
uniform Foo f[5];
|
||||||
|
#else /* too much shared memory allocated, nvcc fails to link */
|
||||||
|
uniform Foo * uniform f = uniform new uniform Foo[5];
|
||||||
|
#define _UNMALLOC
|
||||||
|
#endif
|
||||||
cfor (uniform int i = 0; i < 5; ++i)
|
cfor (uniform int i = 0; i < 5; ++i)
|
||||||
cfor (uniform int j = 0; j < 25; ++j)
|
cfor (uniform int j = 0; j < 32; ++j)
|
||||||
cfor (uniform int k = 0; k < 25; ++k)
|
cfor (uniform int k = 0; k < 32; ++k)
|
||||||
f[i].udx[j][k] = 1000*i+100*j+k;
|
f[i].udx[j][k] = 1000*i+100*j+k;
|
||||||
|
|
||||||
int x = 1;
|
int x = 1;
|
||||||
RET[programIndex] = f[x+1].udx[b-4][programIndex];
|
RET[programIndex] = f[x+1].udx[b-4][programIndex];
|
||||||
|
#ifdef _UNMALLOC
|
||||||
|
delete f;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
export void result(uniform float RET[]) { RET[programIndex] = 2100 +programIndex; }
|
export void result(uniform float RET[]) { RET[programIndex] = 2100 +programIndex; }
|
||||||
|
|||||||
@@ -13,9 +13,9 @@ float func(Foo foo[], int offset) {
|
|||||||
|
|
||||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||||
float a = aFOO[programIndex];
|
float a = aFOO[programIndex];
|
||||||
Foo foo[17];
|
Foo foo[programCount+1];
|
||||||
uniform int i;
|
uniform int i;
|
||||||
cfor (i = 0; i < 17; ++i)
|
cfor (i = 0; i < programCount+1; ++i)
|
||||||
foo[i].f = i*a;
|
foo[i].f = i*a;
|
||||||
RET[programIndex] = func(foo, (int)a);
|
RET[programIndex] = func(foo, (int)a);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -13,9 +13,9 @@ float func(Foo foo[], int offset) {
|
|||||||
|
|
||||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||||
float a = aFOO[programIndex];
|
float a = aFOO[programIndex];
|
||||||
Foo foo[17];
|
Foo foo[programCount+1];
|
||||||
uniform int i;
|
uniform int i;
|
||||||
cfor (i = 0; i < 17; ++i)
|
cfor (i = 0; i < programCount+1; ++i)
|
||||||
foo[i].f = i*a;
|
foo[i].f = i*a;
|
||||||
RET[programIndex] = func(foo, (int)a);
|
RET[programIndex] = func(foo, (int)a);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -9,9 +9,9 @@ struct Foo {
|
|||||||
|
|
||||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||||
float a = aFOO[programIndex];
|
float a = aFOO[programIndex];
|
||||||
Foo foo[17];
|
Foo foo[programCount+1];
|
||||||
uniform int i;
|
uniform int i;
|
||||||
cfor (i = 0; i < 17; ++i)
|
cfor (i = 0; i < programCount+1; ++i)
|
||||||
foo[i].f = i*a;
|
foo[i].f = i*a;
|
||||||
RET[programIndex] = foo[(int)a].f;
|
RET[programIndex] = foo[(int)a].f;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,9 +10,9 @@ struct Foo {
|
|||||||
export void f_fi(uniform float RET[], uniform float aFOO[], uniform int bFOO[]) {
|
export void f_fi(uniform float RET[], uniform float aFOO[], uniform int bFOO[]) {
|
||||||
float a = aFOO[programIndex];
|
float a = aFOO[programIndex];
|
||||||
int b = bFOO[programIndex];
|
int b = bFOO[programIndex];
|
||||||
varying Foo myFoo[17];
|
varying Foo myFoo[programCount+1];
|
||||||
uniform int i;
|
uniform int i;
|
||||||
cfor (i = 0; i < 17; ++i) {
|
cfor (i = 0; i < programCount+1; ++i) {
|
||||||
myFoo[i].x = i;
|
myFoo[i].x = i;
|
||||||
myFoo[i].f = 2*i;
|
myFoo[i].f = 2*i;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
|||||||
|
|
||||||
|
|
||||||
export void result(uniform float RET[]) {
|
export void result(uniform float RET[]) {
|
||||||
RET[0] = RET[4] = RET[8] = RET[12] = 1;
|
for (int i = 0; i < programCount; i += 4)
|
||||||
RET[1] = RET[5] = RET[9] = RET[13] = 3;
|
{
|
||||||
RET[2] = RET[6] = RET[10] = RET[14] = 3;
|
RET[i+0] = 1;
|
||||||
RET[3] = RET[7] = RET[11] = RET[15] = 29;
|
RET[i+1] = 3;
|
||||||
|
RET[i+2] = 3;
|
||||||
|
RET[i+3] = 29;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
|||||||
|
|
||||||
|
|
||||||
export void result(uniform float RET[]) {
|
export void result(uniform float RET[]) {
|
||||||
RET[0] = RET[4] = RET[8] = RET[12] = 1;
|
for (int i = 0; i < programCount; i += 4)
|
||||||
RET[1] = RET[5] = RET[9] = RET[13] = 3;
|
{
|
||||||
RET[2] = RET[6] = RET[10] = RET[14] = 3;
|
RET[i+0] = 1;
|
||||||
RET[3] = RET[7] = RET[11] = RET[15] = 29;
|
RET[i+1] = 3;
|
||||||
|
RET[i+2] = 3;
|
||||||
|
RET[i+3] = 29;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
|||||||
|
|
||||||
|
|
||||||
export void result(uniform float RET[]) {
|
export void result(uniform float RET[]) {
|
||||||
RET[0] = RET[4] = RET[8] = RET[12] = 1;
|
for (int i = 0; i < programCount; i += 4)
|
||||||
RET[1] = RET[5] = RET[9] = RET[13] = 3;
|
{
|
||||||
RET[2] = RET[6] = RET[10] = RET[14] = 3;
|
RET[i+0] = 1;
|
||||||
RET[3] = RET[7] = RET[11] = RET[15] = 29;
|
RET[i+1] = 3;
|
||||||
|
RET[i+2] = 3;
|
||||||
|
RET[i+3] = 29;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -19,8 +19,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
|||||||
|
|
||||||
|
|
||||||
export void result(uniform float RET[]) {
|
export void result(uniform float RET[]) {
|
||||||
RET[0] = RET[4] = RET[8] = RET[12] = 2;
|
for (int i = 0; i < programCount; i += 4)
|
||||||
RET[1] = RET[5] = RET[9] = RET[13] = 3;
|
{
|
||||||
RET[2] = RET[6] = RET[10] = RET[14] = 5;
|
RET[i+0] = 2;
|
||||||
RET[3] = RET[7] = RET[11] = RET[15] = 6;
|
RET[i+1] = 3;
|
||||||
|
RET[i+2] = 5;
|
||||||
|
RET[i+3] = 6;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
|||||||
|
|
||||||
|
|
||||||
export void result(uniform float RET[]) {
|
export void result(uniform float RET[]) {
|
||||||
RET[0] = RET[4] = RET[8] = RET[12] = 1;
|
for (int i = 0; i < programCount; i += 4)
|
||||||
RET[1] = RET[5] = RET[9] = RET[13] = 3;
|
{
|
||||||
RET[2] = RET[6] = RET[10] = RET[14] = 3;
|
RET[i+0] = 1;
|
||||||
RET[3] = RET[7] = RET[11] = RET[15] = 29;
|
RET[i+1] = 3;
|
||||||
|
RET[i+2] = 3;
|
||||||
|
RET[i+3] = 29;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,8 +18,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
|||||||
|
|
||||||
|
|
||||||
export void result(uniform float RET[]) {
|
export void result(uniform float RET[]) {
|
||||||
RET[0] = RET[4] = RET[8] = RET[12] = 32;
|
for (int i = 0; i < programCount; i += 4)
|
||||||
RET[1] = RET[5] = RET[9] = RET[13] = 32;
|
{
|
||||||
RET[2] = RET[6] = RET[10] = RET[14] = 38;
|
RET[i+0] = 32;
|
||||||
RET[3] = RET[7] = RET[11] = RET[15] = 39;
|
RET[i+1] = 32;
|
||||||
|
RET[i+2] = 38;
|
||||||
|
RET[i+3] = 39;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,9 +8,9 @@ struct Foo {
|
|||||||
};
|
};
|
||||||
export void f_fi(uniform float RET[], uniform float a[], uniform int bFOO[]) {
|
export void f_fi(uniform float RET[], uniform float a[], uniform int bFOO[]) {
|
||||||
int b = bFOO[programIndex];
|
int b = bFOO[programIndex];
|
||||||
uniform struct Foo myFoo[17];
|
uniform struct Foo myFoo[programCount+1];
|
||||||
uniform int i;
|
uniform int i;
|
||||||
cfor (i = 0; i < 17; ++i) {
|
cfor (i = 0; i < programCount+1; ++i) {
|
||||||
myFoo[i].x = i;
|
myFoo[i].x = i;
|
||||||
myFoo[i].f = 2*i;
|
myFoo[i].f = 2*i;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ export uniform int width() { return programCount; }
|
|||||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||||
float a = aFOO[programIndex];
|
float a = aFOO[programIndex];
|
||||||
uniform int x = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2);
|
uniform int x = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2);
|
||||||
static uniform int y = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2);
|
const static uniform int y = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2);
|
||||||
RET[programIndex] = (x == y) ? 1. : 0.;
|
RET[programIndex] = (x == y) ? 1. : 0.;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ export uniform int width() { return programCount; }
|
|||||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||||
float a = aFOO[programIndex];
|
float a = aFOO[programIndex];
|
||||||
uniform int x = (170 >> 4) % 5;
|
uniform int x = (170 >> 4) % 5;
|
||||||
static uniform int y = (170 >> 4) % 5;
|
const static uniform int y = (170 >> 4) % 5;
|
||||||
RET[programIndex] = (x == y) ? 1. : 0.;
|
RET[programIndex] = (x == y) ? 1. : 0.;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ export uniform int width() { return programCount; }
|
|||||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||||
float a = aFOO[programIndex];
|
float a = aFOO[programIndex];
|
||||||
uniform int x = (17 < 2) || (6 >= 5) && (20 >= 20);
|
uniform int x = (17 < 2) || (6 >= 5) && (20 >= 20);
|
||||||
static uniform int y = (17 < 2) || (6 >= 5) && (20 >= 20);
|
const static uniform int y = (17 < 2) || (6 >= 5) && (20 >= 20);
|
||||||
RET[programIndex] = ((x!=0) == (y!=0)) ? 1. : 0.;
|
RET[programIndex] = ((x!=0) == (y!=0)) ? 1. : 0.;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -2,22 +2,23 @@
|
|||||||
export uniform int width() { return programCount; }
|
export uniform int width() { return programCount; }
|
||||||
|
|
||||||
|
|
||||||
#define N0 10
|
#define N0 12
|
||||||
#define N1 20
|
#define N1 20
|
||||||
#define N2 50
|
#define N2 50
|
||||||
static uniform float array[N2][N1][N0];
|
static uniform float array[N2][N1][N0];
|
||||||
|
|
||||||
task void x(const float f) {
|
task void x(const uniform float farray[]) {
|
||||||
|
const float f = farray[programIndex];
|
||||||
uniform int j;
|
uniform int j;
|
||||||
|
|
||||||
assert(taskCount == (int32)N0*N1*N2);
|
assert(taskCount == (uniform int32)N0*N1*N2);
|
||||||
assert(taskCount0 == (int32)N0);
|
assert(taskCount0 == (uniform int32)N0);
|
||||||
assert(taskCount1 == (int32)N1);
|
assert(taskCount1 == (uniform int32)N1);
|
||||||
assert(taskCount2 == (int32)N2);
|
assert(taskCount2 == (uniform int32)N2);
|
||||||
assert(taskIndex == (int32)taskIndex0 + (int32)N0*(taskIndex1 +(int32) N1*taskIndex2));
|
assert(taskIndex == (uniform int32)taskIndex0 + (uniform int32)N0*(taskIndex1 +(uniform int32) N1*taskIndex2));
|
||||||
assert(taskIndex0 < (int32)N0);
|
assert(taskIndex0 < (uniform int32)N0);
|
||||||
assert(taskIndex1 < (int32)N1);
|
assert(taskIndex1 < (uniform int32)N1);
|
||||||
assert(taskIndex2 < (int32)N2);
|
assert(taskIndex2 < (uniform int32)N2);
|
||||||
|
|
||||||
const uniform int i0 = taskIndex0;
|
const uniform int i0 = taskIndex0;
|
||||||
const uniform int i1 = taskIndex1;
|
const uniform int i1 = taskIndex1;
|
||||||
@@ -30,7 +31,7 @@ task void x(const float f) {
|
|||||||
array[i2][i1][i0] = i;
|
array[i2][i1][i0] = i;
|
||||||
}
|
}
|
||||||
export void f_f(uniform float RET[], uniform float fFOO[]) {
|
export void f_f(uniform float RET[], uniform float fFOO[]) {
|
||||||
float f = fFOO[programIndex];
|
uniform float * uniform f = fFOO;
|
||||||
launch[N2][N1][N0] x(f);
|
launch[N2][N1][N0] x(f);
|
||||||
sync;
|
sync;
|
||||||
RET[programIndex] = array[N2-1][N1-1][N0-1];
|
RET[programIndex] = array[N2-1][N1-1][N0-1];
|
||||||
@@ -38,5 +39,5 @@ export void f_f(uniform float RET[], uniform float fFOO[]) {
|
|||||||
|
|
||||||
|
|
||||||
export void result(uniform float RET[]) {
|
export void result(uniform float RET[]) {
|
||||||
RET[programIndex] = 9999.000000;
|
RET[programIndex] = 11999.000000;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,12 +2,13 @@
|
|||||||
export uniform int width() { return programCount; }
|
export uniform int width() { return programCount; }
|
||||||
|
|
||||||
|
|
||||||
#define N0 10
|
#define N0 12
|
||||||
#define N1 20
|
#define N1 20
|
||||||
#define N2 50
|
#define N2 50
|
||||||
static uniform float array[N2][N1][N0];
|
static uniform float array[N2][N1][N0];
|
||||||
|
|
||||||
task void x(const float f) {
|
task void x(const uniform float farray[]) {
|
||||||
|
const float f = farray[programIndex];
|
||||||
uniform int j;
|
uniform int j;
|
||||||
|
|
||||||
assert(taskCount == (int32)N0*N1*N2);
|
assert(taskCount == (int32)N0*N1*N2);
|
||||||
@@ -30,13 +31,13 @@ task void x(const float f) {
|
|||||||
array[i2][i1][i0] = i;
|
array[i2][i1][i0] = i;
|
||||||
}
|
}
|
||||||
export void f_f(uniform float RET[], uniform float fFOO[]) {
|
export void f_f(uniform float RET[], uniform float fFOO[]) {
|
||||||
float f = fFOO[programIndex];
|
uniform float * uniform f = fFOO;
|
||||||
launch[N0,N1,N2] x(f);
|
launch[N2][N1][N0] x(f);
|
||||||
sync;
|
sync;
|
||||||
RET[programIndex] = array[N2-1][N1-1][N0-1];
|
RET[programIndex] = array[N2-1][N1-1][N0-1];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
export void result(uniform float RET[]) {
|
export void result(uniform float RET[]) {
|
||||||
RET[programIndex] = 9999.000000;
|
RET[programIndex] = 11999.000000;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,9 @@
|
|||||||
|
#ifdef __NVPTX__
|
||||||
|
uniform int _off[programCount];
|
||||||
|
#define off _off[programIndex]
|
||||||
|
#else /* global varying data types are not yet supported with "nvptx" target */
|
||||||
int off;
|
int off;
|
||||||
|
#endif
|
||||||
|
|
||||||
export uniform int width() { return programCount; }
|
export uniform int width() { return programCount; }
|
||||||
|
|
||||||
@@ -22,11 +27,11 @@ struct S operator/(struct S rr, struct S rv) {
|
|||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||||
struct S a;
|
struct S a;
|
||||||
struct S b;
|
struct S b;
|
||||||
struct S d;
|
struct S d;
|
||||||
|
|
||||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
|
||||||
int T = programIndex;
|
int T = programIndex;
|
||||||
a.a = aFOO[programIndex];
|
a.a = aFOO[programIndex];
|
||||||
b.a = -aFOO[programIndex];
|
b.a = -aFOO[programIndex];
|
||||||
|
|||||||
@@ -15,6 +15,16 @@ static void p(uniform float *uniform ptr) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||||
|
#ifdef __NVPTX__ /* soa is converted to shared memory story for now, use smaller amount to check the test */
|
||||||
|
soa<4> Point pts[10];
|
||||||
|
for (uniform int i = 0; i < 40; ++i) {
|
||||||
|
pts[i].x = b*i;
|
||||||
|
pts[i].y[0] = 2*b*i;
|
||||||
|
pts[i].y[1] = 2*b*i+1;
|
||||||
|
pts[i].y[2] = 2*b*i+2;
|
||||||
|
pts[i].z = 3*b*i;
|
||||||
|
}
|
||||||
|
#else
|
||||||
soa<4> Point pts[30];
|
soa<4> Point pts[30];
|
||||||
for (uniform int i = 0; i < 120; ++i) {
|
for (uniform int i = 0; i < 120; ++i) {
|
||||||
pts[i].x = b*i;
|
pts[i].x = b*i;
|
||||||
@@ -23,6 +33,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
|||||||
pts[i].y[2] = 2*b*i+2;
|
pts[i].y[2] = 2*b*i+2;
|
||||||
pts[i].z = 3*b*i;
|
pts[i].z = 3*b*i;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
float a = aFOO[programIndex];
|
float a = aFOO[programIndex];
|
||||||
a *= -1;
|
a *= -1;
|
||||||
|
|||||||
@@ -16,6 +16,16 @@ static void p(uniform float *uniform ptr) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||||
|
#ifdef __NVPTX__ /* soa is converted to shared memory story for now, use smaller amount to check the test */
|
||||||
|
soa<4> Point pts[15];
|
||||||
|
for (uniform int i = 0; i < 60; ++i) {
|
||||||
|
pts[i].x = b*i;
|
||||||
|
pts[i].y[0] = 2*b*i;
|
||||||
|
pts[i].y[1] = 2*b*i+1;
|
||||||
|
pts[i].y[2] = 2*b*i+2;
|
||||||
|
pts[i].z = 3*b*i;
|
||||||
|
}
|
||||||
|
#else
|
||||||
soa<4> Point pts[40];
|
soa<4> Point pts[40];
|
||||||
for (uniform int i = 0; i < 160; ++i) {
|
for (uniform int i = 0; i < 160; ++i) {
|
||||||
pts[i].x = b*i;
|
pts[i].x = b*i;
|
||||||
@@ -24,6 +34,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
|||||||
pts[i].y[2] = 2*b*i+2;
|
pts[i].y[2] = 2*b*i+2;
|
||||||
pts[i].z = 3*b*i;
|
pts[i].z = 3*b*i;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
float a = aFOO[programIndex];
|
float a = aFOO[programIndex];
|
||||||
a *= -1;
|
a *= -1;
|
||||||
|
|||||||
@@ -6,6 +6,17 @@ export uniform int width() { return programCount; }
|
|||||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||||
float a = aFOO[programIndex];
|
float a = aFOO[programIndex];
|
||||||
|
|
||||||
|
#ifdef __NVPTX__ /* soa is converted to shared memory story for now, use smaller amount to check the test */
|
||||||
|
soa<8> Point pts[4];
|
||||||
|
//CO uniform Point pts[80];
|
||||||
|
foreach (i = 0 ... 40) {
|
||||||
|
pts[i].x = b*i;
|
||||||
|
pts[i].y[0] = 2*b*i;
|
||||||
|
pts[i].y[1] = 2*b*i+1;
|
||||||
|
pts[i].y[2] = 2*b*i+2;
|
||||||
|
pts[i].z = 3*b*i;
|
||||||
|
}
|
||||||
|
#else
|
||||||
soa<8> Point pts[10];
|
soa<8> Point pts[10];
|
||||||
//CO uniform Point pts[80];
|
//CO uniform Point pts[80];
|
||||||
foreach (i = 0 ... 80) {
|
foreach (i = 0 ... 80) {
|
||||||
@@ -15,6 +26,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
|||||||
pts[i].y[2] = 2*b*i+2;
|
pts[i].y[2] = 2*b*i+2;
|
||||||
pts[i].z = 3*b*i;
|
pts[i].z = 3*b*i;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
assert(programCount < 80);
|
assert(programCount < 80);
|
||||||
RET[programIndex] = pts[programIndex].y[2];
|
RET[programIndex] = pts[programIndex].y[2];
|
||||||
|
|||||||
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
|||||||
|
|
||||||
|
|
||||||
export void result(uniform float RET[]) {
|
export void result(uniform float RET[]) {
|
||||||
RET[0] = RET[4] = RET[8] = RET[12] = 1;
|
for (int i = 0; i < programCount; i += 4)
|
||||||
RET[1] = RET[5] = RET[9] = RET[13] = 3;
|
{
|
||||||
RET[2] = RET[6] = RET[10] = RET[14] = 3;
|
RET[i+0] = 1;
|
||||||
RET[3] = RET[7] = RET[11] = RET[15] = 29;
|
RET[i+1] = 3;
|
||||||
|
RET[i+2] = 3;
|
||||||
|
RET[i+3] = 29;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
|||||||
|
|
||||||
|
|
||||||
export void result(uniform float RET[]) {
|
export void result(uniform float RET[]) {
|
||||||
RET[0] = RET[4] = RET[8] = RET[12] = 1;
|
for (int i = 0; i < programCount; i += 4)
|
||||||
RET[1] = RET[5] = RET[9] = RET[13] = 3;
|
{
|
||||||
RET[2] = RET[6] = RET[10] = RET[14] = 3;
|
RET[i+0] = 1;
|
||||||
RET[3] = RET[7] = RET[11] = RET[15] = 29;
|
RET[i+1] = 3;
|
||||||
|
RET[i+2] = 3;
|
||||||
|
RET[i+3] = 29;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
|||||||
|
|
||||||
|
|
||||||
export void result(uniform float RET[]) {
|
export void result(uniform float RET[]) {
|
||||||
RET[0] = RET[4] = RET[8] = RET[12] = 1;
|
for (int i = 0; i < programCount; i += 4)
|
||||||
RET[1] = RET[5] = RET[9] = RET[13] = 3;
|
{
|
||||||
RET[2] = RET[6] = RET[10] = RET[14] = 3;
|
RET[i+0] = 1;
|
||||||
RET[3] = RET[7] = RET[11] = RET[15] = 29;
|
RET[i+1] = 3;
|
||||||
|
RET[i+2] = 3;
|
||||||
|
RET[i+3] = 29;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,8 +8,11 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export void result(uniform float RET[]) {
|
export void result(uniform float RET[]) {
|
||||||
RET[0] = RET[4] = RET[8] = RET[12] = 0x0.0p+0;
|
for (int i = 0; i < programCount; i += 4)
|
||||||
RET[1] = RET[5] = RET[9] = RET[13] = 0x1.62e43p-1;
|
{
|
||||||
RET[2] = RET[6] = RET[10] = RET[14] = 0x1.193ea8p+0;
|
RET[i+0] = 0x0.0p+0;
|
||||||
RET[3] = RET[7] = RET[11] = RET[15] = 0x1.62e43p+0;
|
RET[i+1] = 0x1.62e43p-1;
|
||||||
|
RET[i+2] = 0x1.193ea8p+0;
|
||||||
|
RET[i+3] = 0x1.62e43p+0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ export uniform int width() { return programCount; }
|
|||||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||||
float a = aFOO[programIndex];
|
float a = aFOO[programIndex];
|
||||||
// calculation error 1e-6 is the same as in icc
|
// calculation error 1e-6 is the same as in icc
|
||||||
RET[programIndex] = (exp(-log(1/a)) - a) < 1e-6 ? 1 : 0;
|
RET[programIndex] = (exp(-log(1/a)) - a)/a < 1e-6 ? 1 : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
export void result(uniform float RET[4]) {
|
export void result(uniform float RET[4]) {
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }
|
|||||||
|
|
||||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||||
float a = aFOO[programIndex];
|
float a = aFOO[programIndex];
|
||||||
RET[programIndex] = round(a+.499999);
|
RET[programIndex] = round(a+.49999);
|
||||||
}
|
}
|
||||||
|
|
||||||
export void result(uniform float RET[]) {
|
export void result(uniform float RET[]) {
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }
|
|||||||
|
|
||||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||||
float a = aFOO[programIndex];
|
float a = aFOO[programIndex];
|
||||||
RET[programIndex] = floor(a+.999999);
|
RET[programIndex] = floor(a+.99999);
|
||||||
}
|
}
|
||||||
|
|
||||||
export void result(uniform float RET[]) {
|
export void result(uniform float RET[]) {
|
||||||
|
|||||||
34
tests/uniform-1.ispc
Normal file
34
tests/uniform-1.ispc
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
|
||||||
|
export uniform int width() { return programCount; }
|
||||||
|
|
||||||
|
|
||||||
|
task void f_f_task(uniform float RET[], uniform float aFOO[]) {
|
||||||
|
uniform float val[programCount];
|
||||||
|
for (uniform int i = 0; i < programCount; ++i)
|
||||||
|
val[i] = 0;
|
||||||
|
|
||||||
|
foreach (i = 0 ... programCount)
|
||||||
|
val[i] += aFOO[programCount*taskIndex + i] - 1;
|
||||||
|
|
||||||
|
uniform float sum = 0;
|
||||||
|
for (uniform int i = 0; i < programCount; ++i)
|
||||||
|
sum += val[i];
|
||||||
|
|
||||||
|
if (programIndex < 32/4)
|
||||||
|
RET[programCount/4*taskIndex + programIndex] = sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
export void f_f(uniform float RET[], uniform float aFOO[])
|
||||||
|
{
|
||||||
|
launch[4] f_f_task(RET, aFOO);
|
||||||
|
}
|
||||||
|
task void result_task(uniform float RET[])
|
||||||
|
{
|
||||||
|
const uniform float ret = reduce_add(programIndex + programCount*taskIndex);
|
||||||
|
if (programIndex < 32/4)
|
||||||
|
RET[programCount/4*taskIndex + programIndex] = ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
export void result(uniform float RET[]) {
|
||||||
|
launch[4] result_task(RET);
|
||||||
|
}
|
||||||
10
type.cpp
10
type.cpp
@@ -749,7 +749,7 @@ EnumType::Mangle() const {
|
|||||||
std::string ret;
|
std::string ret;
|
||||||
if (isConst) ret += "C";
|
if (isConst) ret += "C";
|
||||||
ret += variability.MangleString();
|
ret += variability.MangleString();
|
||||||
ret += std::string("enum[") + name + std::string("]");
|
ret += std::string("enum_5B_") + name + std::string("_5C_");
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1420,7 +1420,7 @@ ArrayType::Mangle() const {
|
|||||||
sprintf(buf, "%d", numElements);
|
sprintf(buf, "%d", numElements);
|
||||||
else
|
else
|
||||||
buf[0] = '\0';
|
buf[0] = '\0';
|
||||||
return s + "[" + buf + "]";
|
return s + "_5B_" + buf + "_5C_";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -2058,12 +2058,12 @@ lMangleStruct(Variability variability, bool isConst, const std::string &name) {
|
|||||||
Assert(variability != Variability::Unbound);
|
Assert(variability != Variability::Unbound);
|
||||||
|
|
||||||
std::string ret;
|
std::string ret;
|
||||||
ret += "s[";
|
ret += "s_5B_";
|
||||||
if (isConst)
|
if (isConst)
|
||||||
ret += "_c_";
|
ret += "_c_";
|
||||||
ret += variability.MangleString();
|
ret += variability.MangleString();
|
||||||
|
|
||||||
ret += name + std::string("]");
|
ret += name + std::string("_5C_");
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -3009,7 +3009,7 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const {
|
|||||||
llvmArgTypes.push_back(LLVMTypes::MaskType);
|
llvmArgTypes.push_back(LLVMTypes::MaskType);
|
||||||
|
|
||||||
std::vector<llvm::Type *> callTypes;
|
std::vector<llvm::Type *> callTypes;
|
||||||
if (isTask) {
|
if (isTask && g->target->getISA() != Target::NVPTX) {
|
||||||
// Tasks take three arguments: a pointer to a struct that holds the
|
// Tasks take three arguments: a pointer to a struct that holds the
|
||||||
// actual task arguments, the thread index, and the total number of
|
// actual task arguments, the thread index, and the total number of
|
||||||
// threads the tasks system has running. (Task arguments are
|
// threads the tasks system has running. (Task arguments are
|
||||||
|
|||||||
Reference in New Issue
Block a user