merged with nvptx
This commit is contained in:
83
builtins.cpp
83
builtins.cpp
@@ -338,11 +338,13 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__all",
|
||||
"__any",
|
||||
"__aos_to_soa3_float",
|
||||
"__aos_to_soa3_float1",
|
||||
"__aos_to_soa3_float16",
|
||||
"__aos_to_soa3_float4",
|
||||
"__aos_to_soa3_float8",
|
||||
"__aos_to_soa3_int32",
|
||||
"__aos_to_soa4_float",
|
||||
"__aos_to_soa4_float1",
|
||||
"__aos_to_soa4_float16",
|
||||
"__aos_to_soa4_float4",
|
||||
"__aos_to_soa4_float8",
|
||||
@@ -351,10 +353,14 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__atomic_add_int64_global",
|
||||
"__atomic_add_uniform_int32_global",
|
||||
"__atomic_add_uniform_int64_global",
|
||||
"__atomic_add_varying_int32_global",
|
||||
"__atomic_add_varying_int64_global",
|
||||
"__atomic_and_int32_global",
|
||||
"__atomic_and_int64_global",
|
||||
"__atomic_and_uniform_int32_global",
|
||||
"__atomic_and_uniform_int64_global",
|
||||
"__atomic_and_varying_int32_global",
|
||||
"__atomic_and_varying_int64_global",
|
||||
"__atomic_compare_exchange_double_global",
|
||||
"__atomic_compare_exchange_float_global",
|
||||
"__atomic_compare_exchange_int32_global",
|
||||
@@ -363,18 +369,30 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__atomic_compare_exchange_uniform_float_global",
|
||||
"__atomic_compare_exchange_uniform_int32_global",
|
||||
"__atomic_compare_exchange_uniform_int64_global",
|
||||
"__atomic_compare_exchange_varying_double_global",
|
||||
"__atomic_compare_exchange_varying_float_global",
|
||||
"__atomic_compare_exchange_varying_int32_global",
|
||||
"__atomic_compare_exchange_varying_int64_global",
|
||||
"__atomic_max_uniform_int32_global",
|
||||
"__atomic_max_uniform_int64_global",
|
||||
"__atomic_min_uniform_int32_global",
|
||||
"__atomic_min_uniform_int64_global",
|
||||
"__atomic_max_varying_int32_global",
|
||||
"__atomic_max_varying_int64_global",
|
||||
"__atomic_min_varying_int32_global",
|
||||
"__atomic_min_varying_int64_global",
|
||||
"__atomic_or_int32_global",
|
||||
"__atomic_or_int64_global",
|
||||
"__atomic_or_uniform_int32_global",
|
||||
"__atomic_or_uniform_int64_global",
|
||||
"__atomic_or_varying_int32_global",
|
||||
"__atomic_or_varying_int64_global",
|
||||
"__atomic_sub_int32_global",
|
||||
"__atomic_sub_int64_global",
|
||||
"__atomic_sub_uniform_int32_global",
|
||||
"__atomic_sub_uniform_int64_global",
|
||||
"__atomic_sub_varying_int32_global",
|
||||
"__atomic_sub_varying_int64_global",
|
||||
"__atomic_swap_double_global",
|
||||
"__atomic_swap_float_global",
|
||||
"__atomic_swap_int32_global",
|
||||
@@ -383,14 +401,28 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__atomic_swap_uniform_float_global",
|
||||
"__atomic_swap_uniform_int32_global",
|
||||
"__atomic_swap_uniform_int64_global",
|
||||
"__atomic_swap_varying_double_global",
|
||||
"__atomic_swap_varying_float_global",
|
||||
"__atomic_swap_varying_int32_global",
|
||||
"__atomic_swap_varying_int64_global",
|
||||
"__atomic_umax_uniform_uint32_global",
|
||||
"__atomic_umax_uniform_uint64_global",
|
||||
"__atomic_umin_uniform_uint32_global",
|
||||
"__atomic_umin_uniform_uint64_global",
|
||||
"__atomic_umax_varying_uint32_global",
|
||||
"__atomic_umax_varying_uint64_global",
|
||||
"__atomic_umin_varying_uint32_global",
|
||||
"__atomic_umin_varying_uint64_global",
|
||||
"__atomic_xor_int32_global",
|
||||
"__atomic_xor_int64_global",
|
||||
"__atomic_xor_uniform_int32_global",
|
||||
"__atomic_xor_uniform_int64_global",
|
||||
"__atomic_xor_uniform_int32_global",
|
||||
"__atomic_xor_uniform_int64_global",
|
||||
"__atomic_xor_varying_int32_global",
|
||||
"__atomic_xor_varying_int64_global",
|
||||
"__atomic_xor_varying_int32_global",
|
||||
"__atomic_xor_varying_int64_global",
|
||||
"__broadcast_double",
|
||||
"__broadcast_float",
|
||||
"__broadcast_i16",
|
||||
@@ -413,6 +445,7 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__do_assert_uniform",
|
||||
"__do_assert_varying",
|
||||
"__do_print",
|
||||
"__do_print_nvptx",
|
||||
"__doublebits_uniform_int64",
|
||||
"__doublebits_varying_int64",
|
||||
"__exclusive_scan_add_double",
|
||||
@@ -427,6 +460,8 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__extract_int32",
|
||||
"__extract_int64",
|
||||
"__extract_int8",
|
||||
"__extract_float",
|
||||
"__extract_double",
|
||||
"__fastmath",
|
||||
"__float_to_half_uniform",
|
||||
"__float_to_half_varying",
|
||||
@@ -443,6 +478,8 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__insert_int32",
|
||||
"__insert_int64",
|
||||
"__insert_int8",
|
||||
"__insert_float",
|
||||
"__insert_double",
|
||||
"__intbits_uniform_double",
|
||||
"__intbits_uniform_float",
|
||||
"__intbits_varying_double",
|
||||
@@ -479,6 +516,7 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__min_varying_uint32",
|
||||
"__min_varying_uint64",
|
||||
"__movmsk",
|
||||
"__movmsk_ptx",
|
||||
"__new_uniform_32rt",
|
||||
"__new_uniform_64rt",
|
||||
"__new_varying32_32rt",
|
||||
@@ -560,11 +598,13 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__shuffle_i64",
|
||||
"__shuffle_i8",
|
||||
"__soa_to_aos3_float",
|
||||
"__soa_to_aos3_float1",
|
||||
"__soa_to_aos3_float16",
|
||||
"__soa_to_aos3_float4",
|
||||
"__soa_to_aos3_float8",
|
||||
"__soa_to_aos3_int32",
|
||||
"__soa_to_aos4_float",
|
||||
"__soa_to_aos4_float1",
|
||||
"__soa_to_aos4_float16",
|
||||
"__soa_to_aos4_float4",
|
||||
"__soa_to_aos4_float8",
|
||||
@@ -622,6 +662,24 @@ lSetInternalFunctions(llvm::Module *module) {
|
||||
"__vec4_add_int32",
|
||||
"__vselect_float",
|
||||
"__vselect_i32",
|
||||
"__program_index",
|
||||
"__program_count",
|
||||
"__warp_index",
|
||||
"__task_index0",
|
||||
"__task_index1",
|
||||
"__task_index2",
|
||||
"__task_index",
|
||||
"__task_count0",
|
||||
"__task_count1",
|
||||
"__task_count2",
|
||||
"__task_count",
|
||||
"__cvt_loc2gen",
|
||||
"__cvt_loc2gen_var",
|
||||
"__cvt_const2gen",
|
||||
"__puts_nvptx",
|
||||
"ISPCAlloc",
|
||||
"ISPCLaunch",
|
||||
"ISPCSync",
|
||||
};
|
||||
|
||||
int count = sizeof(names) / sizeof(names[0]);
|
||||
@@ -694,6 +752,7 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
|
||||
g->target->getISA() != Target::NEON16 &&
|
||||
g->target->getISA() != Target::NEON8)
|
||||
#endif // !__arm__
|
||||
if (g->target->getISA() != Target::NVPTX)
|
||||
{
|
||||
Assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
|
||||
mTriple.getArch() == bcTriple.getArch());
|
||||
@@ -855,7 +914,17 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
// Next, add the target's custom implementations of the various needed
|
||||
// builtin functions (e.g. __masked_store_32(), etc).
|
||||
switch (g->target->getISA()) {
|
||||
|
||||
case Target::NVPTX:
|
||||
{
|
||||
if (runtime32) {
|
||||
fprintf(stderr, "Unforetunatly 32bit targets are supported at the moment .. \n");
|
||||
assert(0);
|
||||
}
|
||||
else {
|
||||
EXPORT_MODULE(builtins_bitcode_nvptx_64bit);
|
||||
}
|
||||
break;
|
||||
};
|
||||
#ifdef ISPC_ARM_ENABLED
|
||||
case Target::NEON8: {
|
||||
if (runtime32) {
|
||||
@@ -1125,7 +1194,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
}
|
||||
|
||||
// define the 'programCount' builtin variable
|
||||
lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable);
|
||||
if (g->target->getISA() != Target::NVPTX)
|
||||
{
|
||||
lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable);
|
||||
}
|
||||
else
|
||||
{
|
||||
lDefineConstantInt("programCount", 32, module, symbolTable);
|
||||
}
|
||||
|
||||
// define the 'programIndex' builtin
|
||||
lDefineProgramIndex(module, symbolTable);
|
||||
@@ -1155,6 +1231,9 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
|
||||
lDefineConstantInt("__have_native_rcpd", g->target->hasRcpd(),
|
||||
module, symbolTable);
|
||||
|
||||
lDefineConstantInt("__is_nvptx_target", (int)(g->target->getISA() == Target::NVPTX),
|
||||
module, symbolTable);
|
||||
|
||||
if (g->forceAlignment != -1) {
|
||||
llvm::GlobalVariable *alignment = module->getGlobalVariable("memory_alignment", true);
|
||||
alignment->setInitializer(LLVMInt32(g->forceAlignment));
|
||||
|
||||
130
builtins/__do_print_nvptx.cu
Normal file
130
builtins/__do_print_nvptx.cu
Normal file
@@ -0,0 +1,130 @@
|
||||
#include <cstdio>
|
||||
|
||||
#define PRINT_BUF_SIZE 4096
|
||||
#define uint64_t unsigned long long
|
||||
|
||||
static __device__ size_t d_strlen(const char *str)
|
||||
{
|
||||
const char *s;
|
||||
|
||||
for (s = str; *s; ++s)
|
||||
;
|
||||
return (s - str);
|
||||
}
|
||||
|
||||
static __device__ char* d_strncat(char *dest, const char *src, size_t n)
|
||||
{
|
||||
size_t dest_len = d_strlen(dest);
|
||||
size_t i;
|
||||
|
||||
for (i = 0 ; i < n && src[i] != '\0' ; i++)
|
||||
dest[dest_len + i] = src[i];
|
||||
dest[dest_len + i] = '\0';
|
||||
|
||||
return dest;
|
||||
}
|
||||
|
||||
#define APPEND(str) \
|
||||
do { \
|
||||
int offset = bufp - &printString[0]; \
|
||||
*bufp = '\0'; \
|
||||
d_strncat(bufp, str, PRINT_BUF_SIZE-offset); \
|
||||
bufp += d_strlen(str); \
|
||||
if (bufp >= &printString[PRINT_BUF_SIZE]) \
|
||||
goto done; \
|
||||
} while (0) /* eat semicolon */
|
||||
|
||||
|
||||
#define PRINT_SCALAR(fmt, type) \
|
||||
sprintf(tmpBuf, fmt, *((type *)ptr)); \
|
||||
APPEND(tmpBuf); \
|
||||
break
|
||||
|
||||
#define PRINT_VECTOR(fmt, type) \
|
||||
*bufp++ = '['; \
|
||||
if (bufp == &printString[PRINT_BUF_SIZE]) break; \
|
||||
for (int i = 0; i < width; ++i) { \
|
||||
/* only print the value if the current lane is executing */ \
|
||||
type val0 = *((type*)ptr); \
|
||||
type val = val0; \
|
||||
if (mask & (1ull<<i)) \
|
||||
sprintf(tmpBuf, fmt, val); \
|
||||
else \
|
||||
sprintf(tmpBuf, "(( * )) "); \
|
||||
APPEND(tmpBuf); \
|
||||
*bufp++ = (i != width-1 ? ',' : ']'); \
|
||||
} \
|
||||
break
|
||||
|
||||
extern "C"
|
||||
__device__ void __do_print_nvptx(const char *format, const char *types, int width, uint64_t mask,
|
||||
void **args) {
|
||||
char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
|
||||
char *bufp = &printString[0];
|
||||
char tmpBuf[256];
|
||||
const char trueBuf[] = "true";
|
||||
const char falseBuf[] = "false";
|
||||
|
||||
int argCount = 0;
|
||||
while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
|
||||
// Format strings are just single percent signs.
|
||||
if (*format != '%') {
|
||||
*bufp++ = *format;
|
||||
}
|
||||
else {
|
||||
if (*types) {
|
||||
void *ptr = args[argCount++];
|
||||
// Based on the encoding in the types string, cast the
|
||||
// value appropriately and print it with a reasonable
|
||||
// printf() formatting string.
|
||||
switch (*types) {
|
||||
case 'b': {
|
||||
const char *tmpBuf1 = *((bool *)ptr) ? trueBuf : falseBuf;
|
||||
APPEND(tmpBuf1);
|
||||
break;
|
||||
}
|
||||
case 'B': {
|
||||
*bufp++ = '[';
|
||||
if (bufp == &printString[PRINT_BUF_SIZE])
|
||||
break;
|
||||
for (int i = 0; i < width; ++i) {
|
||||
bool val0 = *((bool*)ptr);
|
||||
bool val = val0; \
|
||||
if (mask & (1ull << i)) {
|
||||
const char *tmpBuf1 = val ? trueBuf : falseBuf;
|
||||
APPEND(tmpBuf1);
|
||||
}
|
||||
else
|
||||
APPEND("_________");
|
||||
*bufp++ = (i != width-1) ? ',' : ']';
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 'i': PRINT_SCALAR("%d", int);
|
||||
case 'I': PRINT_VECTOR("%d", int);
|
||||
case 'u': PRINT_SCALAR("%u", unsigned int);
|
||||
case 'U': PRINT_VECTOR("%u", unsigned int);
|
||||
case 'f': PRINT_SCALAR("%f", float);
|
||||
case 'F': PRINT_VECTOR("%f", float);
|
||||
case 'l': PRINT_SCALAR("%lld", long long);
|
||||
case 'L': PRINT_VECTOR("%lld", long long);
|
||||
case 'v': PRINT_SCALAR("%llu", unsigned long long);
|
||||
case 'V': PRINT_VECTOR("%llu", unsigned long long);
|
||||
case 'd': PRINT_SCALAR("%f", double);
|
||||
case 'D': PRINT_VECTOR("%f", double);
|
||||
case 'p': PRINT_SCALAR("%p", void *);
|
||||
case 'P': PRINT_VECTOR("%p", void *);
|
||||
default:
|
||||
APPEND("UNKNOWN TYPE ");
|
||||
*bufp++ = *types;
|
||||
}
|
||||
++types;
|
||||
}
|
||||
}
|
||||
++format;
|
||||
}
|
||||
|
||||
done:
|
||||
*bufp = '\n'; bufp++;
|
||||
*bufp = '\0';
|
||||
}
|
||||
@@ -185,6 +185,81 @@ void __do_print(const char *format, const char *types, int width, uint64_t mask,
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
/* this is print for PTX target only */
|
||||
int __puts_nvptx(const char *);
|
||||
void __do_print_nvptx(const char *format, const char *types, int width, uint64_t mask,
|
||||
void **args) {
|
||||
#if 0
|
||||
char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
|
||||
char *bufp = &printString[0];
|
||||
char tmpBuf[256];
|
||||
|
||||
int argCount = 0;
|
||||
while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
|
||||
// Format strings are just single percent signs.
|
||||
if (*format != '%') {
|
||||
*bufp++ = *format;
|
||||
}
|
||||
else {
|
||||
if (*types) {
|
||||
void *ptr = args[argCount++];
|
||||
// Based on the encoding in the types string, cast the
|
||||
// value appropriately and print it with a reasonable
|
||||
// printf() formatting string.
|
||||
switch (*types) {
|
||||
case 'b': {
|
||||
sprintf(tmpBuf, "%s", *((Bool *)ptr) ? "true" : "false");
|
||||
APPEND(tmpBuf);
|
||||
break;
|
||||
}
|
||||
case 'B': {
|
||||
*bufp++ = '[';
|
||||
if (bufp == &printString[PRINT_BUF_SIZE])
|
||||
break;
|
||||
for (int i = 0; i < width; ++i) {
|
||||
if (mask & (1ull << i)) {
|
||||
sprintf(tmpBuf, "%s", ((Bool *)ptr)[i] ? "true" : "false");
|
||||
APPEND(tmpBuf);
|
||||
}
|
||||
else
|
||||
APPEND("_________");
|
||||
*bufp++ = (i != width-1) ? ',' : ']';
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 'i': PRINT_SCALAR("%d", int);
|
||||
case 'I': PRINT_VECTOR("%d", int);
|
||||
case 'u': PRINT_SCALAR("%u", unsigned int);
|
||||
case 'U': PRINT_VECTOR("%u", unsigned int);
|
||||
case 'f': PRINT_SCALAR("%f", float);
|
||||
case 'F': PRINT_VECTOR("%f", float);
|
||||
case 'l': PRINT_SCALAR("%lld", long long);
|
||||
case 'L': PRINT_VECTOR("%lld", long long);
|
||||
case 'v': PRINT_SCALAR("%llu", unsigned long long);
|
||||
case 'V': PRINT_VECTOR("%llu", unsigned long long);
|
||||
case 'd': PRINT_SCALAR("%f", double);
|
||||
case 'D': PRINT_VECTOR("%f", double);
|
||||
case 'p': PRINT_SCALAR("%p", void *);
|
||||
case 'P': PRINT_VECTOR("%p", void *);
|
||||
default:
|
||||
APPEND("UNKNOWN TYPE ");
|
||||
*bufp++ = *types;
|
||||
}
|
||||
++types;
|
||||
}
|
||||
}
|
||||
++format;
|
||||
}
|
||||
|
||||
done:
|
||||
*bufp = '\n'; bufp++;
|
||||
*bufp = '\0';
|
||||
__puts_nvptx(printString);
|
||||
#else
|
||||
__puts_nvptx("---nvptx printing is not support---\n");
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
int __num_cores() {
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
|
||||
@@ -288,4 +288,5 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
;; int8/int16 builtins
|
||||
|
||||
define_avgs()
|
||||
declare_nvptx()
|
||||
|
||||
|
||||
@@ -10,6 +10,7 @@ packed_load_and_store()
|
||||
scans()
|
||||
int64minmax()
|
||||
aossoa()
|
||||
declare_nvptx()
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;; masked store
|
||||
|
||||
@@ -392,4 +392,4 @@ declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind
|
||||
;; int8/int16 builtins
|
||||
|
||||
define_avgs()
|
||||
|
||||
declare_nvptx()
|
||||
|
||||
@@ -344,3 +344,4 @@ packed_load_and_store(4)
|
||||
;; prefetch
|
||||
|
||||
define_prefetches()
|
||||
declare_nvptx()
|
||||
|
||||
2235
builtins/target-nvptx.ll
Normal file
2235
builtins/target-nvptx.ll
Normal file
File diff suppressed because it is too large
Load Diff
@@ -274,3 +274,4 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
|
||||
|
||||
define_avgs()
|
||||
|
||||
declare_nvptx()
|
||||
|
||||
@@ -278,3 +278,5 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
|
||||
%call = call i64 @llvm.ctpop.i64(i64 %0)
|
||||
ret i64 %call
|
||||
}
|
||||
|
||||
declare_nvptx()
|
||||
|
||||
3417
builtins/util-nvptx.m4
Normal file
3417
builtins/util-nvptx.m4
Normal file
File diff suppressed because it is too large
Load Diff
@@ -4541,3 +4541,60 @@ define(`rcpd_decl', `
|
||||
declare double @__rcp_uniform_double(double)
|
||||
declare <WIDTH x double> @__rcp_varying_double(<WIDTH x double>)
|
||||
')
|
||||
|
||||
define(`declare_nvptx',
|
||||
`
|
||||
declare i32 @__program_index() nounwind readnone alwaysinline
|
||||
declare i32 @__program_count() nounwind readnone alwaysinline
|
||||
declare i32 @__warp_index() nounwind readnone alwaysinline
|
||||
declare i32 @__task_index0() nounwind readnone alwaysinline
|
||||
declare i32 @__task_index1() nounwind readnone alwaysinline
|
||||
declare i32 @__task_index2() nounwind readnone alwaysinline
|
||||
declare i32 @__task_index() nounwind readnone alwaysinline
|
||||
declare i32 @__task_count0() nounwind readnone alwaysinline
|
||||
declare i32 @__task_count1() nounwind readnone alwaysinline
|
||||
declare i32 @__task_count2() nounwind readnone alwaysinline
|
||||
declare i32 @__task_count() nounwind readnone alwaysinline
|
||||
declare i64* @__cvt_loc2gen(i64 addrspace(3)*) nounwind readnone alwaysinline
|
||||
declare i64* @__cvt_const2gen(i64 addrspace(4)*) nounwind readnone alwaysinline
|
||||
declare i64* @__cvt_loc2gen_var(i64 addrspace(3)*) nounwind readnone alwaysinline
|
||||
declare i64 @__movmsk_ptx(<WIDTH x i1>) nounwind readnone alwaysinline;
|
||||
')
|
||||
|
||||
define(`global_atomic_varying',`
|
||||
declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline
|
||||
')
|
||||
|
||||
define(`global_atomic_cas_varying',`
|
||||
declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %cmp, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline
|
||||
')
|
||||
|
||||
global_atomic_cas_varying(WIDTH, compare_exchange, i32, int32)
|
||||
global_atomic_cas_varying(WIDTH, compare_exchange, i64, int64)
|
||||
global_atomic_cas_varying(WIDTH, compare_exchange, float, float)
|
||||
global_atomic_cas_varying(WIDTH, compare_exchange, double, double)
|
||||
|
||||
global_atomic_varying(WIDTH, swap, i32, int32)
|
||||
global_atomic_varying(WIDTH, swap, i64, int64)
|
||||
global_atomic_varying(WIDTH, swap, float, float)
|
||||
global_atomic_varying(WIDTH, swap, double, double)
|
||||
|
||||
global_atomic_varying(WIDTH, add, i32, int32)
|
||||
global_atomic_varying(WIDTH, sub, i32, int32)
|
||||
global_atomic_varying(WIDTH, and, i32, int32)
|
||||
global_atomic_varying(WIDTH, or, i32, int32)
|
||||
global_atomic_varying(WIDTH, xor, i32, int32)
|
||||
global_atomic_varying(WIDTH, min, i32, int32)
|
||||
global_atomic_varying(WIDTH, max, i32, int32)
|
||||
global_atomic_varying(WIDTH, umin, i32, uint32)
|
||||
global_atomic_varying(WIDTH, umax, i32, uint32)
|
||||
|
||||
global_atomic_varying(WIDTH, add, i64, int64)
|
||||
global_atomic_varying(WIDTH, sub, i64, int64)
|
||||
global_atomic_varying(WIDTH, and, i64, int64)
|
||||
global_atomic_varying(WIDTH, or, i64, int64)
|
||||
global_atomic_varying(WIDTH, xor, i64, int64)
|
||||
global_atomic_varying(WIDTH, min, i64, int64)
|
||||
global_atomic_varying(WIDTH, max, i64, int64)
|
||||
global_atomic_varying(WIDTH, umin, i64, uint64)
|
||||
global_atomic_varying(WIDTH, umax, i64, uint64)
|
||||
|
||||
338
ctx.cpp
338
ctx.cpp
@@ -57,6 +57,8 @@
|
||||
#include <llvm/IR/Instructions.h>
|
||||
#include <llvm/IR/DerivedTypes.h>
|
||||
#endif
|
||||
#include <llvm/Support/raw_ostream.h>
|
||||
#include <llvm/Support/FormattedStream.h>
|
||||
|
||||
/** This is a small utility structure that records information related to one
|
||||
level of nested control flow. It's mostly used in correctly restoring
|
||||
@@ -1371,29 +1373,97 @@ FunctionEmitContext::None(llvm::Value *mask) {
|
||||
|
||||
|
||||
llvm::Value *
|
||||
FunctionEmitContext::LaneMask(llvm::Value *v) {
|
||||
// Call the target-dependent movmsk function to turn the vector mask
|
||||
// into an i64 value
|
||||
std::vector<Symbol *> mm;
|
||||
m->symbolTable->LookupFunction("__movmsk", &mm);
|
||||
if (g->target->getMaskBitCount() == 1)
|
||||
AssertPos(currentPos, mm.size() == 1);
|
||||
else
|
||||
// There should be one with signed int signature, one unsigned int.
|
||||
AssertPos(currentPos, mm.size() == 2);
|
||||
// We can actually call either one, since both are i32s as far as
|
||||
// LLVM's type system is concerned...
|
||||
llvm::Function *fmm = mm[0]->function;
|
||||
return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk"));
|
||||
FunctionEmitContext::LaneMask(llvm::Value *v)
|
||||
{
|
||||
#if 1 /* this makes mandelbrot example slower, why ?!? */
|
||||
const char *__movmsk = g->target->getISA() == Target::NVPTX ? "__movmsk_ptx" : "__movmsk";
|
||||
#else
|
||||
const char *__movmsk = "__movmsk";
|
||||
#endif
|
||||
// Call the target-dependent movmsk function to turn the vector mask
|
||||
// into an i64 value
|
||||
std::vector<Symbol *> mm;
|
||||
m->symbolTable->LookupFunction(__movmsk, &mm);
|
||||
if (g->target->getMaskBitCount() == 1)
|
||||
AssertPos(currentPos, mm.size() == 1);
|
||||
else
|
||||
// There should be one with signed int signature, one unsigned int.
|
||||
AssertPos(currentPos, mm.size() == 2);
|
||||
// We can actually call either one, since both are i32s as far as
|
||||
// LLVM's type system is concerned...
|
||||
llvm::Function *fmm = mm[0]->function;
|
||||
return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk"));
|
||||
}
|
||||
|
||||
bool lAppendInsertExtractName(llvm::Value *vector, std::string &funcName)
|
||||
{
|
||||
llvm::Type *type = vector->getType();
|
||||
if (type == LLVMTypes::Int8VectorType)
|
||||
funcName += "_int8";
|
||||
else if (type == LLVMTypes::Int16VectorType)
|
||||
funcName += "_int16";
|
||||
else if (type == LLVMTypes::Int32VectorType)
|
||||
funcName += "_int32";
|
||||
else if (type == LLVMTypes::Int64VectorType)
|
||||
funcName += "_int64";
|
||||
else if (type == LLVMTypes::FloatVectorType)
|
||||
funcName += "_float";
|
||||
else if (type == LLVMTypes::DoubleVectorType)
|
||||
funcName += "_double";
|
||||
else
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
llvm::Value*
|
||||
FunctionEmitContext::Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar)
|
||||
{
|
||||
std::string funcName = "__insert";
|
||||
assert(lAppendInsertExtractName(vector, funcName));
|
||||
assert(lane->getType() == LLVMTypes::Int32Type);
|
||||
|
||||
llvm::Function *func = m->module->getFunction(funcName.c_str());
|
||||
assert(func != NULL);
|
||||
std::vector<llvm::Value *> args;
|
||||
args.push_back(vector);
|
||||
args.push_back(lane);
|
||||
args.push_back(scalar);
|
||||
llvm::Value *ret = llvm::CallInst::Create(func, args, LLVMGetName(vector, funcName.c_str()), GetCurrentBasicBlock());
|
||||
return ret;
|
||||
}
|
||||
|
||||
llvm::Value*
|
||||
FunctionEmitContext::Extract(llvm::Value *vector, llvm::Value *lane)
|
||||
{
|
||||
std::string funcName = "__extract";
|
||||
assert(lAppendInsertExtractName(vector, funcName));
|
||||
assert(lane->getType() == LLVMTypes::Int32Type);
|
||||
|
||||
llvm::Function *func = m->module->getFunction(funcName.c_str());
|
||||
assert(func != NULL);
|
||||
std::vector<llvm::Value *> args;
|
||||
args.push_back(vector);
|
||||
args.push_back(lane);
|
||||
llvm::Value *ret = llvm::CallInst::Create(func, args, LLVMGetName(vector, funcName.c_str()), GetCurrentBasicBlock());
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
llvm::Value *
|
||||
FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
|
||||
if (g->target->getISA() == Target::NVPTX)
|
||||
{
|
||||
// Compare the two masks to get a vector of i1s
|
||||
llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
|
||||
v1, v2, "v1==v2");
|
||||
return ExtractInst(cmp, 0); /* this works without calling All(..) in PTX. Why ?!? */
|
||||
}
|
||||
else
|
||||
{
|
||||
#if 0
|
||||
// Compare the two masks to get a vector of i1s
|
||||
llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
|
||||
v1, v2, "v1==v2");
|
||||
v1, v2, "v1==v2");
|
||||
// Turn that into a bool vector type (often i32s)
|
||||
cmp = I1VecToBoolVec(cmp);
|
||||
// And see if it's all on
|
||||
@@ -1402,22 +1472,34 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
|
||||
llvm::Value *mm1 = LaneMask(v1);
|
||||
llvm::Value *mm2 = LaneMask(v2);
|
||||
return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mm1, mm2,
|
||||
LLVMGetName("equal", v1, v2));
|
||||
LLVMGetName("equal", v1, v2));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
llvm::Value *
|
||||
FunctionEmitContext::ProgramIndexVector(bool is32bits) {
|
||||
llvm::SmallVector<llvm::Constant*, 16> array;
|
||||
for (int i = 0; i < g->target->getVectorWidth() ; ++i) {
|
||||
llvm::Constant *C = is32bits ? LLVMInt32(i) : LLVMInt64(i);
|
||||
array.push_back(C);
|
||||
llvm::Constant *C = is32bits ? LLVMInt32(i) : LLVMInt64(i);
|
||||
array.push_back(C);
|
||||
}
|
||||
|
||||
llvm::Constant* index = llvm::ConstantVector::get(array);
|
||||
|
||||
return index;
|
||||
}
|
||||
llvm::Value *
|
||||
FunctionEmitContext::ProgramIndexVectorPTX(bool is32bits) {
|
||||
llvm::Function *func_program_index = m->module->getFunction("__program_index");
|
||||
llvm::Value *__program_index = CallInst(func_program_index, NULL, std::vector<llvm::Value*>(), "foreach__program_indexS");
|
||||
llvm::Value *index = InsertInst(llvm::UndefValue::get(LLVMTypes::Int32VectorType), __program_index, 0, "foreach__program_indexV");
|
||||
#if 0
|
||||
if (!is32bits)
|
||||
index = ZExtInst(index, LLVMTypes::Int64VectandType);
|
||||
#endif
|
||||
return index;
|
||||
}
|
||||
|
||||
|
||||
llvm::Value *
|
||||
@@ -1830,6 +1912,7 @@ FunctionEmitContext::PtrToIntInst(llvm::Value *value, const char *name) {
|
||||
|
||||
if (name == NULL)
|
||||
name = LLVMGetName(value, "_ptr2int");
|
||||
|
||||
llvm::Type *type = LLVMTypes::PointerIntType;
|
||||
llvm::Instruction *inst = new llvm::PtrToIntInst(value, type, name, bblock);
|
||||
AddDebugPos(inst);
|
||||
@@ -3523,98 +3606,199 @@ llvm::Value *
|
||||
FunctionEmitContext::LaunchInst(llvm::Value *callee,
|
||||
std::vector<llvm::Value *> &argVals,
|
||||
llvm::Value *launchCount[3]){
|
||||
if (callee == NULL) {
|
||||
|
||||
if (g->target->getISA() != Target::NVPTX)
|
||||
{
|
||||
if (callee == NULL) {
|
||||
AssertPos(currentPos, m->errorCount > 0);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
launchedTasks = true;
|
||||
launchedTasks = true;
|
||||
|
||||
AssertPos(currentPos, llvm::isa<llvm::Function>(callee));
|
||||
llvm::Type *argType =
|
||||
AssertPos(currentPos, llvm::isa<llvm::Function>(callee));
|
||||
llvm::Type *argType =
|
||||
(llvm::dyn_cast<llvm::Function>(callee))->arg_begin()->getType();
|
||||
AssertPos(currentPos, llvm::PointerType::classof(argType));
|
||||
llvm::PointerType *pt =
|
||||
AssertPos(currentPos, llvm::PointerType::classof(argType));
|
||||
llvm::PointerType *pt =
|
||||
llvm::dyn_cast<llvm::PointerType>(argType);
|
||||
AssertPos(currentPos, llvm::StructType::classof(pt->getElementType()));
|
||||
llvm::StructType *argStructType =
|
||||
AssertPos(currentPos, llvm::StructType::classof(pt->getElementType()));
|
||||
llvm::StructType *argStructType =
|
||||
static_cast<llvm::StructType *>(pt->getElementType());
|
||||
|
||||
llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
|
||||
AssertPos(currentPos, falloc != NULL);
|
||||
llvm::Value *structSize = g->target->SizeOf(argStructType, bblock);
|
||||
if (structSize->getType() != LLVMTypes::Int64Type)
|
||||
llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
|
||||
AssertPos(currentPos, falloc != NULL);
|
||||
llvm::Value *structSize = g->target->SizeOf(argStructType, bblock);
|
||||
if (structSize->getType() != LLVMTypes::Int64Type)
|
||||
// ISPCAlloc expects the size as an uint64_t, but on 32-bit
|
||||
// targets, SizeOf returns a 32-bit value
|
||||
structSize = ZExtInst(structSize, LLVMTypes::Int64Type,
|
||||
"struct_size_to_64");
|
||||
int align = 4 * RoundUpPow2(g->target->getNativeVectorWidth());
|
||||
"struct_size_to_64");
|
||||
int align = 4 * RoundUpPow2(g->target->getNativeVectorWidth());
|
||||
|
||||
std::vector<llvm::Value *> allocArgs;
|
||||
allocArgs.push_back(launchGroupHandlePtr);
|
||||
allocArgs.push_back(structSize);
|
||||
allocArgs.push_back(LLVMInt32(align));
|
||||
llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr");
|
||||
llvm::Value *argmem = BitCastInst(voidmem, pt);
|
||||
std::vector<llvm::Value *> allocArgs;
|
||||
allocArgs.push_back(launchGroupHandlePtr);
|
||||
allocArgs.push_back(structSize);
|
||||
allocArgs.push_back(LLVMInt32(align));
|
||||
llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr");
|
||||
llvm::Value *argmem = BitCastInst(voidmem, pt);
|
||||
|
||||
// Copy the values of the parameters into the appropriate place in
|
||||
// the argument block
|
||||
for (unsigned int i = 0; i < argVals.size(); ++i) {
|
||||
// Copy the values of the parameters into the appropriate place in
|
||||
// the argument block
|
||||
for (unsigned int i = 0; i < argVals.size(); ++i) {
|
||||
llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg");
|
||||
// don't need to do masked store here, I think
|
||||
StoreInst(argVals[i], ptr);
|
||||
}
|
||||
}
|
||||
|
||||
if (argStructType->getNumElements() == argVals.size() + 1) {
|
||||
if (argStructType->getNumElements() == argVals.size() + 1) {
|
||||
// copy in the mask
|
||||
llvm::Value *mask = GetFullMask();
|
||||
llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL,
|
||||
"funarg_mask");
|
||||
"funarg_mask");
|
||||
StoreInst(mask, ptr);
|
||||
}
|
||||
}
|
||||
|
||||
// And emit the call to the user-supplied task launch function, passing
|
||||
// a pointer to the task function being called and a pointer to the
|
||||
// argument block we just filled in
|
||||
llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
|
||||
llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
|
||||
AssertPos(currentPos, flaunch != NULL);
|
||||
std::vector<llvm::Value *> args;
|
||||
args.push_back(launchGroupHandlePtr);
|
||||
args.push_back(fptr);
|
||||
args.push_back(voidmem);
|
||||
args.push_back(launchCount[0]);
|
||||
args.push_back(launchCount[1]);
|
||||
args.push_back(launchCount[2]);
|
||||
return CallInst(flaunch, NULL, args, "");
|
||||
// And emit the call to the user-supplied task launch function, passing
|
||||
// a pointer to the task function being called and a pointer to the
|
||||
// argument block we just filled in
|
||||
llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
|
||||
llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
|
||||
AssertPos(currentPos, flaunch != NULL);
|
||||
std::vector<llvm::Value *> args;
|
||||
args.push_back(launchGroupHandlePtr);
|
||||
args.push_back(fptr);
|
||||
args.push_back(voidmem);
|
||||
args.push_back(launchCount[0]);
|
||||
args.push_back(launchCount[1]);
|
||||
args.push_back(launchCount[2]);
|
||||
return CallInst(flaunch, NULL, args, "");
|
||||
}
|
||||
else /* NVPTX */
|
||||
{
|
||||
if (callee == NULL) {
|
||||
AssertPos(currentPos, m->errorCount > 0);
|
||||
return NULL;
|
||||
}
|
||||
launchedTasks = true;
|
||||
|
||||
AssertPos(currentPos, llvm::isa<llvm::Function>(callee));
|
||||
std::vector<llvm::Type*> argTypes;
|
||||
|
||||
llvm::Function *F = llvm::dyn_cast<llvm::Function>(callee);
|
||||
const unsigned int nArgs = F->arg_size();
|
||||
llvm::Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
|
||||
for (; I != E; ++I)
|
||||
argTypes.push_back(I->getType());
|
||||
llvm::Type *st = llvm::StructType::get(*g->ctx, argTypes);
|
||||
llvm::StructType *argStructType = static_cast<llvm::StructType *>(st);
|
||||
llvm::Value *structSize = g->target->SizeOf(argStructType, bblock);
|
||||
if (structSize->getType() != LLVMTypes::Int64Type)
|
||||
structSize = ZExtInst(structSize, LLVMTypes::Int64Type,
|
||||
"struct_size_to_64");
|
||||
|
||||
const int align = 8;
|
||||
llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
|
||||
AssertPos(currentPos, falloc != NULL);
|
||||
std::vector<llvm::Value *> allocArgs;
|
||||
allocArgs.push_back(launchGroupHandlePtr);
|
||||
allocArgs.push_back(structSize);
|
||||
allocArgs.push_back(LLVMInt32(align));
|
||||
llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr");
|
||||
llvm::Value *voidi64 = PtrToIntInst(voidmem, "args_i64");
|
||||
llvm::BasicBlock* if_true = CreateBasicBlock("if_true");
|
||||
llvm::BasicBlock* if_false = CreateBasicBlock("if_false");
|
||||
|
||||
/* check if the pointer returned by ISPCAlloc is not NULL
|
||||
* --------------
|
||||
* this is a workaround for not checking the value of programIndex
|
||||
* because ISPCAlloc will return NULL pointer for all programIndex > 0
|
||||
* of course, if ISPAlloc fails to get parameter buffer, the pointer for programIndex = 0
|
||||
* will also be NULL
|
||||
* This check must be added, and also rewrite the code to make it less opaque
|
||||
*/
|
||||
llvm::Value* cmp1 = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, voidi64, LLVMInt64(0), "cmp1");
|
||||
BranchInst(if_true, if_false, cmp1);
|
||||
|
||||
/**********************/
|
||||
bblock = if_true;
|
||||
|
||||
// label_if_then block:
|
||||
llvm::Type *pt = llvm::PointerType::getUnqual(st);
|
||||
llvm::Value *argmem = BitCastInst(voidmem, pt);
|
||||
for (unsigned int i = 0; i < argVals.size(); ++i)
|
||||
{
|
||||
llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg");
|
||||
// don't need to do masked store here, I think
|
||||
StoreInst(argVals[i], ptr);
|
||||
}
|
||||
if (nArgs == argVals.size() + 1) {
|
||||
// copy in the mask
|
||||
llvm::Value *mask = GetFullMask();
|
||||
llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL,
|
||||
"funarg_mask");
|
||||
StoreInst(mask, ptr);
|
||||
}
|
||||
BranchInst(if_false);
|
||||
|
||||
/**********************/
|
||||
bblock = if_false;
|
||||
|
||||
llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
|
||||
llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
|
||||
AssertPos(currentPos, flaunch != NULL);
|
||||
std::vector<llvm::Value *> args;
|
||||
args.push_back(launchGroupHandlePtr);
|
||||
args.push_back(fptr);
|
||||
args.push_back(voidmem);
|
||||
args.push_back(launchCount[0]);
|
||||
args.push_back(launchCount[1]);
|
||||
args.push_back(launchCount[2]);
|
||||
llvm::Value *ret = CallInst(flaunch, NULL, args, "");
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
FunctionEmitContext::SyncInst() {
|
||||
llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr);
|
||||
llvm::Value *nullPtrValue =
|
||||
if (g->target->getISA() != Target::NVPTX)
|
||||
{
|
||||
llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr);
|
||||
llvm::Value *nullPtrValue =
|
||||
llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
|
||||
llvm::Value *nonNull = CmpInst(llvm::Instruction::ICmp,
|
||||
llvm::CmpInst::ICMP_NE,
|
||||
launchGroupHandle, nullPtrValue);
|
||||
llvm::BasicBlock *bSync = CreateBasicBlock("call_sync");
|
||||
llvm::BasicBlock *bPostSync = CreateBasicBlock("post_sync");
|
||||
BranchInst(bSync, bPostSync, nonNull);
|
||||
llvm::Value *nonNull = CmpInst(llvm::Instruction::ICmp,
|
||||
llvm::CmpInst::ICMP_NE,
|
||||
launchGroupHandle, nullPtrValue);
|
||||
llvm::BasicBlock *bSync = CreateBasicBlock("call_sync");
|
||||
llvm::BasicBlock *bPostSync = CreateBasicBlock("post_sync");
|
||||
BranchInst(bSync, bPostSync, nonNull);
|
||||
|
||||
SetCurrentBasicBlock(bSync);
|
||||
llvm::Function *fsync = m->module->getFunction("ISPCSync");
|
||||
if (fsync == NULL)
|
||||
SetCurrentBasicBlock(bSync);
|
||||
llvm::Function *fsync = m->module->getFunction("ISPCSync");
|
||||
if (fsync == NULL)
|
||||
FATAL("Couldn't find ISPCSync declaration?!");
|
||||
CallInst(fsync, NULL, launchGroupHandle, "");
|
||||
CallInst(fsync, NULL, launchGroupHandle, "");
|
||||
|
||||
// zero out the handle so that if ISPCLaunch is called again in this
|
||||
// function, it knows it's starting out from scratch
|
||||
StoreInst(nullPtrValue, launchGroupHandlePtr);
|
||||
// zero out the handle so that if ISPCLaunch is called again in this
|
||||
// function, it knows it's starting out from scratch
|
||||
StoreInst(nullPtrValue, launchGroupHandlePtr);
|
||||
|
||||
BranchInst(bPostSync);
|
||||
BranchInst(bPostSync);
|
||||
|
||||
SetCurrentBasicBlock(bPostSync);
|
||||
SetCurrentBasicBlock(bPostSync);
|
||||
}
|
||||
else /* NVPTX: don't do test, just call sync */
|
||||
{
|
||||
llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr);
|
||||
llvm::Value *nullPtrValue =
|
||||
llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
|
||||
llvm::Function *fsync = m->module->getFunction("ISPCSync");
|
||||
if (fsync == NULL)
|
||||
FATAL("Couldn't find ISPCSync declaration?!");
|
||||
CallInst(fsync, NULL, launchGroupHandle, "");
|
||||
StoreInst(nullPtrValue, launchGroupHandlePtr);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
8
ctx.h
8
ctx.h
@@ -291,6 +291,13 @@ public:
|
||||
of the mask is on. */
|
||||
llvm::Value *LaneMask(llvm::Value *mask);
|
||||
|
||||
|
||||
/** Issues a call to __insert_int8/int16/int32/int64/float/double */
|
||||
llvm::Value* Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar);
|
||||
/** Issues a call to __extract_int8/int16/int32/int64/float/double */
|
||||
llvm::Value* Extract(llvm::Value *vector, llvm::Value *lane);
|
||||
|
||||
|
||||
/** Given two masks of type LLVMTypes::MaskType, return an i1 value
|
||||
that indicates whether the two masks are equal. */
|
||||
llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2);
|
||||
@@ -298,6 +305,7 @@ public:
|
||||
/** Generate ConstantVector, which contains ProgramIndex, i.e.
|
||||
< i32 0, i32 1, i32 2, i32 3> */
|
||||
llvm::Value *ProgramIndexVector(bool is32bits = true);
|
||||
llvm::Value *ProgramIndexVectorPTX(bool is32bits = true);
|
||||
|
||||
/** Given a string, create an anonymous global variable to hold its
|
||||
value and return the pointer to the string. */
|
||||
|
||||
22
decl.cpp
22
decl.cpp
@@ -168,6 +168,13 @@ DeclSpecs::GetBaseType(SourcePos pos) const {
|
||||
retType = lApplyTypeQualifiers(typeQualifiers, retType, pos);
|
||||
|
||||
if (soaWidth > 0) {
|
||||
#if 0 /* see stmt.cpp in DeclStmt::EmitCode for work-around of SOAType Declaration */
|
||||
if (g->target->getISA() == Target::NVPTX)
|
||||
{
|
||||
Error(pos, "\"soa\" data types are currently not supported with \"nvptx\" target.");
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
const StructType *st = CastType<StructType>(retType);
|
||||
|
||||
if (st == NULL) {
|
||||
@@ -402,6 +409,13 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
|
||||
return;
|
||||
}
|
||||
|
||||
#if 0 /* NVPTX */
|
||||
if (baseType->IsUniformType())
|
||||
{
|
||||
fprintf(stderr, " detected uniform array of size= %d array= %s\n" ,arraySize,
|
||||
baseType->IsArrayType() ? " true " : " false ");
|
||||
}
|
||||
#endif
|
||||
const Type *arrayType = new ArrayType(baseType, arraySize);
|
||||
if (child != NULL) {
|
||||
child->InitFromType(arrayType, ds);
|
||||
@@ -530,9 +544,9 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
|
||||
|
||||
returnType = returnType->ResolveUnboundVariability(Variability::Varying);
|
||||
|
||||
bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
|
||||
bool isExternC = ds && (ds->storageClass == SC_EXTERN_C);
|
||||
bool isExported = ds && ((ds->typeQualifiers & TYPEQUAL_EXPORT) != 0);
|
||||
bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
|
||||
bool isUnmasked = ds && ((ds->typeQualifiers & TYPEQUAL_UNMASKED) != 0);
|
||||
|
||||
if (isExported && isTask) {
|
||||
@@ -541,9 +555,9 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
|
||||
return;
|
||||
}
|
||||
if (isExternC && isTask) {
|
||||
Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" "
|
||||
"qualifiers");
|
||||
return;
|
||||
Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" "
|
||||
"qualifiers");
|
||||
return;
|
||||
}
|
||||
if (isExternC && isExported) {
|
||||
Error(pos, "Function can't have both \"extern \"C\"\" and \"export\" "
|
||||
|
||||
11
expr.cpp
11
expr.cpp
@@ -7867,6 +7867,12 @@ SizeOfExpr::TypeCheck() {
|
||||
"struct type \"%s\".", type->GetString().c_str());
|
||||
return NULL;
|
||||
}
|
||||
if (type != NULL)
|
||||
if (g->target->getISA() == Target::NVPTX && type->IsVaryingType())
|
||||
{
|
||||
Error(pos, "\"sizeof\" with varying data types is not yet supported with \"nvptx\" target.");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
@@ -8661,6 +8667,11 @@ NewExpr::TypeCheck() {
|
||||
AssertPos(pos, m->errorCount > 0);
|
||||
return NULL;
|
||||
}
|
||||
if (g->target->getISA() == Target::NVPTX && allocType->IsVaryingType())
|
||||
{
|
||||
Error(pos, "\"new\" with varying data types is not yet supported with \"nvptx\" target.");
|
||||
return NULL;
|
||||
}
|
||||
if (CastType<UndefinedStructType>(allocType) != NULL) {
|
||||
Error(pos, "Can't dynamically allocate storage for declared "
|
||||
"but not defined type \"%s\".", allocType->GetString().c_str());
|
||||
|
||||
41
func.cpp
41
func.cpp
@@ -47,6 +47,7 @@
|
||||
#include <stdio.h>
|
||||
|
||||
#if defined(LLVM_3_1) || defined(LLVM_3_2)
|
||||
#include <llvm/Metadata.h>
|
||||
#include <llvm/LLVMContext.h>
|
||||
#include <llvm/Module.h>
|
||||
#include <llvm/Type.h>
|
||||
@@ -54,6 +55,7 @@
|
||||
#include <llvm/Intrinsics.h>
|
||||
#include <llvm/DerivedTypes.h>
|
||||
#else
|
||||
#include <llvm/IR/Metadata.h>
|
||||
#include <llvm/IR/LLVMContext.h>
|
||||
#include <llvm/IR/Module.h>
|
||||
#include <llvm/IR/Type.h>
|
||||
@@ -128,7 +130,7 @@ Function::Function(Symbol *s, Stmt *c) {
|
||||
sym->parentFunction = this;
|
||||
}
|
||||
|
||||
if (type->isTask) {
|
||||
if (type->isTask && g->target->getISA() != Target::NVPTX) {
|
||||
threadIndexSym = m->symbolTable->LookupVariable("threadIndex");
|
||||
Assert(threadIndexSym);
|
||||
threadCountSym = m->symbolTable->LookupVariable("threadCount");
|
||||
@@ -239,7 +241,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
|
||||
#endif
|
||||
const FunctionType *type = CastType<FunctionType>(sym->type);
|
||||
Assert(type != NULL);
|
||||
if (type->isTask == true) {
|
||||
if (type->isTask == true && g->target->getISA() != Target::NVPTX) {
|
||||
// For tasks, there should always be three parameters: the
|
||||
// pointer to the structure that holds all of the arguments, the
|
||||
// thread index, and the thread count variables.
|
||||
@@ -337,6 +339,16 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
|
||||
ctx->SetFunctionMask(argIter);
|
||||
Assert(++argIter == function->arg_end());
|
||||
}
|
||||
if (type->isTask == true && g->target->getISA() == Target::NVPTX)
|
||||
{
|
||||
llvm::NamedMDNode* annotations =
|
||||
m->module->getOrInsertNamedMetadata("nvvm.annotations");
|
||||
llvm::SmallVector<llvm::Value*, 3> av;
|
||||
av.push_back(function);
|
||||
av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
|
||||
av.push_back(LLVMInt32(1));
|
||||
annotations->addOperand(llvm::MDNode::get(*g->ctx, av));
|
||||
}
|
||||
}
|
||||
|
||||
// Finally, we can generate code for the function
|
||||
@@ -497,8 +509,23 @@ Function::GenerateIR() {
|
||||
llvm::FunctionType *ftype = type->LLVMFunctionType(g->ctx, true);
|
||||
llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::ExternalLinkage;
|
||||
std::string functionName = sym->name;
|
||||
|
||||
if (g->mangleFunctionsWithTarget)
|
||||
functionName += std::string("_") + g->target->GetISAString();
|
||||
|
||||
if (g->target->getISA() == Target::NVPTX)
|
||||
{
|
||||
functionName += std::string("___export"); /* add ___export to the end, for ptxcc to recognize it is exported */
|
||||
#if 0
|
||||
llvm::NamedMDNode* annotations =
|
||||
m->module->getOrInsertNamedMetadata("nvvm.annotations");
|
||||
llvm::SmallVector<llvm::Value*, 3> av;
|
||||
av.push_back(function);
|
||||
av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
|
||||
av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1));
|
||||
annotations->addOperand(llvm::MDNode::get(*g->ctx, av));
|
||||
#endif
|
||||
}
|
||||
llvm::Function *appFunction =
|
||||
llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module);
|
||||
#if defined(LLVM_3_1)
|
||||
@@ -538,6 +565,16 @@ Function::GenerateIR() {
|
||||
FATAL("Function verificication failed");
|
||||
}
|
||||
}
|
||||
if (g->target->getISA() == Target::NVPTX)
|
||||
{
|
||||
llvm::NamedMDNode* annotations =
|
||||
m->module->getOrInsertNamedMetadata("nvvm.annotations");
|
||||
llvm::SmallVector<llvm::Value*, 3> av;
|
||||
av.push_back(appFunction);
|
||||
av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
|
||||
av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1));
|
||||
annotations->addOperand(llvm::MDNode::get(*g->ctx, av));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
27
ispc.cpp
27
ispc.cpp
@@ -280,6 +280,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
||||
arch = "arm";
|
||||
else
|
||||
#endif
|
||||
if(!strncmp(isa, "nvptx", 5))
|
||||
arch = "nvptx64";
|
||||
else
|
||||
arch = "x86-64";
|
||||
}
|
||||
|
||||
@@ -707,6 +710,19 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
||||
this->m_maskBitCount = 32;
|
||||
}
|
||||
#endif
|
||||
else if (!strcasecmp(isa, "nvptx"))
|
||||
{
|
||||
this->m_isa = Target::NVPTX;
|
||||
this->m_cpu = "sm_35";
|
||||
this->m_nativeVectorWidth = 32;
|
||||
this->m_nativeVectorAlignment = 32;
|
||||
this->m_vectorWidth = 1;
|
||||
this->m_hasHalf = true;
|
||||
this->m_maskingIsFree = true;
|
||||
this->m_maskBitCount = 1;
|
||||
this->m_hasTranscendentals = false;
|
||||
this->m_hasGather = this->m_hasScatter = false;
|
||||
}
|
||||
else {
|
||||
Error(SourcePos(), "Target \"%s\" is unknown. Choices are: %s.",
|
||||
isa, SupportedTargets());
|
||||
@@ -784,7 +800,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
||||
// Initialize target-specific "target-feature" attribute.
|
||||
if (!m_attributes.empty()) {
|
||||
llvm::AttrBuilder attrBuilder;
|
||||
attrBuilder.addAttribute("target-cpu", this->m_cpu);
|
||||
if (m_isa != Target::NVPTX)
|
||||
attrBuilder.addAttribute("target-cpu", this->m_cpu);
|
||||
attrBuilder.addAttribute("target-features", this->m_attributes);
|
||||
this->m_tf_attributes = new llvm::AttributeSet(
|
||||
llvm::AttributeSet::get(
|
||||
@@ -839,7 +856,7 @@ Target::SupportedTargets() {
|
||||
"avx1.1-i32x8, avx1.1-i32x16, avx1.1-i64x4 "
|
||||
"avx2-i32x8, avx2-i32x16, avx2-i64x4, "
|
||||
"generic-x1, generic-x4, generic-x8, generic-x16, "
|
||||
"generic-x32, generic-x64";
|
||||
"generic-x32, generic-x64, nvptx";
|
||||
}
|
||||
|
||||
|
||||
@@ -866,6 +883,8 @@ Target::GetTripleString() const {
|
||||
triple.setArchName("i386");
|
||||
else if (m_arch == "x86-64")
|
||||
triple.setArchName("x86_64");
|
||||
else if (m_arch == "nvptx64")
|
||||
triple = llvm::Triple("nvptx64", "nvidia", "cuda");
|
||||
else
|
||||
triple.setArchName(m_arch);
|
||||
}
|
||||
@@ -898,6 +917,8 @@ Target::ISAToString(ISA isa) {
|
||||
return "avx2";
|
||||
case Target::GENERIC:
|
||||
return "generic";
|
||||
case Target::NVPTX:
|
||||
return "nvptx";
|
||||
default:
|
||||
FATAL("Unhandled target in ISAToString()");
|
||||
}
|
||||
@@ -936,6 +957,8 @@ Target::ISAToTargetString(ISA isa) {
|
||||
return "avx2-i32x8";
|
||||
case Target::GENERIC:
|
||||
return "generic-4";
|
||||
case Target::NVPTX:
|
||||
return "nvptx";
|
||||
default:
|
||||
FATAL("Unhandled target in ISAToTargetString()");
|
||||
}
|
||||
|
||||
3
ispc.h
3
ispc.h
@@ -179,7 +179,7 @@ public:
|
||||
flexible/performant of them will apear last in the enumerant. Note
|
||||
also that __best_available_isa() needs to be updated if ISAs are
|
||||
added or the enumerant values are reordered. */
|
||||
enum ISA {
|
||||
enum ISA { NVPTX,
|
||||
#ifdef ISPC_ARM_ENABLED
|
||||
NEON32, NEON16, NEON8,
|
||||
#endif
|
||||
@@ -606,6 +606,7 @@ struct Globals {
|
||||
/** Indicates that alignment in memory allocation routines should be
|
||||
forced to have given value. -1 value means natural alignment for the platforms. */
|
||||
int forceAlignment;
|
||||
std::string PtxString;
|
||||
};
|
||||
|
||||
enum {
|
||||
|
||||
5
main.cpp
5
main.cpp
@@ -320,6 +320,11 @@ int main(int Argc, char *Argv[]) {
|
||||
LLVMInitializeARMTargetMC();
|
||||
#endif
|
||||
|
||||
LLVMInitializeNVPTXTargetInfo();
|
||||
LLVMInitializeNVPTXTarget();
|
||||
LLVMInitializeNVPTXAsmPrinter();
|
||||
LLVMInitializeNVPTXTargetMC();
|
||||
|
||||
char *file = NULL;
|
||||
const char *headerFileName = NULL;
|
||||
const char *outFileName = NULL;
|
||||
|
||||
143
module.cpp
143
module.cpp
@@ -444,6 +444,38 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE
|
||||
return;
|
||||
}
|
||||
|
||||
if (g->target->getISA() == Target::NVPTX &&
|
||||
#if 0
|
||||
!type->IsConstType() &&
|
||||
#endif
|
||||
#if 1
|
||||
at != NULL &&
|
||||
#endif
|
||||
type->IsVaryingType())
|
||||
{
|
||||
Error(pos, "Global \"varying\" variables are not yet supported in \"nvptx\" target.");
|
||||
return;
|
||||
#if 0
|
||||
int nel = 32; /* warp-size */
|
||||
if (type->IsArrayType())
|
||||
{
|
||||
const ArrayType *at = CastType<ArrayType>(type);
|
||||
/* we must scale # elements by 4, because a thread-block will run 4 warps
|
||||
* or 128 threads.
|
||||
* ***note-to-me***:please define these value (128threads/4warps)
|
||||
* in nvptx-target definition
|
||||
* instead of compile-time constants
|
||||
*/
|
||||
nel *= at->GetElementCount();
|
||||
assert (!type->IsSOAType());
|
||||
type = new ArrayType(at->GetElementType()->GetAsUniformType(), nel);
|
||||
}
|
||||
else
|
||||
type = new ArrayType(type->GetAsUniformType(), nel);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
llvm::Type *llvmType = type->LLVMType(g->ctx);
|
||||
if (llvmType == NULL)
|
||||
return;
|
||||
@@ -643,6 +675,21 @@ lCheckExportedParameterTypes(const Type *type, const std::string &name,
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
lCheckTaskParameterTypes(const Type *type, const std::string &name,
|
||||
SourcePos pos) {
|
||||
if (g->target->getISA() != Target::NVPTX)
|
||||
return;
|
||||
if (lRecursiveCheckValidParamType(type, false) == false) {
|
||||
if (CastType<VectorType>(type))
|
||||
Error(pos, "Vector-typed parameter \"%s\" is illegal in a task "
|
||||
"function with \"nvptx\" target.", name.c_str());
|
||||
else
|
||||
Error(pos, "Varying parameter \"%s\" is illegal in a task function with \"nvptx\" target.",
|
||||
name.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** Given a function type, loop through the function parameters and see if
|
||||
any are StructTypes. If so, issue an error; this is currently broken
|
||||
@@ -801,7 +848,8 @@ Module::AddFunctionDeclaration(const std::string &name,
|
||||
#else // LLVM 3.1 and 3.3+
|
||||
function->addFnAttr(llvm::Attribute::AlwaysInline);
|
||||
#endif
|
||||
if (functionType->isTask)
|
||||
/* evghenii: fails function verification when "if" executed in nvptx target */
|
||||
if (functionType->isTask && g->target->getISA() != Target::NVPTX)
|
||||
// This also applies transitively to members I think?
|
||||
#if defined(LLVM_3_1)
|
||||
function->setDoesNotAlias(1, true);
|
||||
@@ -822,6 +870,13 @@ Module::AddFunctionDeclaration(const std::string &name,
|
||||
Type::Equal(functionType->GetReturnType(), AtomicType::Void) == false)
|
||||
Error(pos, "Task-qualified functions must have void return type.");
|
||||
|
||||
if (g->target->getISA() == Target::NVPTX &&
|
||||
Type::Equal(functionType->GetReturnType(), AtomicType::Void) == false &&
|
||||
functionType->isExported)
|
||||
{
|
||||
Error(pos, "Export-qualified functions must have void return type with \"nvptx\" target.");
|
||||
}
|
||||
|
||||
if (functionType->isExported || functionType->isExternC)
|
||||
lCheckForStructParameters(functionType, pos);
|
||||
|
||||
@@ -841,6 +896,9 @@ Module::AddFunctionDeclaration(const std::string &name,
|
||||
if (functionType->isExported) {
|
||||
lCheckExportedParameterTypes(argType, argName, argPos);
|
||||
}
|
||||
if (functionType->isTask) {
|
||||
lCheckTaskParameterTypes(argType, argName, argPos);
|
||||
}
|
||||
|
||||
// ISPC assumes that no pointers alias. (It should be possible to
|
||||
// specify when this is not the case, but this should be the
|
||||
@@ -959,7 +1017,13 @@ Module::writeOutput(OutputType outputType, const char *outFileName,
|
||||
const char *fileType = NULL;
|
||||
switch (outputType) {
|
||||
case Asm:
|
||||
if (strcasecmp(suffix, "s"))
|
||||
if (g->target->getISA() != Target::NVPTX)
|
||||
{
|
||||
if (strcasecmp(suffix, "s"))
|
||||
fileType = "assembly";
|
||||
}
|
||||
else
|
||||
if (strcasecmp(suffix, "ptx"))
|
||||
fileType = "assembly";
|
||||
break;
|
||||
case Bitcode:
|
||||
@@ -1057,6 +1121,11 @@ Module::writeBitcode(llvm::Module *module, const char *outFileName) {
|
||||
}
|
||||
|
||||
llvm::raw_fd_ostream fos(fd, (fd != 1), false);
|
||||
if (g->target->getISA() == Target::NVPTX)
|
||||
{
|
||||
const std::string dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64";
|
||||
module->setDataLayout(dl_string);
|
||||
}
|
||||
llvm::WriteBitcodeToFile(module, fos);
|
||||
return true;
|
||||
}
|
||||
@@ -2095,6 +2164,24 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre
|
||||
opts.addMacroDef(g->cppArgs[i].substr(2));
|
||||
}
|
||||
}
|
||||
if (g->target->getISA() == Target::NVPTX)
|
||||
{
|
||||
opts.addMacroDef("__NVPTX__");
|
||||
opts.addMacroDef("programIndex=__programIndex()");
|
||||
opts.addMacroDef("cif=if");
|
||||
opts.addMacroDef("cfor=for");
|
||||
opts.addMacroDef("cwhile=while");
|
||||
opts.addMacroDef("ccontinue=continue");
|
||||
opts.addMacroDef("cdo=do");
|
||||
opts.addMacroDef("taskIndex0=__taskIndex0()");
|
||||
opts.addMacroDef("taskIndex1=__taskIndex1()");
|
||||
opts.addMacroDef("taskIndex2=__taskIndex2()");
|
||||
opts.addMacroDef("taskIndex=__taskIndex()");
|
||||
opts.addMacroDef("taskCount0=__taskCount0()");
|
||||
opts.addMacroDef("taskCount1=__taskCount1()");
|
||||
opts.addMacroDef("taskCount2=__taskCount2()");
|
||||
opts.addMacroDef("taskCount=__taskCount()");
|
||||
}
|
||||
|
||||
#if defined(LLVM_3_1)
|
||||
inst.getLangOpts().BCPLComment = 1;
|
||||
@@ -2540,6 +2627,29 @@ lCreateDispatchModule(std::map<std::string, FunctionTargetVariants> &functions)
|
||||
return module;
|
||||
}
|
||||
|
||||
static std::string lCBEMangle(const std::string &S) {
|
||||
std::string Result;
|
||||
|
||||
for (unsigned i = 0, e = S.size(); i != e; ++i) {
|
||||
if (i+1 != e && ((S[i] == '>' && S[i+1] == '>') ||
|
||||
(S[i] == '<' && S[i+1] == '<'))) {
|
||||
Result += '_';
|
||||
Result += 'A'+(S[i]&15);
|
||||
Result += 'A'+((S[i]>>4)&15);
|
||||
Result += '_';
|
||||
i++;
|
||||
} else if (isalnum(S[i]) || S[i] == '_' || S[i] == '<' || S[i] == '>') {
|
||||
Result += S[i];
|
||||
} else {
|
||||
Result += '_';
|
||||
Result += 'A'+(S[i]&15);
|
||||
Result += 'A'+((S[i]>>4)&15);
|
||||
Result += '_';
|
||||
}
|
||||
}
|
||||
return Result;
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
Module::CompileAndOutput(const char *srcFile,
|
||||
@@ -2555,7 +2665,7 @@ Module::CompileAndOutput(const char *srcFile,
|
||||
const char *hostStubFileName,
|
||||
const char *devStubFileName)
|
||||
{
|
||||
if (target == NULL || strchr(target, ',') == NULL) {
|
||||
if (target == NULL || strchr(target, ',') == NULL) {
|
||||
// We're only compiling to a single target
|
||||
g->target = new Target(arch, cpu, target, generatePIC);
|
||||
if (!g->target->isValid())
|
||||
@@ -2563,6 +2673,32 @@ Module::CompileAndOutput(const char *srcFile,
|
||||
|
||||
m = new Module(srcFile);
|
||||
if (m->CompileFile() == 0) {
|
||||
|
||||
/* NVPTX:
|
||||
* for PTX target replace '.' with '_' in all global variables
|
||||
* a PTX identifier name must match [a-zA-Z$_][a-zA-Z$_0-9]*
|
||||
*/
|
||||
if (g->target->getISA() == Target::NVPTX)
|
||||
{
|
||||
/* mangle global variables names */
|
||||
{
|
||||
llvm::Module::global_iterator I = m->module->global_begin(), E = m->module->global_end();
|
||||
for (; I != E; I++)
|
||||
I->setName(lCBEMangle(I->getName()));
|
||||
}
|
||||
|
||||
/* mangle functions names */
|
||||
{
|
||||
llvm::Module::iterator I = m->module->begin(), E = m->module->end();
|
||||
for (; I != E; I++)
|
||||
{
|
||||
std::string str = I->getName();
|
||||
if (str.find("operator") != std::string::npos)
|
||||
I->setName(lCBEMangle(str));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (outputType == CXX) {
|
||||
if (target == NULL || strncmp(target, "generic-", 8) != 0) {
|
||||
Error(SourcePos(), "When generating C++ output, one of the \"generic-*\" "
|
||||
@@ -2765,4 +2901,5 @@ Module::CompileAndOutput(const char *srcFile,
|
||||
|
||||
return errorCount > 0;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
20
nvptxcc
Executable file
20
nvptxcc
Executable file
@@ -0,0 +1,20 @@
|
||||
#!/bin/sh
|
||||
|
||||
PATH=$ISPC_HOME/examples_ptx/ptxcc:$ISPC_HOME/examples_ptx/ptxgen:$PATH
|
||||
PTXCC=ptxcc
|
||||
ARGS=${@:2}
|
||||
if [ "$NVVM" == "1" ];
|
||||
then
|
||||
LLVM32=$HOME/usr/local/llvm/bin-3.2
|
||||
LLVMDIS=$LLVM32/bin/llvm-dis
|
||||
PTXGEN=$ISPC_HOME/examples_ptx/ptxgen/ptxgen
|
||||
$($LLVMDIS $1 -o $1.ll) && $($PTXGEN $1.ll > $1.ptx) && \
|
||||
$($PTXCC $1.ptx -o $1.o -Xnvcc="-G") && \
|
||||
$(nvcc test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $1.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt $ARGS);
|
||||
else
|
||||
$($PTXCC $1 -o $1.o -Xnvcc="-G") && \
|
||||
$(nvcc test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $1.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt $ARGS);
|
||||
fi
|
||||
|
||||
|
||||
|
||||
178
opt.cpp
178
opt.cpp
@@ -133,6 +133,7 @@ static llvm::Pass *CreateDebugPass(char * output);
|
||||
static llvm::Pass *CreateReplaceStdlibShiftPass();
|
||||
|
||||
static llvm::Pass *CreateFixBooleanSelectPass();
|
||||
static llvm::Pass *CreatePromoteLocalToPrivatePass();
|
||||
|
||||
#define DEBUG_START_PASS(NAME) \
|
||||
if (g->debugPrint && \
|
||||
@@ -496,7 +497,11 @@ Optimize(llvm::Module *module, int optLevel) {
|
||||
// run absolutely no optimizations, since the front-end needs us to
|
||||
// take the various __pseudo_* functions it has emitted and turn
|
||||
// them into something that can actually execute.
|
||||
optPM.add(CreateImproveMemoryOpsPass(), 100);
|
||||
|
||||
if (g->opt.disableGatherScatterOptimizations == false &&
|
||||
g->target->getVectorWidth() > 1)
|
||||
optPM.add(CreateImproveMemoryOpsPass(), 100);
|
||||
|
||||
if (g->opt.disableHandlePseudoMemoryOps == false)
|
||||
optPM.add(CreateReplacePseudoMemoryOpsPass());
|
||||
|
||||
@@ -519,6 +524,8 @@ Optimize(llvm::Module *module, int optLevel) {
|
||||
llvm::initializeInstrumentation(*registry);
|
||||
llvm::initializeTarget(*registry);
|
||||
|
||||
if (g->target->getISA() == Target::NVPTX)
|
||||
optPM.add(CreatePromoteLocalToPrivatePass());
|
||||
optPM.add(llvm::createGlobalDCEPass(), 185);
|
||||
|
||||
// Setup to use LLVM default AliasAnalysis
|
||||
@@ -577,7 +584,10 @@ Optimize(llvm::Module *module, int optLevel) {
|
||||
optPM.add(llvm::createGlobalOptimizerPass());
|
||||
optPM.add(llvm::createReassociatePass());
|
||||
optPM.add(llvm::createIPConstantPropagationPass());
|
||||
optPM.add(CreateReplaceStdlibShiftPass(),229);
|
||||
|
||||
if (g->target->getISA() != Target::NVPTX)
|
||||
optPM.add(CreateReplaceStdlibShiftPass(),229);
|
||||
|
||||
optPM.add(llvm::createDeadArgEliminationPass(),230);
|
||||
optPM.add(llvm::createInstructionCombiningPass());
|
||||
optPM.add(llvm::createCFGSimplificationPass());
|
||||
@@ -689,6 +699,111 @@ Optimize(llvm::Module *module, int optLevel) {
|
||||
|
||||
// Should be the last
|
||||
optPM.add(CreateFixBooleanSelectPass(), 400);
|
||||
|
||||
if (g->target->getISA() == Target::NVPTX)
|
||||
{
|
||||
optPM.add(llvm::createGlobalDCEPass());
|
||||
|
||||
optPM.add(llvm::createTypeBasedAliasAnalysisPass());
|
||||
optPM.add(llvm::createBasicAliasAnalysisPass());
|
||||
optPM.add(llvm::createCFGSimplificationPass());
|
||||
// Here clang has an experimental pass SROAPass instead of
|
||||
// ScalarReplAggregatesPass. We should add it in the future.
|
||||
optPM.add(llvm::createScalarReplAggregatesPass());
|
||||
optPM.add(llvm::createEarlyCSEPass());
|
||||
optPM.add(llvm::createLowerExpectIntrinsicPass());
|
||||
optPM.add(llvm::createTypeBasedAliasAnalysisPass());
|
||||
optPM.add(llvm::createBasicAliasAnalysisPass());
|
||||
|
||||
// Early optimizations to try to reduce the total amount of code to
|
||||
// work with if we can
|
||||
optPM.add(llvm::createReassociatePass());
|
||||
optPM.add(llvm::createConstantPropagationPass());
|
||||
optPM.add(llvm::createDeadInstEliminationPass());
|
||||
optPM.add(llvm::createCFGSimplificationPass());
|
||||
|
||||
optPM.add(llvm::createPromoteMemoryToRegisterPass());
|
||||
optPM.add(llvm::createAggressiveDCEPass());
|
||||
|
||||
|
||||
optPM.add(llvm::createInstructionCombiningPass());
|
||||
optPM.add(llvm::createDeadInstEliminationPass());
|
||||
|
||||
// On to more serious optimizations
|
||||
optPM.add(llvm::createInstructionCombiningPass());
|
||||
optPM.add(llvm::createCFGSimplificationPass());
|
||||
optPM.add(llvm::createPromoteMemoryToRegisterPass());
|
||||
optPM.add(llvm::createGlobalOptimizerPass());
|
||||
optPM.add(llvm::createReassociatePass());
|
||||
optPM.add(llvm::createIPConstantPropagationPass());
|
||||
|
||||
optPM.add(llvm::createDeadArgEliminationPass());
|
||||
optPM.add(llvm::createInstructionCombiningPass());
|
||||
optPM.add(llvm::createCFGSimplificationPass());
|
||||
optPM.add(llvm::createPruneEHPass());
|
||||
optPM.add(llvm::createFunctionAttrsPass());
|
||||
optPM.add(llvm::createFunctionInliningPass());
|
||||
optPM.add(llvm::createConstantPropagationPass());
|
||||
optPM.add(llvm::createDeadInstEliminationPass());
|
||||
optPM.add(llvm::createCFGSimplificationPass());
|
||||
|
||||
optPM.add(llvm::createArgumentPromotionPass());
|
||||
#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3)
|
||||
// Starting from 3.4 this functionality was moved to
|
||||
// InstructionCombiningPass. See r184459 for details.
|
||||
optPM.add(llvm::createSimplifyLibCallsPass());
|
||||
#endif
|
||||
optPM.add(llvm::createAggressiveDCEPass());
|
||||
optPM.add(llvm::createInstructionCombiningPass());
|
||||
optPM.add(llvm::createJumpThreadingPass());
|
||||
optPM.add(llvm::createCFGSimplificationPass());
|
||||
optPM.add(llvm::createInstructionCombiningPass());
|
||||
optPM.add(llvm::createTailCallEliminationPass());
|
||||
|
||||
optPM.add(llvm::createInstructionCombiningPass());
|
||||
|
||||
optPM.add(llvm::createFunctionInliningPass());
|
||||
optPM.add(llvm::createConstantPropagationPass());
|
||||
|
||||
optPM.add(llvm::createInstructionCombiningPass());
|
||||
|
||||
optPM.add(llvm::createIPSCCPPass());
|
||||
optPM.add(llvm::createDeadArgEliminationPass());
|
||||
optPM.add(llvm::createAggressiveDCEPass());
|
||||
optPM.add(llvm::createInstructionCombiningPass());
|
||||
optPM.add(llvm::createCFGSimplificationPass());
|
||||
|
||||
optPM.add(llvm::createFunctionInliningPass());
|
||||
optPM.add(llvm::createArgumentPromotionPass());
|
||||
optPM.add(llvm::createInstructionCombiningPass());
|
||||
optPM.add(llvm::createCFGSimplificationPass());
|
||||
optPM.add(llvm::createReassociatePass());
|
||||
optPM.add(llvm::createLoopRotatePass());
|
||||
optPM.add(llvm::createLICMPass());
|
||||
// optPM.add(llvm::createLoopUnswitchPass(false));
|
||||
#if 1
|
||||
optPM.add(llvm::createInstructionCombiningPass());
|
||||
optPM.add(llvm::createIndVarSimplifyPass());
|
||||
optPM.add(llvm::createLoopIdiomPass());
|
||||
optPM.add(llvm::createLoopDeletionPass());
|
||||
optPM.add(llvm::createLoopUnrollPass());
|
||||
optPM.add(llvm::createGVNPass());
|
||||
optPM.add(llvm::createMemCpyOptPass());
|
||||
optPM.add(llvm::createSCCPPass());
|
||||
optPM.add(llvm::createInstructionCombiningPass());
|
||||
optPM.add(llvm::createJumpThreadingPass());
|
||||
optPM.add(llvm::createCorrelatedValuePropagationPass());
|
||||
optPM.add(llvm::createDeadStoreEliminationPass());
|
||||
optPM.add(llvm::createAggressiveDCEPass());
|
||||
optPM.add(llvm::createCFGSimplificationPass());
|
||||
optPM.add(llvm::createInstructionCombiningPass());
|
||||
optPM.add(llvm::createFunctionInliningPass());
|
||||
optPM.add(llvm::createAggressiveDCEPass());
|
||||
optPM.add(llvm::createStripDeadPrototypesPass());
|
||||
optPM.add(llvm::createGlobalDCEPass());
|
||||
optPM.add(llvm::createConstantMergePass());
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
// Finish up by making sure we didn't mess anything up in the IR along
|
||||
@@ -5267,4 +5382,63 @@ CreateFixBooleanSelectPass() {
|
||||
return new FixBooleanSelectPass();
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Detect addrspace(3)
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class PromoteLocalToPrivatePass: public llvm::BasicBlockPass
|
||||
{
|
||||
public:
|
||||
static char ID; // Pass identification, replacement for typeid
|
||||
PromoteLocalToPrivatePass() : BasicBlockPass(ID) {}
|
||||
|
||||
bool runOnBasicBlock(llvm::BasicBlock &BB);
|
||||
};
|
||||
|
||||
char PromoteLocalToPrivatePass::ID = 0;
|
||||
|
||||
bool
|
||||
PromoteLocalToPrivatePass::runOnBasicBlock(llvm::BasicBlock &BB)
|
||||
{
|
||||
std::vector<llvm::AllocaInst*> Allocas;
|
||||
|
||||
bool modifiedAny = false;
|
||||
|
||||
llvm::Function *cvtFunc = m->module->getFunction("__cvt_loc2gen_var");
|
||||
|
||||
// Find allocas that are safe to promote, by looking at all instructions in
|
||||
// the entry node
|
||||
for (llvm::BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
|
||||
{
|
||||
llvm::Instruction *inst = &*I;
|
||||
if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst))
|
||||
{
|
||||
llvm::Function *func = ci->getCalledFunction();
|
||||
if (cvtFunc && (cvtFunc == func))
|
||||
{
|
||||
#if 0
|
||||
fprintf(stderr , "--found cvt-- name= %s \n",
|
||||
I->getName().str().c_str());
|
||||
#endif
|
||||
llvm::AllocaInst *alloca = new llvm::AllocaInst(LLVMTypes::Int64Type, "opt_loc2var", ci);
|
||||
assert(alloca != NULL);
|
||||
#if 0
|
||||
const int align = 8; // g->target->getNativeVectorAlignment();
|
||||
alloca->setAlignment(align);
|
||||
#endif
|
||||
ci->replaceAllUsesWith(alloca);
|
||||
modifiedAny = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return modifiedAny;
|
||||
}
|
||||
|
||||
static llvm::Pass *
|
||||
CreatePromoteLocalToPrivatePass() {
|
||||
return new PromoteLocalToPrivatePass();
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
14
ptxtestcc.sh
Executable file
14
ptxtestcc.sh
Executable file
@@ -0,0 +1,14 @@
|
||||
#!/bin/sh
|
||||
LLC=$HOME/usr/local/llvm/bin-trunk/bin/llc
|
||||
DIS=$HOME/usr/local/llvm/bin-3.2/bin/llvm-dis
|
||||
|
||||
ISPC=ispc
|
||||
PTXCC=ptxcc
|
||||
PTXGEN=~/ptxgen
|
||||
$(cat $1 |grep -v 'width'|$ISPC --target=nvptx --emit-llvm -o -|$LLC -march=nvptx64 -mcpu=sm_35 -o $1.ptx) && \
|
||||
#$(cat $1 |grep -v 'width'|$ISPC --target=nvptx --emit-llvm -o -|$DIS -o $1_32_ptx.ll && $PTXGEN $1_32_ptx.ll > $1.ptx) && \
|
||||
$($PTXCC $1.ptx -Xptxas=-v -o $1.ptx.o) && \
|
||||
nvcc -o test_nvptx test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $1.ptx.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt -DTEST_SIG=$2
|
||||
|
||||
|
||||
|
||||
37
run_tests.py
37
run_tests.py
@@ -204,6 +204,8 @@ def run_test(testname):
|
||||
return (1, 0)
|
||||
else:
|
||||
global is_generic_target
|
||||
global is_nvptx_target
|
||||
global is_nvptx_nvvm
|
||||
if is_windows:
|
||||
if is_generic_target:
|
||||
obj_name = "%s.cpp" % os.path.basename(filename)
|
||||
@@ -218,6 +220,13 @@ def run_test(testname):
|
||||
else:
|
||||
if is_generic_target:
|
||||
obj_name = "%s.cpp" % testname
|
||||
elif is_nvptx_target:
|
||||
if os.environ.get("NVVM") == "1":
|
||||
is_nvptx_nvvm = True
|
||||
obj_name = "%s.bc" % testname
|
||||
else:
|
||||
obj_name = "%s.ptx" % testname
|
||||
is_nvptx_nvvm = False
|
||||
else:
|
||||
obj_name = "%s.o" % testname
|
||||
exe_name = "%s.run" % testname
|
||||
@@ -248,13 +257,32 @@ def run_test(testname):
|
||||
cc_cmd += ' -Wl,-no_pie'
|
||||
if should_fail:
|
||||
cc_cmd += " -DEXPECT_FAILURE"
|
||||
if is_nvptx_target:
|
||||
nvptxcc_exe = "nvptxcc"
|
||||
nvptxcc_exe_rel = add_prefix(nvptxcc_exe)
|
||||
cc_cmd = "%s %s -DTEST_SIG=%d -o %s" % \
|
||||
(nvptxcc_exe_rel, obj_name, match, exe_name)
|
||||
|
||||
ispc_cmd = ispc_exe_rel + " --woff %s -o %s --arch=%s --target=%s" % \
|
||||
ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --arch=%s --target=%s" % \
|
||||
(filename, obj_name, options.arch, options.target)
|
||||
if options.no_opt:
|
||||
ispc_cmd += " -O0"
|
||||
if is_generic_target:
|
||||
ispc_cmd += " --emit-c++ --c++-include-file=%s" % add_prefix(options.include_file)
|
||||
if is_nvptx_target:
|
||||
filename4ptx = filename+".ptx.parsed_ispc"
|
||||
grep_cmd = "grep -v 'export uniform int width' %s > %s " % \
|
||||
(filename, filename4ptx)
|
||||
if options.verbose:
|
||||
print "Grepping: %s" % grep_cmd
|
||||
sp = subprocess.Popen(grep_cmd, shell=True)
|
||||
sp.communicate()
|
||||
if is_nvptx_nvvm:
|
||||
ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --emit-llvm --target=%s" % \
|
||||
(filename4ptx, obj_name, options.target)
|
||||
else:
|
||||
ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --emit-asm --target=%s" % \
|
||||
(filename4ptx, obj_name, options.target)
|
||||
|
||||
# compile the ispc code, make the executable, and run it...
|
||||
(compile_error, run_error) = run_cmds([ispc_cmd, cc_cmd],
|
||||
@@ -269,7 +297,7 @@ def run_test(testname):
|
||||
basename = os.path.basename(filename)
|
||||
os.unlink("%s.pdb" % basename)
|
||||
os.unlink("%s.ilk" % basename)
|
||||
os.unlink(obj_name)
|
||||
# os.unlink(obj_name)
|
||||
except:
|
||||
None
|
||||
|
||||
@@ -290,6 +318,7 @@ def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test
|
||||
ispc_exe = glob_var[3]
|
||||
global is_generic_target
|
||||
is_generic_target = glob_var[4]
|
||||
global is_nvptx_target
|
||||
global run_tests_log
|
||||
run_tests_log = glob_var[5]
|
||||
|
||||
@@ -505,6 +534,8 @@ def run_tests(options1, args, print_version):
|
||||
|
||||
if options.target == 'neon':
|
||||
options.arch = 'arm'
|
||||
if options.target == "nvptx":
|
||||
options.arch = "nvptx64"
|
||||
|
||||
# use relative path to not depend on host directory, which may possibly
|
||||
# have white spaces and unicode characters.
|
||||
@@ -531,8 +562,10 @@ def run_tests(options1, args, print_version):
|
||||
ispc_exe += " " + options.ispc_flags
|
||||
|
||||
global is_generic_target
|
||||
global is_nvptx_target
|
||||
is_generic_target = (options.target.find("generic-") != -1 and
|
||||
options.target != "generic-1" and options.target != "generic-x1")
|
||||
is_nvptx_target = (options.target.find("nvptx") != -1)
|
||||
if is_generic_target and options.include_file == None:
|
||||
if options.target == "generic-4" or options.target == "generic-x4":
|
||||
error("No generics #include specified; using examples/intrinsics/sse4.h\n", 2)
|
||||
|
||||
159
stdlib.ispc
159
stdlib.ispc
@@ -57,6 +57,31 @@
|
||||
#error Unknown value of ISPC_MASK_BITS
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// CUDA Specific primitives
|
||||
//
|
||||
/***************/
|
||||
|
||||
__declspec(safe,cost0) static inline varying int __programIndex() { return __program_index(); }
|
||||
__declspec(safe,cost0) static inline uniform int __programCount() { return __program_count(); }
|
||||
__declspec(safe,cost0) static inline uniform int __warpIndex() { return __warp_index(); }
|
||||
|
||||
/***************/
|
||||
|
||||
__declspec(safe,cost0) static inline uniform int __taskIndex0() { return __task_index0(); }
|
||||
__declspec(safe,cost0) static inline uniform int __taskIndex1() { return __task_index1(); }
|
||||
__declspec(safe,cost0) static inline uniform int __taskIndex2() { return __task_index2(); }
|
||||
__declspec(safe,cost0) static inline uniform int __taskIndex () { return __task_index (); }
|
||||
|
||||
/***************/
|
||||
|
||||
__declspec(safe,cost0) static inline uniform int __taskCount0() { return __task_count0(); }
|
||||
__declspec(safe,cost0) static inline uniform int __taskCount1() { return __task_count1(); }
|
||||
__declspec(safe,cost0) static inline uniform int __taskCount2() { return __task_count2(); }
|
||||
__declspec(safe,cost0) static inline uniform int __taskCount () { return __task_count (); }
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Low level primitives
|
||||
|
||||
@@ -464,7 +489,10 @@ __declspec(safe)
|
||||
static inline uniform int popcnt(bool v) {
|
||||
// As with any() and all(), only count across the active lanes
|
||||
#if (ISPC_MASK_BITS == 1)
|
||||
return __popcnt_int64(__movmsk(v & __mask));
|
||||
if (__is_nvptx_target)
|
||||
return __popcnt_int64(__movmsk_ptx(v & __mask));
|
||||
else
|
||||
return __popcnt_int64(__movmsk(v & __mask));
|
||||
#else
|
||||
return __popcnt_int64(__movmsk((UIntMaskType)__sext_varying_bool(v) & __mask));
|
||||
#endif
|
||||
@@ -1226,6 +1254,11 @@ packed_store_active(uniform int a[], int vals) {
|
||||
return __packed_store_active(a, vals, (IntMaskType)__mask);
|
||||
}
|
||||
|
||||
static inline uniform int
|
||||
packed_store_active(bool active, uniform int a[], int vals) {
|
||||
return __packed_store_active(a, vals, (IntMaskType)(-(int)active));
|
||||
}
|
||||
|
||||
static inline uniform int
|
||||
packed_store_active2(uniform int a[], int vals) {
|
||||
return __packed_store_active2(a, vals, (IntMaskType)__mask);
|
||||
@@ -1236,6 +1269,9 @@ packed_store_active2(uniform int a[], int vals) {
|
||||
// System information
|
||||
|
||||
static inline uniform int num_cores() {
|
||||
if (__is_nvptx_target)
|
||||
return 15*32; // K20/K20X/K40 - 15SMX x 32 warps/smx (max is 64 warps/smx)
|
||||
else
|
||||
return __num_cores();
|
||||
}
|
||||
|
||||
@@ -1783,7 +1819,7 @@ static inline void memory_barrier() {
|
||||
__memory_barrier();
|
||||
}
|
||||
|
||||
#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \
|
||||
#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \
|
||||
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
|
||||
TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
|
||||
return ret; \
|
||||
@@ -1794,6 +1830,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
|
||||
return ret; \
|
||||
} \
|
||||
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
|
||||
if (__is_nvptx_target) { \
|
||||
TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
|
||||
return ret; \
|
||||
} else { \
|
||||
uniform TA * uniform ptrArray[programCount]; \
|
||||
ptrArray[programIndex] = ptr; \
|
||||
TA ret; \
|
||||
@@ -1804,10 +1844,15 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
|
||||
ret = insert(ret, i, r); \
|
||||
} \
|
||||
return ret; \
|
||||
} \
|
||||
} \
|
||||
|
||||
#define DEFINE_ATOMIC_SWAP(TA,TB) \
|
||||
#define DEFINE_ATOMIC_SWAP(TA,TB,MASKTYPE,TC) \
|
||||
static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
|
||||
if (__is_nvptx_target) { \
|
||||
TA ret = __atomic_swap_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
|
||||
return ret; \
|
||||
} else { \
|
||||
uniform int i = 0; \
|
||||
TA ret[programCount]; \
|
||||
TA memVal; \
|
||||
@@ -1838,6 +1883,7 @@ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
|
||||
originally got back from memory... */ \
|
||||
ret[lastSwap] = memVal; \
|
||||
return ret[programIndex]; \
|
||||
}\
|
||||
} \
|
||||
static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \
|
||||
uniform TA value) { \
|
||||
@@ -1845,6 +1891,10 @@ static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \
|
||||
return ret; \
|
||||
} \
|
||||
static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
|
||||
if (__is_nvptx_target) { \
|
||||
TA ret = __atomic_swap_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
|
||||
return ret; \
|
||||
} else { \
|
||||
uniform TA * uniform ptrArray[programCount]; \
|
||||
ptrArray[programIndex] = ptr; \
|
||||
TA ret; \
|
||||
@@ -1855,9 +1905,10 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
|
||||
ret = insert(ret, i, r); \
|
||||
} \
|
||||
return ret; \
|
||||
}\
|
||||
} \
|
||||
|
||||
#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \
|
||||
#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \
|
||||
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
|
||||
uniform TA oneval = reduce_##OPA(value); \
|
||||
TA ret; \
|
||||
@@ -1872,6 +1923,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
|
||||
} \
|
||||
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
|
||||
TA value) { \
|
||||
if (__is_nvptx_target) { \
|
||||
TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
|
||||
return ret; \
|
||||
} else { \
|
||||
uniform TA * uniform ptrArray[programCount]; \
|
||||
ptrArray[programIndex] = ptr; \
|
||||
TA ret; \
|
||||
@@ -1882,57 +1937,58 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
|
||||
ret = insert(ret, i, r); \
|
||||
} \
|
||||
return ret; \
|
||||
} \
|
||||
}
|
||||
|
||||
DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType)
|
||||
DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
|
||||
DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType)
|
||||
DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType)
|
||||
DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType)
|
||||
DEFINE_ATOMIC_SWAP(int32,int32)
|
||||
DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType,int64)
|
||||
DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType,int64)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType,int64)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType,int64)
|
||||
DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType,int64)
|
||||
DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType,int64)
|
||||
DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType,int64)
|
||||
DEFINE_ATOMIC_SWAP(int32,int32,IntMaskType,int64)
|
||||
|
||||
// For everything but atomic min and max, we can use the same
|
||||
// implementations for unsigned as for signed.
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType)
|
||||
DEFINE_ATOMIC_SWAP(unsigned int32,int32)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType, unsigned int64)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType, unsigned int64)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType,unsigned int64)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType,unsigned int64)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType, unsigned int64)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType, unsigned int64)
|
||||
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType, unsigned int64)
|
||||
DEFINE_ATOMIC_SWAP(unsigned int32,int32,UIntMaskType, unsigned int64)
|
||||
|
||||
DEFINE_ATOMIC_SWAP(float,float)
|
||||
DEFINE_ATOMIC_SWAP(float,float,IntMaskType,int64)
|
||||
|
||||
DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType)
|
||||
DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
|
||||
DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType)
|
||||
DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType)
|
||||
DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType)
|
||||
DEFINE_ATOMIC_SWAP(int64,int64)
|
||||
DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType,int64)
|
||||
DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType,int64)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType,int64)
|
||||
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType,int64)
|
||||
DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType,int64)
|
||||
DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType,int64)
|
||||
DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType,int64)
|
||||
DEFINE_ATOMIC_SWAP(int64,int64,IntMaskType, int64)
|
||||
|
||||
// For everything but atomic min and max, we can use the same
|
||||
// implementations for unsigned as for signed.
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType)
|
||||
DEFINE_ATOMIC_SWAP(unsigned int64,int64)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType,unsigned int64)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType,unsigned int64)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType,unsigned int64)
|
||||
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType,unsigned int64)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType,unsigned int64)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType,unsigned int64)
|
||||
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType,unsigned int64)
|
||||
DEFINE_ATOMIC_SWAP(unsigned int64,int64,UIntMaskType, unsigned int64)
|
||||
|
||||
DEFINE_ATOMIC_SWAP(double,double)
|
||||
DEFINE_ATOMIC_SWAP(double,double,IntMaskType, int64)
|
||||
|
||||
#undef DEFINE_ATOMIC_OP
|
||||
#undef DEFINE_ATOMIC_MINMAX_OP
|
||||
#undef DEFINE_ATOMIC_SWAP
|
||||
|
||||
#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \
|
||||
#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE, TC) \
|
||||
static inline uniform TA atomic_compare_exchange_global( \
|
||||
uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
|
||||
uniform TA ret = \
|
||||
@@ -1947,6 +2003,10 @@ static inline TA atomic_compare_exchange_global( \
|
||||
} \
|
||||
static inline TA atomic_compare_exchange_global( \
|
||||
uniform TA * varying ptr, TA oldval, TA newval) { \
|
||||
if (__is_nvptx_target) { \
|
||||
TA ret = __atomic_compare_exchange_varying_##TB##_global((TC)ptr, oldval, newval, (MASKTYPE)__mask); \
|
||||
return ret; \
|
||||
} else { \
|
||||
uniform TA * uniform ptrArray[programCount]; \
|
||||
ptrArray[programIndex] = ptr; \
|
||||
TA ret; \
|
||||
@@ -1958,14 +2018,15 @@ static inline TA atomic_compare_exchange_global( \
|
||||
ret = insert(ret, i, r); \
|
||||
} \
|
||||
return ret; \
|
||||
} \
|
||||
}
|
||||
|
||||
ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType)
|
||||
ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType)
|
||||
ATOMIC_DECL_CMPXCHG(float, float, IntMaskType)
|
||||
ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType)
|
||||
ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType)
|
||||
ATOMIC_DECL_CMPXCHG(double, double, IntMaskType)
|
||||
ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType,int64)
|
||||
ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType,unsigned int64)
|
||||
ATOMIC_DECL_CMPXCHG(float, float, IntMaskType,int64)
|
||||
ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType,int64)
|
||||
ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType,unsigned int64)
|
||||
ATOMIC_DECL_CMPXCHG(double, double, IntMaskType,int64)
|
||||
|
||||
#undef ATOMIC_DECL_CMPXCHG
|
||||
|
||||
@@ -2032,12 +2093,20 @@ static inline TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, TYPE value)
|
||||
} \
|
||||
static inline TYPE atomic_##NAME##_local(uniform TYPE * p, TYPE value) { \
|
||||
TYPE ret; \
|
||||
if (__is_nvptx_target) { \
|
||||
foreach_active (i) { \
|
||||
uniform TYPE * uniform ptr = (uniform TYPE * uniform)extract((int64)p, i); \
|
||||
ret = insert(ret, i, *ptr); \
|
||||
*ptr = OPFUNC(*ptr, extract(value, i)); \
|
||||
} \
|
||||
} else { \
|
||||
uniform TYPE * uniform ptrs[programCount]; \
|
||||
ptrs[programIndex] = p; \
|
||||
foreach_active (i) { \
|
||||
ret = insert(ret, i, *ptrs[i]); \
|
||||
*ptrs[i] = OPFUNC(*ptrs[i], extract(value, i)); \
|
||||
} \
|
||||
} \
|
||||
return ret; \
|
||||
}
|
||||
|
||||
|
||||
727
stmt.cpp
727
stmt.cpp
@@ -142,6 +142,62 @@ lHasUnsizedArrays(const Type *type) {
|
||||
return lHasUnsizedArrays(at->GetElementType());
|
||||
}
|
||||
|
||||
static llvm::Value* lConvertToGenericPtr(FunctionEmitContext *ctx, llvm::Value *value, const SourcePos ¤tPos, const bool variable = false)
|
||||
{
|
||||
if (!value->getType()->isPointerTy() || g->target->getISA() != Target::NVPTX)
|
||||
return value;
|
||||
llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(value->getType());
|
||||
const int addressSpace = pt->getAddressSpace();
|
||||
if (addressSpace != 3 && addressSpace != 4)
|
||||
return value;
|
||||
|
||||
llvm::Type *elTy = pt->getElementType();
|
||||
|
||||
/* convert elTy addrspace(3)* to i64* addrspace(3)* */
|
||||
llvm::PointerType *Int64Ptr3 = llvm::PointerType::get(LLVMTypes::Int64Type, addressSpace);
|
||||
value = ctx->BitCastInst(value, Int64Ptr3, "gep2gen_cast1");
|
||||
|
||||
/* convert i64* addrspace(3) to i64* */
|
||||
llvm::Function *__cvt2gen = m->module->getFunction(
|
||||
addressSpace == 3 ? (variable ? "__cvt_loc2gen_var" : "__cvt_loc2gen") : "__cvt_const2gen");
|
||||
|
||||
std::vector<llvm::Value *> __cvt2gen_args;
|
||||
__cvt2gen_args.push_back(value);
|
||||
value = llvm::CallInst::Create(__cvt2gen, __cvt2gen_args, variable ? "gep2gen_cvt_var" : "gep2gen_cvt", ctx->GetCurrentBasicBlock());
|
||||
|
||||
/* compute offset */
|
||||
if (addressSpace == 3)
|
||||
{
|
||||
assert(elTy->isArrayTy());
|
||||
const int numElTot = elTy->getArrayNumElements();
|
||||
const int numEl = numElTot/4;
|
||||
#if 0
|
||||
fprintf(stderr, " --- detected addrspace(3) sz= %d --- \n", numEl);
|
||||
#endif
|
||||
llvm::ArrayType *arrTy = llvm::dyn_cast<llvm::ArrayType>(pt->getArrayElementType());
|
||||
assert(arrTy != NULL);
|
||||
llvm::Type *arrElTy = arrTy->getElementType();
|
||||
#if 0
|
||||
if (arrElTy->isArrayTy())
|
||||
Error(currentPos, "Currently \"nvptx\" target doesn't support array-of-array");
|
||||
#endif
|
||||
|
||||
/* convert i64* to errElTy* */
|
||||
llvm::PointerType *arrElTyPt0 = llvm::PointerType::get(arrElTy, 0);
|
||||
value = ctx->BitCastInst(value, arrElTyPt0, "gep2gen_cast2");
|
||||
|
||||
llvm::Function *func_warp_index = m->module->getFunction("__warp_index");
|
||||
llvm::Value *warpId = ctx->CallInst(func_warp_index, NULL, std::vector<llvm::Value*>(), "gep2gen_warp_index");
|
||||
llvm::Value *offset = ctx->BinaryOperator(llvm::Instruction::Mul, warpId, LLVMInt32(numEl), "gep2gen_offset");
|
||||
value = llvm::GetElementPtrInst::Create(value, offset, "gep2gen_offset", ctx->GetCurrentBasicBlock());
|
||||
}
|
||||
|
||||
/* convert arrElTy* to elTy* */
|
||||
llvm::PointerType *elTyPt0 = llvm::PointerType::get(elTy, 0);
|
||||
value = ctx->BitCastInst(value, elTyPt0, "gep2gen_cast3");
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
void
|
||||
DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
@@ -205,7 +261,22 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
if (sym->storageClass == SC_STATIC) {
|
||||
|
||||
if (g->target->getISA() == Target::NVPTX && !sym->type->IsConstType())
|
||||
PerformanceWarning(sym->pos,
|
||||
"Non-constant static variable ""\"%s\" is stored in __global address sace with ""\"nvptx\" target.",
|
||||
sym->name.c_str());
|
||||
if (g->target->getISA() == Target::NVPTX && sym->type->IsVaryingType())
|
||||
PerformanceWarning(sym->pos,
|
||||
"\"const static varying\" variable ""\"%s\" is stored in __global address space with ""\"nvptx\" target.",
|
||||
sym->name.c_str());
|
||||
if (g->target->getISA() == Target::NVPTX && sym->type->IsUniformType())
|
||||
PerformanceWarning(sym->pos,
|
||||
"\"const static uniform\" variable ""\"%s\" is stored in __constant address space with ""\"nvptx\" target.",
|
||||
sym->name.c_str());
|
||||
|
||||
// For static variables, we need a compile-time constant value
|
||||
// for its initializer; if there's no initializer, we use a
|
||||
// zero value.
|
||||
@@ -233,19 +304,97 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
if (cinit == NULL)
|
||||
cinit = llvm::Constant::getNullValue(llvmType);
|
||||
|
||||
int addressSpace = 0;
|
||||
if (g->target->getISA() == Target::NVPTX &&
|
||||
sym->type->IsConstType() &&
|
||||
sym->type->IsUniformType())
|
||||
addressSpace = 4;
|
||||
|
||||
// Allocate space for the static variable in global scope, so
|
||||
// that it persists across function calls
|
||||
sym->storagePtr =
|
||||
new llvm::GlobalVariable(*m->module, llvmType,
|
||||
sym->type->IsConstType(),
|
||||
llvm::GlobalValue::InternalLinkage, cinit,
|
||||
llvm::Twine("static.") +
|
||||
llvm::Twine("static_") +
|
||||
llvm::Twine(sym->pos.first_line) +
|
||||
llvm::Twine(".") + sym->name.c_str());
|
||||
llvm::Twine("_") + sym->name.c_str(),
|
||||
NULL,
|
||||
llvm::GlobalVariable::NotThreadLocal,
|
||||
addressSpace);
|
||||
sym->storagePtr = lConvertToGenericPtr(ctx, sym->storagePtr, sym->pos);
|
||||
// Tell the FunctionEmitContext about the variable
|
||||
ctx->EmitVariableDebugInfo(sym);
|
||||
}
|
||||
else {
|
||||
else if ((sym->type->IsUniformType() || sym->type->IsSOAType()) &&
|
||||
/* NVPTX:
|
||||
* only non-constant uniform data types are stored in shared memory
|
||||
* constant uniform are automatically promoted to varying
|
||||
*/
|
||||
!sym->type->IsConstType() &&
|
||||
#if 1
|
||||
sym->type->IsArrayType() &&
|
||||
#endif
|
||||
g->target->getISA() == Target::NVPTX)
|
||||
{
|
||||
PerformanceWarning(sym->pos,
|
||||
"Non-constant \"uniform\" data types might be slow with \"nvptx\" target. "
|
||||
"Unless data sharing between program instances is desired, try \"const [static] uniform\", \"varying\" or \"uniform new uniform \"+\"delete\" if possible.");
|
||||
|
||||
/* with __shared__ memory everything must be an array */
|
||||
int nel = 4;
|
||||
ArrayType *nat;
|
||||
bool variable = true;
|
||||
if (sym->type->IsArrayType())
|
||||
{
|
||||
const ArrayType *at = CastType<ArrayType>(sym->type);
|
||||
/* we must scale # elements by 4, because a thread-block will run 4 warps
|
||||
* or 128 threads.
|
||||
* ***note-to-me***:please define these value (128threads/4warps)
|
||||
* in nvptx-target definition
|
||||
* instead of compile-time constants
|
||||
*/
|
||||
nel *= at->GetElementCount();
|
||||
if (sym->type->IsSOAType())
|
||||
nel *= sym->type->GetSOAWidth();
|
||||
nat = new ArrayType(at->GetElementType(), nel);
|
||||
variable = false;
|
||||
}
|
||||
else
|
||||
nat = new ArrayType(sym->type, nel);
|
||||
|
||||
llvm::Type *llvmTypeUn = nat->LLVMType(g->ctx);
|
||||
llvm::Constant *cinit = llvm::UndefValue::get(llvmTypeUn);
|
||||
|
||||
sym->storagePtr =
|
||||
new llvm::GlobalVariable(*m->module, llvmTypeUn,
|
||||
sym->type->IsConstType(),
|
||||
llvm::GlobalValue::InternalLinkage,
|
||||
cinit,
|
||||
llvm::Twine("local_") +
|
||||
llvm::Twine(sym->pos.first_line) +
|
||||
llvm::Twine("_") + sym->name.c_str(),
|
||||
NULL,
|
||||
llvm::GlobalVariable::NotThreadLocal,
|
||||
/*AddressSpace=*/3);
|
||||
sym->storagePtr = lConvertToGenericPtr(ctx, sym->storagePtr, sym->pos, variable);
|
||||
llvm::PointerType *ptrTy = llvm::PointerType::get(sym->type->LLVMType(g->ctx),0);
|
||||
sym->storagePtr = ctx->BitCastInst(sym->storagePtr, ptrTy, "uniform_decl");
|
||||
|
||||
// Tell the FunctionEmitContext about the variable; must do
|
||||
// this before the initializer stuff.
|
||||
ctx->EmitVariableDebugInfo(sym);
|
||||
|
||||
if (initExpr == 0 && sym->type->IsConstType())
|
||||
Error(sym->pos, "Missing initializer for const variable "
|
||||
"\"%s\".", sym->name.c_str());
|
||||
|
||||
// And then get it initialized...
|
||||
sym->parentFunction = ctx->GetFunction();
|
||||
InitSymbol(sym->storagePtr, sym->type, initExpr, ctx, sym->pos);
|
||||
}
|
||||
else
|
||||
{
|
||||
// For non-static variables, allocate storage on the stack
|
||||
sym->storagePtr = ctx->AllocaInst(llvmType, sym->name.c_str());
|
||||
|
||||
@@ -253,10 +402,14 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
// this before the initializer stuff.
|
||||
ctx->EmitVariableDebugInfo(sym);
|
||||
|
||||
if (initExpr == 0 && sym->type->IsConstType())
|
||||
Error(sym->pos, "Missing initializer for const variable "
|
||||
"\"%s\".", sym->name.c_str());
|
||||
|
||||
// And then get it initialized...
|
||||
sym->parentFunction = ctx->GetFunction();
|
||||
InitSymbol(sym->storagePtr, sym->type, initExpr, ctx, sym->pos);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -415,6 +568,19 @@ IfStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
if (testValue == NULL)
|
||||
return;
|
||||
|
||||
|
||||
#if 0
|
||||
if (!isUniform && g->target->getISA() == Target::NVPTX)
|
||||
{
|
||||
/* With "nvptx" target, SIMT hardware takes care of non-uniform
|
||||
* control flow. We trick ISPC to generate uniform control flow.
|
||||
*/
|
||||
testValue = ctx->ExtractInst(testValue, 0);
|
||||
isUniform = true;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
if (isUniform) {
|
||||
ctx->StartUniformIf();
|
||||
if (doAllCheck)
|
||||
@@ -696,7 +862,11 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask,
|
||||
|
||||
// Do any of the program instances want to run the 'true'
|
||||
// block? If not, jump ahead to bNext.
|
||||
#if 1
|
||||
llvm::Value *maskAnyTrueQ = ctx->Any(ctx->GetFullMask());
|
||||
#else
|
||||
llvm::Value *maskAnyTrueQ = ctx->ExtractInst(ctx->GetFullMask(),0);
|
||||
#endif
|
||||
ctx->BranchInst(bRunTrue, bNext, maskAnyTrueQ);
|
||||
|
||||
// Emit statements for true
|
||||
@@ -713,7 +883,11 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask,
|
||||
|
||||
// Similarly, check to see if any of the instances want to
|
||||
// run the 'false' block...
|
||||
#if 1
|
||||
llvm::Value *maskAnyFalseQ = ctx->Any(ctx->GetFullMask());
|
||||
#else
|
||||
llvm::Value *maskAnyFalseQ = ctx->ExtractInst(ctx->GetFullMask(),0);
|
||||
#endif
|
||||
ctx->BranchInst(bRunFalse, bDone, maskAnyFalseQ);
|
||||
|
||||
// Emit code for false
|
||||
@@ -1273,7 +1447,10 @@ static llvm::Value *
|
||||
lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx,
|
||||
llvm::Value *uniformCounterPtr,
|
||||
llvm::Value *varyingCounterPtr,
|
||||
const std::vector<int> &spans) {
|
||||
const std::vector<int> &spans)
|
||||
{
|
||||
if (g->target->getISA() != Target::NVPTX)
|
||||
{
|
||||
// Smear the uniform counter value out to be varying
|
||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtr);
|
||||
llvm::Value *smearCounter = ctx->BroadcastValue(
|
||||
@@ -1306,6 +1483,93 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx,
|
||||
LLVMInt32Vector(delta), "iter_val");
|
||||
ctx->StoreInst(varyingCounter, varyingCounterPtr);
|
||||
return varyingCounter;
|
||||
}
|
||||
else /* NVPTX == true */
|
||||
{
|
||||
// Smear the uniform counter value out to be varying
|
||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtr);
|
||||
llvm::Value *smearCounter = ctx->BroadcastValue(
|
||||
counter, LLVMTypes::Int32VectorType, "smear_counter");
|
||||
|
||||
// Figure out the offsets; this is a little bit tricky. As an example,
|
||||
// consider a 2D tiled foreach loop, where we're running 8-wide and
|
||||
// where the inner dimension has a stride of 4 and the outer dimension
|
||||
// has a stride of 2. For the inner dimension, we want the offsets
|
||||
// (0,1,2,3,0,1,2,3), and for the outer dimension we want
|
||||
// (0,0,0,0,1,1,1,1).
|
||||
int32_t delta[ISPC_MAX_NVEC];
|
||||
const int vecWidth = 32;
|
||||
std::vector<llvm::Constant*> constDeltaList;
|
||||
for (int i = 0; i < vecWidth; ++i)
|
||||
{
|
||||
int d = i;
|
||||
// First, account for the effect of any dimensions at deeper
|
||||
// nesting levels than the current one.
|
||||
int prevDimSpanCount = 1;
|
||||
for (int j = dim; j < nDims-1; ++j)
|
||||
prevDimSpanCount *= spans[j+1];
|
||||
d /= prevDimSpanCount;
|
||||
|
||||
// And now with what's left, figure out our own offset
|
||||
delta[i] = d % spans[dim];
|
||||
constDeltaList.push_back(LLVMInt8(delta[i]));
|
||||
}
|
||||
|
||||
llvm::ArrayType* ArrayDelta = llvm::ArrayType::get(LLVMTypes::Int8Type, 32);
|
||||
// llvm::PointerType::get(ArrayDelta, 4); /* constant memory */
|
||||
|
||||
|
||||
llvm::GlobalVariable* globalDelta = new llvm::GlobalVariable(
|
||||
/*Module=*/*m->module,
|
||||
/*Type=*/ArrayDelta,
|
||||
/*isConstant=*/true,
|
||||
/*Linkage=*/llvm::GlobalValue::PrivateLinkage,
|
||||
/*Initializer=*/0, // has initializer, specified below
|
||||
/*Name=*/"constDeltaForeach");
|
||||
#if 0
|
||||
/*ThreadLocalMode=*/llvm::GlobalVariable::NotThreadLocal,
|
||||
/*unsigned AddressSpace=*/4 /*constant*/);
|
||||
#endif
|
||||
|
||||
|
||||
llvm::Constant* constDelta = llvm::ConstantArray::get(ArrayDelta, constDeltaList);
|
||||
|
||||
globalDelta->setInitializer(constDelta);
|
||||
llvm::Function *func_program_index = m->module->getFunction("__program_index");
|
||||
llvm::Value *laneIdx = ctx->CallInst(func_program_index, NULL, std::vector<llvm::Value*>(), "foreach__programIndex");
|
||||
|
||||
std::vector<llvm::Value*> ptr_arrayidx_indices;
|
||||
ptr_arrayidx_indices.push_back(LLVMInt32(0));
|
||||
ptr_arrayidx_indices.push_back(laneIdx);
|
||||
#if 1
|
||||
llvm::Instruction* ptr_arrayidx = llvm::GetElementPtrInst::Create(globalDelta, ptr_arrayidx_indices, "arrayidx", ctx->GetCurrentBasicBlock());
|
||||
llvm::LoadInst* int8_39 = new llvm::LoadInst(ptr_arrayidx, "", false, ctx->GetCurrentBasicBlock());
|
||||
llvm::Value * int32_39 = ctx->ZExtInst(int8_39, LLVMTypes::Int32Type);
|
||||
|
||||
llvm::VectorType* VectorTy_2 = llvm::VectorType::get(llvm::IntegerType::get(*g->ctx, 32), 1);
|
||||
llvm::UndefValue* const_packed_41 = llvm::UndefValue::get(VectorTy_2);
|
||||
|
||||
llvm::InsertElementInst* packed_43 = llvm::InsertElementInst::Create(
|
||||
// llvm::UndefValue(LLVMInt32Vector),
|
||||
const_packed_41,
|
||||
int32_39, LLVMInt32(0), "", ctx->GetCurrentBasicBlock());
|
||||
#endif
|
||||
|
||||
|
||||
// Add the deltas to compute the varying counter values; store the
|
||||
// result to memory and then return it directly as well.
|
||||
#if 0
|
||||
llvm::Value *varyingCounter =
|
||||
ctx->BinaryOperator(llvm::Instruction::Add, smearCounter,
|
||||
LLVMInt32Vector(delta), "iter_val");
|
||||
#else
|
||||
llvm::Value *varyingCounter =
|
||||
ctx->BinaryOperator(llvm::Instruction::Add, smearCounter,
|
||||
packed_43, "iter_val");
|
||||
#endif
|
||||
ctx->StoreInst(varyingCounter, varyingCounterPtr);
|
||||
return varyingCounter;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1383,7 +1647,7 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
|
||||
// This should be caught during typechecking
|
||||
AssertPos(pos, startExprs.size() == dimVariables.size() &&
|
||||
endExprs.size() == dimVariables.size());
|
||||
endExprs.size() == dimVariables.size());
|
||||
int nDims = (int)dimVariables.size();
|
||||
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
@@ -1394,64 +1658,66 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
std::vector<llvm::Value *> nExtras, alignedEnd, extrasMaskPtrs;
|
||||
|
||||
std::vector<int> span(nDims, 0);
|
||||
lGetSpans(nDims-1, nDims, g->target->getVectorWidth(), isTiled, &span[0]);
|
||||
const int vectorWidth =
|
||||
g->target->getISA() == Target::NVPTX ? 32 : g->target->getVectorWidth();
|
||||
lGetSpans(nDims-1, nDims, vectorWidth, isTiled, &span[0]);
|
||||
|
||||
for (int i = 0; i < nDims; ++i) {
|
||||
// Basic blocks that we'll fill in later with the looping logic for
|
||||
// this dimension.
|
||||
bbReset.push_back(ctx->CreateBasicBlock("foreach_reset"));
|
||||
if (i < nDims-1)
|
||||
// stepping for the innermost dimension is handled specially
|
||||
bbStep.push_back(ctx->CreateBasicBlock("foreach_step"));
|
||||
bbTest.push_back(ctx->CreateBasicBlock("foreach_test"));
|
||||
// Basic blocks that we'll fill in later with the looping logic for
|
||||
// this dimension.
|
||||
bbReset.push_back(ctx->CreateBasicBlock("foreach_reset"));
|
||||
if (i < nDims-1)
|
||||
// stepping for the innermost dimension is handled specially
|
||||
bbStep.push_back(ctx->CreateBasicBlock("foreach_step"));
|
||||
bbTest.push_back(ctx->CreateBasicBlock("foreach_test"));
|
||||
|
||||
// Start and end value for this loop dimension
|
||||
llvm::Value *sv = startExprs[i]->GetValue(ctx);
|
||||
llvm::Value *ev = endExprs[i]->GetValue(ctx);
|
||||
if (sv == NULL || ev == NULL)
|
||||
return;
|
||||
startVals.push_back(sv);
|
||||
endVals.push_back(ev);
|
||||
// Start and end value for this loop dimension
|
||||
llvm::Value *sv = startExprs[i]->GetValue(ctx);
|
||||
llvm::Value *ev = endExprs[i]->GetValue(ctx);
|
||||
if (sv == NULL || ev == NULL)
|
||||
return;
|
||||
startVals.push_back(sv);
|
||||
endVals.push_back(ev);
|
||||
|
||||
// nItems = endVal - startVal
|
||||
llvm::Value *nItems =
|
||||
ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems");
|
||||
// nItems = endVal - startVal
|
||||
llvm::Value *nItems =
|
||||
ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems");
|
||||
|
||||
// nExtras = nItems % (span for this dimension)
|
||||
// This gives us the number of extra elements we need to deal with
|
||||
// at the end of the loop for this dimension that don't fit cleanly
|
||||
// into a vector width.
|
||||
nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems,
|
||||
LLVMInt32(span[i]), "nextras"));
|
||||
// nExtras = nItems % (span for this dimension)
|
||||
// This gives us the number of extra elements we need to deal with
|
||||
// at the end of the loop for this dimension that don't fit cleanly
|
||||
// into a vector width.
|
||||
nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems,
|
||||
LLVMInt32(span[i]), "nextras"));
|
||||
|
||||
// alignedEnd = endVal - nExtras
|
||||
alignedEnd.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev,
|
||||
nExtras[i], "aligned_end"));
|
||||
// alignedEnd = endVal - nExtras
|
||||
alignedEnd.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev,
|
||||
nExtras[i], "aligned_end"));
|
||||
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// Each dimension has a loop counter that is a uniform value that
|
||||
// goes from startVal to endVal, in steps of the span for this
|
||||
// dimension. Its value is only used internally here for looping
|
||||
// logic and isn't directly available in the user's program code.
|
||||
uniformCounterPtrs.push_back(ctx->AllocaInst(LLVMTypes::Int32Type,
|
||||
"counter"));
|
||||
ctx->StoreInst(startVals[i], uniformCounterPtrs[i]);
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// Each dimension has a loop counter that is a uniform value that
|
||||
// goes from startVal to endVal, in steps of the span for this
|
||||
// dimension. Its value is only used internally here for looping
|
||||
// logic and isn't directly available in the user's program code.
|
||||
uniformCounterPtrs.push_back(ctx->AllocaInst(LLVMTypes::Int32Type,
|
||||
"counter"));
|
||||
ctx->StoreInst(startVals[i], uniformCounterPtrs[i]);
|
||||
|
||||
// There is also a varying variable that holds the set of index
|
||||
// values for each dimension in the current loop iteration; this is
|
||||
// the value that is program-visible.
|
||||
dimVariables[i]->storagePtr =
|
||||
ctx->AllocaInst(LLVMTypes::Int32VectorType,
|
||||
dimVariables[i]->name.c_str());
|
||||
dimVariables[i]->parentFunction = ctx->GetFunction();
|
||||
ctx->EmitVariableDebugInfo(dimVariables[i]);
|
||||
// There is also a varying variable that holds the set of index
|
||||
// values for each dimension in the current loop iteration; this is
|
||||
// the value that is program-visible.
|
||||
dimVariables[i]->storagePtr =
|
||||
ctx->AllocaInst(LLVMTypes::Int32VectorType,
|
||||
dimVariables[i]->name.c_str());
|
||||
dimVariables[i]->parentFunction = ctx->GetFunction();
|
||||
ctx->EmitVariableDebugInfo(dimVariables[i]);
|
||||
|
||||
// Each dimension also maintains a mask that represents which of
|
||||
// the varying elements in the current iteration should be
|
||||
// processed. (i.e. this is used to disable the lanes that have
|
||||
// out-of-bounds offsets.)
|
||||
extrasMaskPtrs.push_back(ctx->AllocaInst(LLVMTypes::MaskType, "extras mask"));
|
||||
ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]);
|
||||
// Each dimension also maintains a mask that represents which of
|
||||
// the varying elements in the current iteration should be
|
||||
// processed. (i.e. this is used to disable the lanes that have
|
||||
// out-of-bounds offsets.)
|
||||
extrasMaskPtrs.push_back(ctx->AllocaInst(LLVMTypes::MaskType, "extras mask"));
|
||||
ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]);
|
||||
}
|
||||
|
||||
ctx->StartForeach(FunctionEmitContext::FOREACH_REGULAR);
|
||||
@@ -1464,14 +1730,14 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
// a given dimension in preparation for running through its loop again,
|
||||
// after the enclosing level advances its counter.
|
||||
for (int i = 0; i < nDims; ++i) {
|
||||
ctx->SetCurrentBasicBlock(bbReset[i]);
|
||||
if (i == 0)
|
||||
ctx->BranchInst(bbExit);
|
||||
else {
|
||||
ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]);
|
||||
ctx->StoreInst(startVals[i], uniformCounterPtrs[i]);
|
||||
ctx->BranchInst(bbStep[i-1]);
|
||||
}
|
||||
ctx->SetCurrentBasicBlock(bbReset[i]);
|
||||
if (i == 0)
|
||||
ctx->BranchInst(bbExit);
|
||||
else {
|
||||
ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]);
|
||||
ctx->StoreInst(startVals[i], uniformCounterPtrs[i]);
|
||||
ctx->BranchInst(bbStep[i-1]);
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
@@ -1481,67 +1747,67 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
// this for the innermost dimension, which has a more complex stepping
|
||||
// structure..
|
||||
for (int i = 0; i < nDims-1; ++i) {
|
||||
ctx->SetCurrentBasicBlock(bbStep[i]);
|
||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]);
|
||||
llvm::Value *newCounter =
|
||||
ctx->BinaryOperator(llvm::Instruction::Add, counter,
|
||||
LLVMInt32(span[i]), "new_counter");
|
||||
ctx->StoreInst(newCounter, uniformCounterPtrs[i]);
|
||||
ctx->BranchInst(bbTest[i]);
|
||||
ctx->SetCurrentBasicBlock(bbStep[i]);
|
||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]);
|
||||
llvm::Value *newCounter =
|
||||
ctx->BinaryOperator(llvm::Instruction::Add, counter,
|
||||
LLVMInt32(span[i]), "new_counter");
|
||||
ctx->StoreInst(newCounter, uniformCounterPtrs[i]);
|
||||
ctx->BranchInst(bbTest[i]);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// foreach_test (for all dimensions other than the innermost...)
|
||||
std::vector<llvm::Value *> inExtras;
|
||||
for (int i = 0; i < nDims-1; ++i) {
|
||||
ctx->SetCurrentBasicBlock(bbTest[i]);
|
||||
ctx->SetCurrentBasicBlock(bbTest[i]);
|
||||
|
||||
llvm::Value *haveExtras =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT,
|
||||
endVals[i], alignedEnd[i], "have_extras");
|
||||
llvm::Value *haveExtras =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT,
|
||||
endVals[i], alignedEnd[i], "have_extras");
|
||||
|
||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i], "counter");
|
||||
llvm::Value *atAlignedEnd =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
|
||||
counter, alignedEnd[i], "at_aligned_end");
|
||||
llvm::Value *inEx =
|
||||
ctx->BinaryOperator(llvm::Instruction::And, haveExtras,
|
||||
atAlignedEnd, "in_extras");
|
||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i], "counter");
|
||||
llvm::Value *atAlignedEnd =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
|
||||
counter, alignedEnd[i], "at_aligned_end");
|
||||
llvm::Value *inEx =
|
||||
ctx->BinaryOperator(llvm::Instruction::And, haveExtras,
|
||||
atAlignedEnd, "in_extras");
|
||||
|
||||
if (i == 0)
|
||||
inExtras.push_back(inEx);
|
||||
else
|
||||
inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx,
|
||||
inExtras[i-1], "in_extras_all"));
|
||||
if (i == 0)
|
||||
inExtras.push_back(inEx);
|
||||
else
|
||||
inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx,
|
||||
inExtras[i-1], "in_extras_all"));
|
||||
|
||||
llvm::Value *varyingCounter =
|
||||
lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i],
|
||||
dimVariables[i]->storagePtr, span);
|
||||
llvm::Value *varyingCounter =
|
||||
lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i],
|
||||
dimVariables[i]->storagePtr, span);
|
||||
|
||||
llvm::Value *smearEnd = ctx->BroadcastValue(
|
||||
endVals[i], LLVMTypes::Int32VectorType, "smear_end");
|
||||
llvm::Value *smearEnd = ctx->BroadcastValue(
|
||||
endVals[i], LLVMTypes::Int32VectorType, "smear_end");
|
||||
|
||||
// Do a vector compare of its value to the end value to generate a
|
||||
// mask for this last bit of work.
|
||||
llvm::Value *emask =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
|
||||
varyingCounter, smearEnd);
|
||||
emask = ctx->I1VecToBoolVec(emask);
|
||||
// Do a vector compare of its value to the end value to generate a
|
||||
// mask for this last bit of work.
|
||||
llvm::Value *emask =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
|
||||
varyingCounter, smearEnd);
|
||||
emask = ctx->I1VecToBoolVec(emask);
|
||||
|
||||
if (i == 0)
|
||||
ctx->StoreInst(emask, extrasMaskPtrs[i]);
|
||||
else {
|
||||
llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]);
|
||||
llvm::Value *newMask =
|
||||
ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask,
|
||||
"extras_mask");
|
||||
ctx->StoreInst(newMask, extrasMaskPtrs[i]);
|
||||
}
|
||||
if (i == 0)
|
||||
ctx->StoreInst(emask, extrasMaskPtrs[i]);
|
||||
else {
|
||||
llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]);
|
||||
llvm::Value *newMask =
|
||||
ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask,
|
||||
"extras_mask");
|
||||
ctx->StoreInst(newMask, extrasMaskPtrs[i]);
|
||||
}
|
||||
|
||||
llvm::Value *notAtEnd =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
|
||||
counter, endVals[i]);
|
||||
ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd);
|
||||
llvm::Value *notAtEnd =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
|
||||
counter, endVals[i]);
|
||||
ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
@@ -1578,18 +1844,18 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
// (i.e. processing extra elements that don't exactly fit into a
|
||||
// vector).
|
||||
llvm::BasicBlock *bbOuterInExtras =
|
||||
ctx->CreateBasicBlock("outer_in_extras");
|
||||
ctx->CreateBasicBlock("outer_in_extras");
|
||||
llvm::BasicBlock *bbOuterNotInExtras =
|
||||
ctx->CreateBasicBlock("outer_not_in_extras");
|
||||
ctx->CreateBasicBlock("outer_not_in_extras");
|
||||
|
||||
ctx->SetCurrentBasicBlock(bbTest[nDims-1]);
|
||||
if (inExtras.size())
|
||||
ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras,
|
||||
inExtras.back());
|
||||
ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras,
|
||||
inExtras.back());
|
||||
else
|
||||
// for a 1D iteration domain, we certainly don't have any enclosing
|
||||
// dimensions that are processing extra elements.
|
||||
ctx->BranchInst(bbOuterNotInExtras);
|
||||
// for a 1D iteration domain, we certainly don't have any enclosing
|
||||
// dimensions that are processing extra elements.
|
||||
ctx->BranchInst(bbOuterNotInExtras);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// One or more outer dimensions in extras, so we need to mask for the loop
|
||||
@@ -1604,21 +1870,21 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
// // run loop body with mask
|
||||
// }
|
||||
llvm::BasicBlock *bbAllInnerPartialOuter =
|
||||
ctx->CreateBasicBlock("all_inner_partial_outer");
|
||||
ctx->CreateBasicBlock("all_inner_partial_outer");
|
||||
llvm::BasicBlock *bbPartial =
|
||||
ctx->CreateBasicBlock("both_partial");
|
||||
ctx->CreateBasicBlock("both_partial");
|
||||
ctx->SetCurrentBasicBlock(bbOuterInExtras); {
|
||||
// Update the varying counter value here, since all subsequent
|
||||
// blocks along this path need it.
|
||||
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
|
||||
dimVariables[nDims-1]->storagePtr, span);
|
||||
// Update the varying counter value here, since all subsequent
|
||||
// blocks along this path need it.
|
||||
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
|
||||
dimVariables[nDims-1]->storagePtr, span);
|
||||
|
||||
// here we just check to see if counter < alignedEnd
|
||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
|
||||
llvm::Value *beforeAlignedEnd =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
|
||||
counter, alignedEnd[nDims-1], "before_aligned_end");
|
||||
ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd);
|
||||
// here we just check to see if counter < alignedEnd
|
||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
|
||||
llvm::Value *beforeAlignedEnd =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
|
||||
counter, alignedEnd[nDims-1], "before_aligned_end");
|
||||
ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd);
|
||||
}
|
||||
|
||||
// Below we have a basic block that runs the loop body code for the
|
||||
@@ -1637,53 +1903,53 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
// should step the loop counter for the next enclosing dimension
|
||||
// instead.
|
||||
llvm::Value *stepIndexAfterMaskedBodyPtr =
|
||||
ctx->AllocaInst(LLVMTypes::BoolType, "step_index");
|
||||
ctx->AllocaInst(LLVMTypes::BoolType, "step_index");
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// We're in the inner loop part where the only masking is due to outer
|
||||
// dimensions but the innermost dimension fits fully into a vector's
|
||||
// width. Set the mask and jump to the masked loop body.
|
||||
ctx->SetCurrentBasicBlock(bbAllInnerPartialOuter); {
|
||||
llvm::Value *mask;
|
||||
if (nDims == 1)
|
||||
// 1D loop; we shouldn't ever get here anyway
|
||||
mask = LLVMMaskAllOff;
|
||||
else
|
||||
mask = ctx->LoadInst(extrasMaskPtrs[nDims-2]);
|
||||
llvm::Value *mask;
|
||||
if (nDims == 1)
|
||||
// 1D loop; we shouldn't ever get here anyway
|
||||
mask = LLVMMaskAllOff;
|
||||
else
|
||||
mask = ctx->LoadInst(extrasMaskPtrs[nDims-2]);
|
||||
|
||||
ctx->SetInternalMask(mask);
|
||||
ctx->SetInternalMask(mask);
|
||||
|
||||
ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr);
|
||||
ctx->BranchInst(bbMaskedBody);
|
||||
ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr);
|
||||
ctx->BranchInst(bbMaskedBody);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// We need to include the effect of the innermost dimension in the mask
|
||||
// for the final bits here
|
||||
ctx->SetCurrentBasicBlock(bbPartial); {
|
||||
llvm::Value *varyingCounter =
|
||||
ctx->LoadInst(dimVariables[nDims-1]->storagePtr);
|
||||
llvm::Value *smearEnd = ctx->BroadcastValue(
|
||||
endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end");
|
||||
llvm::Value *varyingCounter =
|
||||
ctx->LoadInst(dimVariables[nDims-1]->storagePtr);
|
||||
llvm::Value *smearEnd = ctx->BroadcastValue(
|
||||
endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end");
|
||||
|
||||
llvm::Value *emask =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
|
||||
varyingCounter, smearEnd);
|
||||
emask = ctx->I1VecToBoolVec(emask);
|
||||
llvm::Value *emask =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
|
||||
varyingCounter, smearEnd);
|
||||
emask = ctx->I1VecToBoolVec(emask);
|
||||
|
||||
if (nDims == 1) {
|
||||
ctx->SetInternalMask(emask);
|
||||
}
|
||||
else {
|
||||
llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]);
|
||||
llvm::Value *newMask =
|
||||
ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask,
|
||||
"extras_mask");
|
||||
ctx->SetInternalMask(newMask);
|
||||
}
|
||||
if (nDims == 1) {
|
||||
ctx->SetInternalMask(emask);
|
||||
}
|
||||
else {
|
||||
llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]);
|
||||
llvm::Value *newMask =
|
||||
ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask,
|
||||
"extras_mask");
|
||||
ctx->SetInternalMask(newMask);
|
||||
}
|
||||
|
||||
ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr);
|
||||
ctx->BranchInst(bbMaskedBody);
|
||||
ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr);
|
||||
ctx->BranchInst(bbMaskedBody);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
@@ -1699,14 +1965,14 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
// // run loop body with mask
|
||||
// }
|
||||
llvm::BasicBlock *bbPartialInnerAllOuter =
|
||||
ctx->CreateBasicBlock("partial_inner_all_outer");
|
||||
ctx->CreateBasicBlock("partial_inner_all_outer");
|
||||
ctx->SetCurrentBasicBlock(bbOuterNotInExtras); {
|
||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
|
||||
llvm::Value *beforeAlignedEnd =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
|
||||
counter, alignedEnd[nDims-1], "before_aligned_end");
|
||||
ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter,
|
||||
beforeAlignedEnd);
|
||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
|
||||
llvm::Value *beforeAlignedEnd =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
|
||||
counter, alignedEnd[nDims-1], "before_aligned_end");
|
||||
ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter,
|
||||
beforeAlignedEnd);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
@@ -1716,26 +1982,26 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
// value of the varying loop counter and have the statements in the
|
||||
// loop body emit their code.
|
||||
llvm::BasicBlock *bbFullBodyContinue =
|
||||
ctx->CreateBasicBlock("foreach_full_continue");
|
||||
ctx->CreateBasicBlock("foreach_full_continue");
|
||||
ctx->SetCurrentBasicBlock(bbFullBody); {
|
||||
ctx->SetInternalMask(LLVMMaskAllOn);
|
||||
ctx->SetBlockEntryMask(LLVMMaskAllOn);
|
||||
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
|
||||
dimVariables[nDims-1]->storagePtr, span);
|
||||
ctx->SetContinueTarget(bbFullBodyContinue);
|
||||
ctx->AddInstrumentationPoint("foreach loop body (all on)");
|
||||
stmts->EmitCode(ctx);
|
||||
AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL);
|
||||
ctx->BranchInst(bbFullBodyContinue);
|
||||
ctx->SetInternalMask(LLVMMaskAllOn);
|
||||
ctx->SetBlockEntryMask(LLVMMaskAllOn);
|
||||
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
|
||||
dimVariables[nDims-1]->storagePtr, span);
|
||||
ctx->SetContinueTarget(bbFullBodyContinue);
|
||||
ctx->AddInstrumentationPoint("foreach loop body (all on)");
|
||||
stmts->EmitCode(ctx);
|
||||
AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL);
|
||||
ctx->BranchInst(bbFullBodyContinue);
|
||||
}
|
||||
ctx->SetCurrentBasicBlock(bbFullBodyContinue); {
|
||||
ctx->RestoreContinuedLanes();
|
||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]);
|
||||
llvm::Value *newCounter =
|
||||
ctx->BinaryOperator(llvm::Instruction::Add, counter,
|
||||
LLVMInt32(span[nDims-1]), "new_counter");
|
||||
ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]);
|
||||
ctx->BranchInst(bbOuterNotInExtras);
|
||||
ctx->RestoreContinuedLanes();
|
||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]);
|
||||
llvm::Value *newCounter =
|
||||
ctx->BinaryOperator(llvm::Instruction::Add, counter,
|
||||
LLVMInt32(span[nDims-1]), "new_counter");
|
||||
ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]);
|
||||
ctx->BranchInst(bbOuterNotInExtras);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
@@ -1743,33 +2009,33 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
// less than the end value, in which case we need to run the body one
|
||||
// more time to get the extra bits.
|
||||
llvm::BasicBlock *bbSetInnerMask =
|
||||
ctx->CreateBasicBlock("partial_inner_only");
|
||||
ctx->CreateBasicBlock("partial_inner_only");
|
||||
ctx->SetCurrentBasicBlock(bbPartialInnerAllOuter); {
|
||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
|
||||
llvm::Value *beforeFullEnd =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
|
||||
counter, endVals[nDims-1], "before_full_end");
|
||||
ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd);
|
||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
|
||||
llvm::Value *beforeFullEnd =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
|
||||
counter, endVals[nDims-1], "before_full_end");
|
||||
ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// The outer dimensions are all on, so the mask is just given by the
|
||||
// mask for the innermost dimension
|
||||
ctx->SetCurrentBasicBlock(bbSetInnerMask); {
|
||||
llvm::Value *varyingCounter =
|
||||
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
|
||||
dimVariables[nDims-1]->storagePtr, span);
|
||||
llvm::Value *smearEnd = ctx->BroadcastValue(
|
||||
endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end");
|
||||
llvm::Value *emask =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
|
||||
varyingCounter, smearEnd);
|
||||
emask = ctx->I1VecToBoolVec(emask);
|
||||
ctx->SetInternalMask(emask);
|
||||
ctx->SetBlockEntryMask(emask);
|
||||
llvm::Value *varyingCounter =
|
||||
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
|
||||
dimVariables[nDims-1]->storagePtr, span);
|
||||
llvm::Value *smearEnd = ctx->BroadcastValue(
|
||||
endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end");
|
||||
llvm::Value *emask =
|
||||
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
|
||||
varyingCounter, smearEnd);
|
||||
emask = ctx->I1VecToBoolVec(emask);
|
||||
ctx->SetInternalMask(emask);
|
||||
ctx->SetBlockEntryMask(emask);
|
||||
|
||||
ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr);
|
||||
ctx->BranchInst(bbMaskedBody);
|
||||
ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr);
|
||||
ctx->BranchInst(bbMaskedBody);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
@@ -1779,34 +2045,34 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
// mask known to be all-on, which in turn leads to more efficient code
|
||||
// for that case.
|
||||
llvm::BasicBlock *bbStepInnerIndex =
|
||||
ctx->CreateBasicBlock("step_inner_index");
|
||||
ctx->CreateBasicBlock("step_inner_index");
|
||||
llvm::BasicBlock *bbMaskedBodyContinue =
|
||||
ctx->CreateBasicBlock("foreach_masked_continue");
|
||||
ctx->CreateBasicBlock("foreach_masked_continue");
|
||||
ctx->SetCurrentBasicBlock(bbMaskedBody); {
|
||||
ctx->AddInstrumentationPoint("foreach loop body (masked)");
|
||||
ctx->SetContinueTarget(bbMaskedBodyContinue);
|
||||
ctx->DisableGatherScatterWarnings();
|
||||
ctx->SetBlockEntryMask(ctx->GetFullMask());
|
||||
stmts->EmitCode(ctx);
|
||||
ctx->EnableGatherScatterWarnings();
|
||||
ctx->BranchInst(bbMaskedBodyContinue);
|
||||
ctx->AddInstrumentationPoint("foreach loop body (masked)");
|
||||
ctx->SetContinueTarget(bbMaskedBodyContinue);
|
||||
ctx->DisableGatherScatterWarnings();
|
||||
ctx->SetBlockEntryMask(ctx->GetFullMask());
|
||||
stmts->EmitCode(ctx);
|
||||
ctx->EnableGatherScatterWarnings();
|
||||
ctx->BranchInst(bbMaskedBodyContinue);
|
||||
}
|
||||
ctx->SetCurrentBasicBlock(bbMaskedBodyContinue); {
|
||||
ctx->RestoreContinuedLanes();
|
||||
llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr);
|
||||
ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex);
|
||||
ctx->RestoreContinuedLanes();
|
||||
llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr);
|
||||
ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// step the innermost index, for the case where we're doing the
|
||||
// innermost for loop over full vectors.
|
||||
ctx->SetCurrentBasicBlock(bbStepInnerIndex); {
|
||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]);
|
||||
llvm::Value *newCounter =
|
||||
ctx->BinaryOperator(llvm::Instruction::Add, counter,
|
||||
LLVMInt32(span[nDims-1]), "new_counter");
|
||||
ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]);
|
||||
ctx->BranchInst(bbOuterInExtras);
|
||||
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]);
|
||||
llvm::Value *newCounter =
|
||||
ctx->BinaryOperator(llvm::Instruction::Add, counter,
|
||||
LLVMInt32(span[nDims-1]), "new_counter");
|
||||
ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]);
|
||||
ctx->BranchInst(bbOuterInExtras);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
@@ -1993,7 +2259,8 @@ ForeachActiveStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
// math...)
|
||||
|
||||
// Get the "program index" vector value
|
||||
llvm::Value *programIndex = ctx->ProgramIndexVector();
|
||||
llvm::Value *programIndex = g->target->getISA() == Target::NVPTX ?
|
||||
ctx->ProgramIndexVectorPTX() : ctx->ProgramIndexVector();
|
||||
|
||||
// And smear the current lane out to a vector
|
||||
llvm::Value *firstSet32 =
|
||||
@@ -2189,10 +2456,19 @@ ForeachUniqueStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
|
||||
// And load the corresponding element value from the temporary
|
||||
// memory storing the value of the varying expr.
|
||||
llvm::Value *uniqueValuePtr =
|
||||
llvm::Value *uniqueValue;
|
||||
if (g->target->getISA() != Target::NVPTX)
|
||||
{
|
||||
llvm::Value *uniqueValuePtr =
|
||||
ctx->GetElementPtrInst(exprMem, LLVMInt64(0), firstSet, exprPtrType,
|
||||
"unique_index_ptr");
|
||||
llvm::Value *uniqueValue = ctx->LoadInst(uniqueValuePtr, "unique_value");
|
||||
"unique_index_ptr");
|
||||
uniqueValue = ctx->LoadInst(uniqueValuePtr, "unique_value");
|
||||
}
|
||||
else /* in case of PTX target, use __shfl PTX intrinsics via __insert/__extract function */
|
||||
{
|
||||
llvm::Value *firstSet32 = ctx->TruncInst(firstSet, LLVMTypes::Int32Type);
|
||||
uniqueValue = ctx->Extract(exprValue, firstSet32);
|
||||
}
|
||||
|
||||
// If it's a varying pointer type, need to convert from the int
|
||||
// type we store in the vector to the actual pointer type
|
||||
@@ -3100,7 +3376,8 @@ PrintStmt::EmitCode(FunctionEmitContext *ctx) const {
|
||||
}
|
||||
|
||||
// Now we can emit code to call __do_print()
|
||||
llvm::Function *printFunc = m->module->getFunction("__do_print");
|
||||
llvm::Function *printFunc = g->target->getISA() != Target::NVPTX ?
|
||||
m->module->getFunction("__do_print") : m->module->getFunction("__do_print_nvptx");
|
||||
AssertPos(pos, printFunc);
|
||||
|
||||
llvm::Value *mask = ctx->GetFullMask();
|
||||
|
||||
440
test_static_cuda.cpp
Normal file
440
test_static_cuda.cpp
Normal file
@@ -0,0 +1,440 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#define ISPC_IS_WINDOWS
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#include <windows.h>
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
#include <cstdio>
|
||||
#include <cstdint>
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
|
||||
/******************************/
|
||||
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include <cuda.h>
|
||||
#include "drvapi_error_string.h"
|
||||
#include "ispc_malloc.h"
|
||||
|
||||
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
|
||||
// These are the inline versions for all of the SDK helper functions
|
||||
void __checkCudaErrors(CUresult err, const char *file, const int line) {
|
||||
if(CUDA_SUCCESS != err) {
|
||||
std::cerr << "checkCudeErrors() Driver API error = " << err << "\""
|
||||
<< getCudaDrvErrorString(err) << "\" from file <" << file
|
||||
<< ", line " << line << "\n";
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/******************************/
|
||||
/**** Basic CUDriver API ****/
|
||||
/******************************/
|
||||
|
||||
CUcontext context;
|
||||
|
||||
static void createContext(const int deviceId = 0, const bool verbose = true)
|
||||
{
|
||||
CUdevice device;
|
||||
int devCount;
|
||||
checkCudaErrors(cuInit(0));
|
||||
checkCudaErrors(cuDeviceGetCount(&devCount));
|
||||
assert(devCount > 0);
|
||||
checkCudaErrors(cuDeviceGet(&device, deviceId < devCount ? deviceId : 0));
|
||||
|
||||
char name[128];
|
||||
checkCudaErrors(cuDeviceGetName(name, 128, device));
|
||||
if (verbose)
|
||||
std::cout << "Using CUDA Device [0]: " << name << "\n";
|
||||
|
||||
int devMajor, devMinor;
|
||||
checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device));
|
||||
if (verbose)
|
||||
std::cout << "Device Compute Capability: "
|
||||
<< devMajor << "." << devMinor << "\n";
|
||||
if (devMajor < 2) {
|
||||
if (verbose)
|
||||
std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
// Create driver context
|
||||
checkCudaErrors(cuCtxCreate(&context, 0, device));
|
||||
}
|
||||
static void destroyContext()
|
||||
{
|
||||
checkCudaErrors(cuCtxDestroy(context));
|
||||
}
|
||||
|
||||
static CUmodule loadModule(
|
||||
const char * module,
|
||||
const int maxrregcount = 64,
|
||||
const char cudadevrt_lib[] = "libcudadevrt.a",
|
||||
const size_t log_size = 32768,
|
||||
const bool print_log = true
|
||||
)
|
||||
{
|
||||
CUmodule cudaModule;
|
||||
// in this branch we use compilation with parameters
|
||||
|
||||
CUlinkState CUState;
|
||||
CUlinkState *lState = &CUState;
|
||||
const int nOptions = 8;
|
||||
CUjit_option options[nOptions];
|
||||
void* optionVals[nOptions];
|
||||
float walltime;
|
||||
size_t logSize = log_size;
|
||||
char error_log[logSize],
|
||||
info_log[logSize];
|
||||
void *cuOut;
|
||||
size_t outSize;
|
||||
int myErr = 0;
|
||||
|
||||
// Setup linker options
|
||||
// Return walltime from JIT compilation
|
||||
options[0] = CU_JIT_WALL_TIME;
|
||||
optionVals[0] = (void*) &walltime;
|
||||
// Pass a buffer for info messages
|
||||
options[1] = CU_JIT_INFO_LOG_BUFFER;
|
||||
optionVals[1] = (void*) info_log;
|
||||
// Pass the size of the info buffer
|
||||
options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
|
||||
optionVals[2] = (void*) logSize;
|
||||
// Pass a buffer for error message
|
||||
options[3] = CU_JIT_ERROR_LOG_BUFFER;
|
||||
optionVals[3] = (void*) error_log;
|
||||
// Pass the size of the error buffer
|
||||
options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
|
||||
optionVals[4] = (void*) logSize;
|
||||
// Make the linker verbose
|
||||
options[5] = CU_JIT_LOG_VERBOSE;
|
||||
optionVals[5] = (void*) 1;
|
||||
// Max # of registers/pthread
|
||||
options[6] = CU_JIT_MAX_REGISTERS;
|
||||
int jitRegCount = maxrregcount;
|
||||
optionVals[6] = (void *)(size_t)jitRegCount;
|
||||
// Caching
|
||||
options[7] = CU_JIT_CACHE_MODE;
|
||||
optionVals[7] = (void *)CU_JIT_CACHE_OPTION_CA;
|
||||
// Create a pending linker invocation
|
||||
|
||||
// Create a pending linker invocation
|
||||
checkCudaErrors(cuLinkCreate(nOptions,options, optionVals, lState));
|
||||
|
||||
#if 0
|
||||
if (sizeof(void *)==4)
|
||||
{
|
||||
// Load the PTX from the string myPtx32
|
||||
printf("Loading myPtx32[] program\n");
|
||||
// PTX May also be loaded from file, as per below.
|
||||
myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)myPtx32, strlen(myPtx32)+1, 0, 0, 0, 0);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
// Load the PTX from the string myPtx (64-bit)
|
||||
if (print_log)
|
||||
fprintf(stderr, "Loading ptx..\n");
|
||||
myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)module, strlen(module)+1, 0, 0, 0, 0);
|
||||
myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, cudadevrt_lib, 0,0,0);
|
||||
// PTX May also be loaded from file, as per below.
|
||||
// myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_PTX, "myPtx64.ptx",0,0,0);
|
||||
}
|
||||
|
||||
// Complete the linker step
|
||||
myErr = cuLinkComplete(*lState, &cuOut, &outSize);
|
||||
|
||||
if ( myErr != CUDA_SUCCESS )
|
||||
{
|
||||
// Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above.
|
||||
fprintf(stderr,"PTX Linker Error:\n%s\n",error_log);
|
||||
assert(0);
|
||||
}
|
||||
|
||||
// Linker walltime and info_log were requested in options above.
|
||||
if (print_log)
|
||||
fprintf(stderr, "CUDA Link Completed in %fms. Linker Output:\n%s\n",walltime,info_log);
|
||||
|
||||
// Load resulting cuBin into module
|
||||
checkCudaErrors(cuModuleLoadData(&cudaModule, cuOut));
|
||||
|
||||
// Destroy the linker invocation
|
||||
checkCudaErrors(cuLinkDestroy(*lState));
|
||||
return cudaModule;
|
||||
}
|
||||
static void unloadModule(CUmodule &cudaModule)
|
||||
{
|
||||
checkCudaErrors(cuModuleUnload(cudaModule));
|
||||
}
|
||||
|
||||
static CUfunction getFunction(CUmodule &cudaModule, const char * function)
|
||||
{
|
||||
CUfunction cudaFunction;
|
||||
checkCudaErrors(cuModuleGetFunction(&cudaFunction, cudaModule, function));
|
||||
return cudaFunction;
|
||||
}
|
||||
|
||||
static CUdeviceptr deviceMalloc(const size_t size)
|
||||
{
|
||||
CUdeviceptr d_buf;
|
||||
checkCudaErrors(cuMemAlloc(&d_buf, size));
|
||||
return d_buf;
|
||||
}
|
||||
static void deviceFree(CUdeviceptr d_buf)
|
||||
{
|
||||
checkCudaErrors(cuMemFree(d_buf));
|
||||
}
|
||||
static void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size)
|
||||
{
|
||||
checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size));
|
||||
}
|
||||
static void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size)
|
||||
{
|
||||
checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
|
||||
}
|
||||
#define deviceLaunch(func,params) \
|
||||
checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_L1)); \
|
||||
checkCudaErrors( \
|
||||
cuLaunchKernel( \
|
||||
(func), \
|
||||
1,1,1, \
|
||||
32, 1, 1, \
|
||||
0, NULL, (params), NULL \
|
||||
));
|
||||
|
||||
typedef CUdeviceptr devicePtr;
|
||||
|
||||
|
||||
/**************/
|
||||
#include <vector>
|
||||
static std::vector<char> readBinary(const char * filename, const bool print_size = false)
|
||||
{
|
||||
std::vector<char> buffer;
|
||||
FILE *fp = fopen(filename, "rb");
|
||||
if (!fp )
|
||||
{
|
||||
fprintf(stderr, "file %s not found\n", filename);
|
||||
assert(0);
|
||||
}
|
||||
fseek(fp, 0, SEEK_END);
|
||||
const unsigned long long size = ftell(fp); /*calc the size needed*/
|
||||
fseek(fp, 0, SEEK_SET);
|
||||
buffer.resize(size);
|
||||
|
||||
if (fp == NULL){ /*ERROR detection if file == empty*/
|
||||
fprintf(stderr, "Error: There was an Error reading the file %s \n",filename);
|
||||
exit(1);
|
||||
}
|
||||
else if (fread(&buffer[0], sizeof(char), size, fp) != size){ /* if count of read bytes != calculated size of .bin file -> ERROR*/
|
||||
fprintf(stderr, "Error: There was an Error reading the file %s \n", filename);
|
||||
exit(1);
|
||||
}
|
||||
if (print_size)
|
||||
fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size());
|
||||
return buffer;
|
||||
}
|
||||
|
||||
static double CUDALaunch(
|
||||
void **handlePtr,
|
||||
const char * func_name,
|
||||
void **func_args,
|
||||
const bool print_log = true,
|
||||
const int maxrregcount = 64,
|
||||
const char kernel_file[] = "__kernels.ptx",
|
||||
const char cudadevrt_lib[] = "libcudadevrt.a",
|
||||
const int log_size = 32768)
|
||||
{
|
||||
fprintf(stderr, " launching kernel: %s \n", func_name);
|
||||
const std::vector<char> module_str = readBinary(kernel_file, print_log);
|
||||
const char * module = &module_str[0];
|
||||
CUmodule cudaModule = loadModule(module, maxrregcount, cudadevrt_lib, log_size, print_log);
|
||||
CUfunction cudaFunction = getFunction(cudaModule, func_name);
|
||||
deviceLaunch(cudaFunction, func_args);
|
||||
checkCudaErrors(cuStreamSynchronize(0));
|
||||
unloadModule(cudaModule);
|
||||
return 0.0;
|
||||
}
|
||||
/******************************/
|
||||
|
||||
|
||||
extern "C" {
|
||||
// extern int width();
|
||||
int width() { return 32; }
|
||||
extern void f_v(float *result);
|
||||
extern void f_f(float *result, float *a);
|
||||
extern void f_fu(float *result, float *a, float b);
|
||||
extern void f_fi(float *result, float *a, int *b);
|
||||
extern void f_du(float *result, double *a, double b);
|
||||
extern void f_duf(float *result, double *a, float b);
|
||||
extern void f_di(float *result, double *a, int *b);
|
||||
extern void result(float *val);
|
||||
}
|
||||
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#define ALIGN
|
||||
#else
|
||||
#define ALIGN __attribute__((aligned(64)))
|
||||
#endif
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int w = width();
|
||||
assert(w <= 64);
|
||||
|
||||
float returned_result[64] ALIGN;
|
||||
float vfloat[64] ALIGN;
|
||||
double vdouble[64] ALIGN;
|
||||
int vint[64] ALIGN;
|
||||
int vint2[64] ALIGN;
|
||||
|
||||
const int device = 0;
|
||||
#if 0
|
||||
const bool verbose = true;
|
||||
#else
|
||||
const bool verbose = false;
|
||||
#endif
|
||||
|
||||
/*******************/
|
||||
createContext(device, verbose);
|
||||
/*******************/
|
||||
|
||||
devicePtr d_returned_result = deviceMalloc(64*sizeof(float));
|
||||
devicePtr d_vfloat = deviceMalloc(64*sizeof(float));
|
||||
devicePtr d_vdouble = deviceMalloc(64*sizeof(double));
|
||||
devicePtr d_vint = deviceMalloc(64*sizeof(int));
|
||||
devicePtr d_vint2 = deviceMalloc(64*sizeof(int));
|
||||
|
||||
|
||||
for (int i = 0; i < 64; ++i) {
|
||||
returned_result[i] = -1e20;
|
||||
vfloat[i] = i+1;
|
||||
vdouble[i] = i+1;
|
||||
vint[i] = 2*(i+1);
|
||||
vint2[i] = i+5;
|
||||
}
|
||||
|
||||
memcpyH2D(d_returned_result, returned_result, 64*sizeof(float));
|
||||
memcpyH2D(d_vfloat , vfloat, 64*sizeof(float));
|
||||
memcpyH2D(d_vdouble , vdouble, 64*sizeof(double));
|
||||
memcpyH2D(d_vint , vint, 64*sizeof(int));
|
||||
memcpyH2D(d_vint2 , vint2, 64*sizeof(int));
|
||||
|
||||
|
||||
float b = 5.;
|
||||
|
||||
const bool print_log = false;
|
||||
const int nreg = 64;
|
||||
#if (TEST_SIG == 0)
|
||||
void *args[] = {&d_returned_result};
|
||||
CUDALaunch(NULL, "f_v", args, print_log, nreg);
|
||||
#elif (TEST_SIG == 1)
|
||||
void *args[] = {&d_returned_result, &d_vfloat};
|
||||
CUDALaunch(NULL, "f_f", args, print_log, nreg);
|
||||
#elif (TEST_SIG == 2)
|
||||
void *args[] = {&d_returned_result, &d_vfloat, &b};
|
||||
CUDALaunch(NULL, "f_fu", args, print_log, nreg);
|
||||
#elif (TEST_SIG == 3)
|
||||
void *args[] = {&d_returned_result, &d_vfloat, &vint};
|
||||
CUDALaunch(NULL, "f_fi", args, print_log, nreg);
|
||||
#elif (TEST_SIG == 4)
|
||||
int num = 5;
|
||||
void *args[] = {&d_returned_result, &d_vdouble, &num};
|
||||
CUDALaunch(NULL, "f_du", args, print_log, nreg);
|
||||
#elif (TEST_SIG == 5)
|
||||
float num = 5.0f;
|
||||
void *args[] = {&d_returned_result, &d_vdouble, &num};
|
||||
CUDALaunch(NULL, "f_duf", args, print_log, nreg);
|
||||
#elif (TEST_SIG == 6)
|
||||
void *args[] = {&d_returned_result, &d_vdouble, &v_int2};
|
||||
CUDALaunch(NULL, "f_di", args, print_log, nreg);
|
||||
#else
|
||||
#error "Unknown or unset TEST_SIG value"
|
||||
#endif
|
||||
|
||||
float expected_result[64];
|
||||
|
||||
memset(expected_result, 0, 64*sizeof(float));
|
||||
devicePtr d_expected_result = deviceMalloc(64*sizeof(float));
|
||||
memcpyH2D(d_expected_result, expected_result, 64*sizeof(float));
|
||||
void *res_args[] = {&d_expected_result};
|
||||
CUDALaunch(NULL, "result", res_args, print_log, nreg);
|
||||
memcpyD2H(expected_result, d_expected_result, 64*sizeof(float));
|
||||
memcpyD2H(returned_result, d_returned_result, 64*sizeof(float));
|
||||
|
||||
deviceFree(d_returned_result);
|
||||
deviceFree(d_vfloat);
|
||||
deviceFree(d_vdouble);
|
||||
deviceFree(d_vint);
|
||||
deviceFree(d_vint2);
|
||||
deviceFree(d_expected_result);
|
||||
|
||||
/*******************/
|
||||
destroyContext();
|
||||
/*******************/
|
||||
|
||||
int errors = 0;
|
||||
for (int i = 0; i < w; ++i) {
|
||||
if (returned_result[i] != expected_result[i]) {
|
||||
#ifdef EXPECT_FAILURE
|
||||
// bingo, failed
|
||||
return 1;
|
||||
#else
|
||||
printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n",
|
||||
argv[0], i, returned_result[i], returned_result[i],
|
||||
expected_result[i], expected_result[i]);
|
||||
++errors;
|
||||
#endif // EXPECT_FAILURE
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef EXPECT_FAILURE
|
||||
// Don't expect to get here
|
||||
return 0;
|
||||
#else
|
||||
return errors > 0;
|
||||
#endif
|
||||
}
|
||||
133
test_static_nvptx.cpp
Normal file
133
test_static_nvptx.cpp
Normal file
@@ -0,0 +1,133 @@
|
||||
/*
|
||||
Copyright (c) 2010-2011, Intel Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of Intel Corporation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
|
||||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
||||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
||||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
#define ISPC_IS_WINDOWS
|
||||
#elif defined(__linux__)
|
||||
#define ISPC_IS_LINUX
|
||||
#elif defined(__APPLE__)
|
||||
#define ISPC_IS_APPLE
|
||||
#endif
|
||||
|
||||
#ifdef ISPC_IS_WINDOWS
|
||||
#include <windows.h>
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
#include <cstdio>
|
||||
#include <stdint.h>
|
||||
#ifdef ISPC_IS_LINUX
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
|
||||
#include "ispc_malloc.h"
|
||||
|
||||
#define N 32
|
||||
extern "C" {
|
||||
int width() { return N; }
|
||||
extern void f_v(float *result);
|
||||
extern void f_f(float *result, float *a);
|
||||
extern void f_fu(float *result, float *a, float b);
|
||||
extern void f_fi(float *result, float *a, int *b);
|
||||
extern void f_du(float *result, double *a, double b);
|
||||
extern void f_duf(float *result, double *a, float b);
|
||||
extern void f_di(float *result, double *a, int *b);
|
||||
extern void result(float *val);
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int w = width();
|
||||
assert(w <= N);
|
||||
|
||||
float *returned_result = new float[N*4];
|
||||
float *vfloat = new float[N*4];
|
||||
double *vdouble = new double[N*4];
|
||||
int *vint = new int[N*4];
|
||||
int *vint2 = new int[N*4];
|
||||
|
||||
for (int i = 0; i < N*4; ++i) {
|
||||
returned_result[i] = -1e20;
|
||||
vfloat[i] = i+1;
|
||||
vdouble[i] = i+1;
|
||||
vint[i] = 2*(i+1);
|
||||
vint2[i] = i+5;
|
||||
}
|
||||
|
||||
float b = 5.;
|
||||
|
||||
#if (TEST_SIG == 0)
|
||||
f_v(returned_result);
|
||||
#elif (TEST_SIG == 1)
|
||||
f_f(returned_result, vfloat);
|
||||
#elif (TEST_SIG == 2)
|
||||
f_fu(returned_result, vfloat, b);
|
||||
#elif (TEST_SIG == 3)
|
||||
f_fi(returned_result, vfloat, vint);
|
||||
#elif (TEST_SIG == 4)
|
||||
f_du(returned_result, vdouble, 5.);
|
||||
#elif (TEST_SIG == 5)
|
||||
f_duf(returned_result, vdouble, 5.f);
|
||||
#elif (TEST_SIG == 6)
|
||||
f_di(returned_result, vdouble, vint2);
|
||||
#else
|
||||
#error "Unknown or unset TEST_SIG value"
|
||||
#endif
|
||||
|
||||
float *expected_result = new float[N];
|
||||
memset(expected_result, 0, N*sizeof(float));
|
||||
result(expected_result);
|
||||
|
||||
int errors = 0;
|
||||
for (int i = 0; i < w; ++i) {
|
||||
if (returned_result[i] != expected_result[i])
|
||||
{
|
||||
#ifdef EXPECT_FAILURE
|
||||
// bingo, failed
|
||||
return 1;
|
||||
#else
|
||||
printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n",
|
||||
argv[0], i, returned_result[i], returned_result[i],
|
||||
expected_result[i], expected_result[i]);
|
||||
++errors;
|
||||
#endif // EXPECT_FAILURE
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef EXPECT_FAILURE
|
||||
// Don't expect to get here
|
||||
return 0;
|
||||
#else
|
||||
return errors > 0;
|
||||
#endif
|
||||
}
|
||||
@@ -5,7 +5,13 @@ export uniform int width() { return programCount; }
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
float a = aFOO[programIndex];
|
||||
assert(programCount <= 64);
|
||||
#ifdef __NVPTX__
|
||||
uniform float * uniform xarr = uniform new uniform float[70*70];
|
||||
uniform float (* uniform x)[70] = (uniform float (* uniform)[70])xarr;
|
||||
#define _SHMALLOC
|
||||
#else
|
||||
uniform float x[70][70];
|
||||
#endif
|
||||
for (uniform int i = 0; i < 70; ++i)
|
||||
for (uniform int j = 0; j < 70; ++j)
|
||||
x[i][j] = 2+b-5;
|
||||
@@ -16,6 +22,10 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
else
|
||||
x[b-1][a-1] = 1;
|
||||
RET[programIndex] = x[4][a];
|
||||
|
||||
#ifdef _SHMALLOC
|
||||
delete xarr;
|
||||
#endif
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
|
||||
@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
float b = (programCount == 1) ? 3 : broadcast(a, 2);
|
||||
float b = (programCount == 1) ? 4 : broadcast(a, 2);
|
||||
RET[programIndex] = b;
|
||||
}
|
||||
|
||||
|
||||
@@ -19,8 +19,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[0] = RET[4] = RET[8] = RET[12] = 2;
|
||||
RET[1] = RET[5] = RET[9] = RET[13] = 3;
|
||||
RET[2] = RET[6] = RET[10] = RET[14] = 5;
|
||||
RET[3] = RET[7] = RET[11] = RET[15] = 6;
|
||||
for (int i = 0; i < programCount; i += 4)
|
||||
{
|
||||
RET[i+0] = 2;
|
||||
RET[i+1] = 3;
|
||||
RET[i+2] = 5;
|
||||
RET[i+3] = 6;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,6 +18,9 @@ export void f_fu(uniform float RET[4], uniform float aFOO[4], uniform float b) {
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 3;
|
||||
RET[0] = RET[4] = RET[8] = RET[12] = 1;
|
||||
RET[3] = RET[7] = RET[11] = RET[15] = 29;
|
||||
for (int i = 0; i < programCount; i += 4)
|
||||
{
|
||||
RET[i+0] = 1;
|
||||
RET[i+3] = 29;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,6 +19,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 32;
|
||||
RET[2] = RET[6] = RET[10] = RET[14] = 38;
|
||||
RET[3] = RET[7] = RET[11] = RET[15] = 39;
|
||||
for (int i = 0; i < programCount; i += 4)
|
||||
{
|
||||
RET[i+2] = 38;
|
||||
RET[i+3] = 39;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,14 +4,14 @@ export uniform int width() { return programCount; }
|
||||
|
||||
|
||||
struct Foo {
|
||||
uniform float x[17];
|
||||
uniform float x[programCount+1];
|
||||
};
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
float a = aFOO[programIndex];
|
||||
uniform Foo foo;
|
||||
uniform int i;
|
||||
cfor (i = 0; i < 17; ++i)
|
||||
cfor (i = 0; i < programCount+1; ++i)
|
||||
foo.x[i] = i;
|
||||
|
||||
if ((int)a & 1)
|
||||
|
||||
@@ -4,9 +4,9 @@ export uniform int width() { return programCount; }
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
float a = aFOO[programIndex];
|
||||
uniform double udx[25][25];
|
||||
cfor (uniform int i = 0; i < 25; ++i)
|
||||
cfor (uniform int j = 0; j < 25; ++j)
|
||||
uniform double udx[programCount+1][programCount+1];
|
||||
cfor (uniform int i = 0; i < programCount+1; ++i)
|
||||
cfor (uniform int j = 0; j < programCount+1; ++j)
|
||||
udx[i][j] = 10*i+j;
|
||||
|
||||
int x = 1;
|
||||
|
||||
@@ -5,9 +5,9 @@ export uniform int width() { return programCount; }
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
float a = aFOO[programIndex];
|
||||
uniform float udx[20][20];
|
||||
cfor (uniform int i = 0; i < 20; ++i)
|
||||
cfor (uniform int j = 0; j < 20; ++j)
|
||||
uniform float udx[programCount+1][programCount+1];
|
||||
cfor (uniform int i = 0; i < programCount+1; ++i)
|
||||
cfor (uniform int j = 0; j < programCount+1; ++j)
|
||||
udx[i][j] = 100*i+j;
|
||||
|
||||
int x = 1;
|
||||
|
||||
@@ -4,19 +4,27 @@ export uniform int width() { return programCount; }
|
||||
|
||||
|
||||
struct Foo {
|
||||
uniform float udx[25][25];
|
||||
uniform float udx[32][32];
|
||||
};
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
float a = aFOO[programIndex];
|
||||
#ifndef __NVPTX__
|
||||
uniform Foo f[5];
|
||||
#else /* too much shared memory allocated, nvcc fails to link */
|
||||
uniform Foo * uniform f = uniform new uniform Foo[5];
|
||||
#define _UNMALLOC
|
||||
#endif
|
||||
cfor (uniform int i = 0; i < 5; ++i)
|
||||
cfor (uniform int j = 0; j < 25; ++j)
|
||||
cfor (uniform int k = 0; k < 25; ++k)
|
||||
cfor (uniform int j = 0; j < 32; ++j)
|
||||
cfor (uniform int k = 0; k < 32; ++k)
|
||||
f[i].udx[j][k] = 1000*i+100*j+k;
|
||||
|
||||
int x = 1;
|
||||
RET[programIndex] = f[x+1].udx[b-4][programIndex];
|
||||
#ifdef _UNMALLOC
|
||||
delete f;
|
||||
#endif
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) { RET[programIndex] = 2100 +programIndex; }
|
||||
|
||||
@@ -13,9 +13,9 @@ float func(Foo foo[], int offset) {
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
float a = aFOO[programIndex];
|
||||
Foo foo[17];
|
||||
Foo foo[programCount+1];
|
||||
uniform int i;
|
||||
cfor (i = 0; i < 17; ++i)
|
||||
cfor (i = 0; i < programCount+1; ++i)
|
||||
foo[i].f = i*a;
|
||||
RET[programIndex] = func(foo, (int)a);
|
||||
}
|
||||
|
||||
@@ -13,9 +13,9 @@ float func(Foo foo[], int offset) {
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
float a = aFOO[programIndex];
|
||||
Foo foo[17];
|
||||
Foo foo[programCount+1];
|
||||
uniform int i;
|
||||
cfor (i = 0; i < 17; ++i)
|
||||
cfor (i = 0; i < programCount+1; ++i)
|
||||
foo[i].f = i*a;
|
||||
RET[programIndex] = func(foo, (int)a);
|
||||
}
|
||||
|
||||
@@ -9,9 +9,9 @@ struct Foo {
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
float a = aFOO[programIndex];
|
||||
Foo foo[17];
|
||||
Foo foo[programCount+1];
|
||||
uniform int i;
|
||||
cfor (i = 0; i < 17; ++i)
|
||||
cfor (i = 0; i < programCount+1; ++i)
|
||||
foo[i].f = i*a;
|
||||
RET[programIndex] = foo[(int)a].f;
|
||||
}
|
||||
|
||||
@@ -10,9 +10,9 @@ struct Foo {
|
||||
export void f_fi(uniform float RET[], uniform float aFOO[], uniform int bFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
int b = bFOO[programIndex];
|
||||
varying Foo myFoo[17];
|
||||
varying Foo myFoo[programCount+1];
|
||||
uniform int i;
|
||||
cfor (i = 0; i < 17; ++i) {
|
||||
cfor (i = 0; i < programCount+1; ++i) {
|
||||
myFoo[i].x = i;
|
||||
myFoo[i].f = 2*i;
|
||||
}
|
||||
|
||||
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[0] = RET[4] = RET[8] = RET[12] = 1;
|
||||
RET[1] = RET[5] = RET[9] = RET[13] = 3;
|
||||
RET[2] = RET[6] = RET[10] = RET[14] = 3;
|
||||
RET[3] = RET[7] = RET[11] = RET[15] = 29;
|
||||
for (int i = 0; i < programCount; i += 4)
|
||||
{
|
||||
RET[i+0] = 1;
|
||||
RET[i+1] = 3;
|
||||
RET[i+2] = 3;
|
||||
RET[i+3] = 29;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[0] = RET[4] = RET[8] = RET[12] = 1;
|
||||
RET[1] = RET[5] = RET[9] = RET[13] = 3;
|
||||
RET[2] = RET[6] = RET[10] = RET[14] = 3;
|
||||
RET[3] = RET[7] = RET[11] = RET[15] = 29;
|
||||
for (int i = 0; i < programCount; i += 4)
|
||||
{
|
||||
RET[i+0] = 1;
|
||||
RET[i+1] = 3;
|
||||
RET[i+2] = 3;
|
||||
RET[i+3] = 29;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[0] = RET[4] = RET[8] = RET[12] = 1;
|
||||
RET[1] = RET[5] = RET[9] = RET[13] = 3;
|
||||
RET[2] = RET[6] = RET[10] = RET[14] = 3;
|
||||
RET[3] = RET[7] = RET[11] = RET[15] = 29;
|
||||
for (int i = 0; i < programCount; i += 4)
|
||||
{
|
||||
RET[i+0] = 1;
|
||||
RET[i+1] = 3;
|
||||
RET[i+2] = 3;
|
||||
RET[i+3] = 29;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,8 +19,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[0] = RET[4] = RET[8] = RET[12] = 2;
|
||||
RET[1] = RET[5] = RET[9] = RET[13] = 3;
|
||||
RET[2] = RET[6] = RET[10] = RET[14] = 5;
|
||||
RET[3] = RET[7] = RET[11] = RET[15] = 6;
|
||||
for (int i = 0; i < programCount; i += 4)
|
||||
{
|
||||
RET[i+0] = 2;
|
||||
RET[i+1] = 3;
|
||||
RET[i+2] = 5;
|
||||
RET[i+3] = 6;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[0] = RET[4] = RET[8] = RET[12] = 1;
|
||||
RET[1] = RET[5] = RET[9] = RET[13] = 3;
|
||||
RET[2] = RET[6] = RET[10] = RET[14] = 3;
|
||||
RET[3] = RET[7] = RET[11] = RET[15] = 29;
|
||||
for (int i = 0; i < programCount; i += 4)
|
||||
{
|
||||
RET[i+0] = 1;
|
||||
RET[i+1] = 3;
|
||||
RET[i+2] = 3;
|
||||
RET[i+3] = 29;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,8 +18,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[0] = RET[4] = RET[8] = RET[12] = 32;
|
||||
RET[1] = RET[5] = RET[9] = RET[13] = 32;
|
||||
RET[2] = RET[6] = RET[10] = RET[14] = 38;
|
||||
RET[3] = RET[7] = RET[11] = RET[15] = 39;
|
||||
for (int i = 0; i < programCount; i += 4)
|
||||
{
|
||||
RET[i+0] = 32;
|
||||
RET[i+1] = 32;
|
||||
RET[i+2] = 38;
|
||||
RET[i+3] = 39;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,9 +8,9 @@ struct Foo {
|
||||
};
|
||||
export void f_fi(uniform float RET[], uniform float a[], uniform int bFOO[]) {
|
||||
int b = bFOO[programIndex];
|
||||
uniform struct Foo myFoo[17];
|
||||
uniform struct Foo myFoo[programCount+1];
|
||||
uniform int i;
|
||||
cfor (i = 0; i < 17; ++i) {
|
||||
cfor (i = 0; i < programCount+1; ++i) {
|
||||
myFoo[i].x = i;
|
||||
myFoo[i].f = 2*i;
|
||||
}
|
||||
|
||||
@@ -6,7 +6,7 @@ export uniform int width() { return programCount; }
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
uniform int x = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2);
|
||||
static uniform int y = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2);
|
||||
const static uniform int y = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2);
|
||||
RET[programIndex] = (x == y) ? 1. : 0.;
|
||||
}
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ export uniform int width() { return programCount; }
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
uniform int x = (170 >> 4) % 5;
|
||||
static uniform int y = (170 >> 4) % 5;
|
||||
const static uniform int y = (170 >> 4) % 5;
|
||||
RET[programIndex] = (x == y) ? 1. : 0.;
|
||||
}
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ export uniform int width() { return programCount; }
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
uniform int x = (17 < 2) || (6 >= 5) && (20 >= 20);
|
||||
static uniform int y = (17 < 2) || (6 >= 5) && (20 >= 20);
|
||||
const static uniform int y = (17 < 2) || (6 >= 5) && (20 >= 20);
|
||||
RET[programIndex] = ((x!=0) == (y!=0)) ? 1. : 0.;
|
||||
}
|
||||
|
||||
|
||||
@@ -2,22 +2,23 @@
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
|
||||
#define N0 10
|
||||
#define N0 12
|
||||
#define N1 20
|
||||
#define N2 50
|
||||
static uniform float array[N2][N1][N0];
|
||||
|
||||
task void x(const float f) {
|
||||
task void x(const uniform float farray[]) {
|
||||
const float f = farray[programIndex];
|
||||
uniform int j;
|
||||
|
||||
assert(taskCount == (int32)N0*N1*N2);
|
||||
assert(taskCount0 == (int32)N0);
|
||||
assert(taskCount1 == (int32)N1);
|
||||
assert(taskCount2 == (int32)N2);
|
||||
assert(taskIndex == (int32)taskIndex0 + (int32)N0*(taskIndex1 +(int32) N1*taskIndex2));
|
||||
assert(taskIndex0 < (int32)N0);
|
||||
assert(taskIndex1 < (int32)N1);
|
||||
assert(taskIndex2 < (int32)N2);
|
||||
assert(taskCount == (uniform int32)N0*N1*N2);
|
||||
assert(taskCount0 == (uniform int32)N0);
|
||||
assert(taskCount1 == (uniform int32)N1);
|
||||
assert(taskCount2 == (uniform int32)N2);
|
||||
assert(taskIndex == (uniform int32)taskIndex0 + (uniform int32)N0*(taskIndex1 +(uniform int32) N1*taskIndex2));
|
||||
assert(taskIndex0 < (uniform int32)N0);
|
||||
assert(taskIndex1 < (uniform int32)N1);
|
||||
assert(taskIndex2 < (uniform int32)N2);
|
||||
|
||||
const uniform int i0 = taskIndex0;
|
||||
const uniform int i1 = taskIndex1;
|
||||
@@ -30,7 +31,7 @@ task void x(const float f) {
|
||||
array[i2][i1][i0] = i;
|
||||
}
|
||||
export void f_f(uniform float RET[], uniform float fFOO[]) {
|
||||
float f = fFOO[programIndex];
|
||||
uniform float * uniform f = fFOO;
|
||||
launch[N2][N1][N0] x(f);
|
||||
sync;
|
||||
RET[programIndex] = array[N2-1][N1-1][N0-1];
|
||||
@@ -38,5 +39,5 @@ export void f_f(uniform float RET[], uniform float fFOO[]) {
|
||||
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 9999.000000;
|
||||
RET[programIndex] = 11999.000000;
|
||||
}
|
||||
|
||||
@@ -2,12 +2,13 @@
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
|
||||
#define N0 10
|
||||
#define N0 12
|
||||
#define N1 20
|
||||
#define N2 50
|
||||
static uniform float array[N2][N1][N0];
|
||||
|
||||
task void x(const float f) {
|
||||
task void x(const uniform float farray[]) {
|
||||
const float f = farray[programIndex];
|
||||
uniform int j;
|
||||
|
||||
assert(taskCount == (int32)N0*N1*N2);
|
||||
@@ -30,13 +31,13 @@ task void x(const float f) {
|
||||
array[i2][i1][i0] = i;
|
||||
}
|
||||
export void f_f(uniform float RET[], uniform float fFOO[]) {
|
||||
float f = fFOO[programIndex];
|
||||
launch[N0,N1,N2] x(f);
|
||||
uniform float * uniform f = fFOO;
|
||||
launch[N2][N1][N0] x(f);
|
||||
sync;
|
||||
RET[programIndex] = array[N2-1][N1-1][N0-1];
|
||||
}
|
||||
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[programIndex] = 9999.000000;
|
||||
RET[programIndex] = 11999.000000;
|
||||
}
|
||||
|
||||
@@ -1,4 +1,9 @@
|
||||
#ifdef __NVPTX__
|
||||
uniform int _off[programCount];
|
||||
#define off _off[programIndex]
|
||||
#else /* global varying data types are not yet supported with "nvptx" target */
|
||||
int off;
|
||||
#endif
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
@@ -22,11 +27,11 @@ struct S operator/(struct S rr, struct S rv) {
|
||||
return c;
|
||||
}
|
||||
|
||||
struct S a;
|
||||
struct S b;
|
||||
struct S d;
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
struct S a;
|
||||
struct S b;
|
||||
struct S d;
|
||||
int T = programIndex;
|
||||
a.a = aFOO[programIndex];
|
||||
b.a = -aFOO[programIndex];
|
||||
|
||||
@@ -15,6 +15,16 @@ static void p(uniform float *uniform ptr) {
|
||||
}
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
#ifdef __NVPTX__ /* soa is converted to shared memory story for now, use smaller amount to check the test */
|
||||
soa<4> Point pts[10];
|
||||
for (uniform int i = 0; i < 40; ++i) {
|
||||
pts[i].x = b*i;
|
||||
pts[i].y[0] = 2*b*i;
|
||||
pts[i].y[1] = 2*b*i+1;
|
||||
pts[i].y[2] = 2*b*i+2;
|
||||
pts[i].z = 3*b*i;
|
||||
}
|
||||
#else
|
||||
soa<4> Point pts[30];
|
||||
for (uniform int i = 0; i < 120; ++i) {
|
||||
pts[i].x = b*i;
|
||||
@@ -23,6 +33,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
pts[i].y[2] = 2*b*i+2;
|
||||
pts[i].z = 3*b*i;
|
||||
}
|
||||
#endif
|
||||
|
||||
float a = aFOO[programIndex];
|
||||
a *= -1;
|
||||
|
||||
@@ -16,6 +16,16 @@ static void p(uniform float *uniform ptr) {
|
||||
}
|
||||
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
#ifdef __NVPTX__ /* soa is converted to shared memory story for now, use smaller amount to check the test */
|
||||
soa<4> Point pts[15];
|
||||
for (uniform int i = 0; i < 60; ++i) {
|
||||
pts[i].x = b*i;
|
||||
pts[i].y[0] = 2*b*i;
|
||||
pts[i].y[1] = 2*b*i+1;
|
||||
pts[i].y[2] = 2*b*i+2;
|
||||
pts[i].z = 3*b*i;
|
||||
}
|
||||
#else
|
||||
soa<4> Point pts[40];
|
||||
for (uniform int i = 0; i < 160; ++i) {
|
||||
pts[i].x = b*i;
|
||||
@@ -24,6 +34,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
pts[i].y[2] = 2*b*i+2;
|
||||
pts[i].z = 3*b*i;
|
||||
}
|
||||
#endif
|
||||
|
||||
float a = aFOO[programIndex];
|
||||
a *= -1;
|
||||
|
||||
@@ -6,6 +6,17 @@ export uniform int width() { return programCount; }
|
||||
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
float a = aFOO[programIndex];
|
||||
|
||||
#ifdef __NVPTX__ /* soa is converted to shared memory story for now, use smaller amount to check the test */
|
||||
soa<8> Point pts[4];
|
||||
//CO uniform Point pts[80];
|
||||
foreach (i = 0 ... 40) {
|
||||
pts[i].x = b*i;
|
||||
pts[i].y[0] = 2*b*i;
|
||||
pts[i].y[1] = 2*b*i+1;
|
||||
pts[i].y[2] = 2*b*i+2;
|
||||
pts[i].z = 3*b*i;
|
||||
}
|
||||
#else
|
||||
soa<8> Point pts[10];
|
||||
//CO uniform Point pts[80];
|
||||
foreach (i = 0 ... 80) {
|
||||
@@ -15,6 +26,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
pts[i].y[2] = 2*b*i+2;
|
||||
pts[i].z = 3*b*i;
|
||||
}
|
||||
#endif
|
||||
|
||||
assert(programCount < 80);
|
||||
RET[programIndex] = pts[programIndex].y[2];
|
||||
|
||||
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[0] = RET[4] = RET[8] = RET[12] = 1;
|
||||
RET[1] = RET[5] = RET[9] = RET[13] = 3;
|
||||
RET[2] = RET[6] = RET[10] = RET[14] = 3;
|
||||
RET[3] = RET[7] = RET[11] = RET[15] = 29;
|
||||
for (int i = 0; i < programCount; i += 4)
|
||||
{
|
||||
RET[i+0] = 1;
|
||||
RET[i+1] = 3;
|
||||
RET[i+2] = 3;
|
||||
RET[i+3] = 29;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[0] = RET[4] = RET[8] = RET[12] = 1;
|
||||
RET[1] = RET[5] = RET[9] = RET[13] = 3;
|
||||
RET[2] = RET[6] = RET[10] = RET[14] = 3;
|
||||
RET[3] = RET[7] = RET[11] = RET[15] = 29;
|
||||
for (int i = 0; i < programCount; i += 4)
|
||||
{
|
||||
RET[i+0] = 1;
|
||||
RET[i+1] = 3;
|
||||
RET[i+2] = 3;
|
||||
RET[i+3] = 29;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
|
||||
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[0] = RET[4] = RET[8] = RET[12] = 1;
|
||||
RET[1] = RET[5] = RET[9] = RET[13] = 3;
|
||||
RET[2] = RET[6] = RET[10] = RET[14] = 3;
|
||||
RET[3] = RET[7] = RET[11] = RET[15] = 29;
|
||||
for (int i = 0; i < programCount; i += 4)
|
||||
{
|
||||
RET[i+0] = 1;
|
||||
RET[i+1] = 3;
|
||||
RET[i+2] = 3;
|
||||
RET[i+3] = 29;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,8 +8,11 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
RET[0] = RET[4] = RET[8] = RET[12] = 0x0.0p+0;
|
||||
RET[1] = RET[5] = RET[9] = RET[13] = 0x1.62e43p-1;
|
||||
RET[2] = RET[6] = RET[10] = RET[14] = 0x1.193ea8p+0;
|
||||
RET[3] = RET[7] = RET[11] = RET[15] = 0x1.62e43p+0;
|
||||
for (int i = 0; i < programCount; i += 4)
|
||||
{
|
||||
RET[i+0] = 0x0.0p+0;
|
||||
RET[i+1] = 0x1.62e43p-1;
|
||||
RET[i+2] = 0x1.193ea8p+0;
|
||||
RET[i+3] = 0x1.62e43p+0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ export uniform int width() { return programCount; }
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
// calculation error 1e-6 is the same as in icc
|
||||
RET[programIndex] = (exp(-log(1/a)) - a) < 1e-6 ? 1 : 0;
|
||||
RET[programIndex] = (exp(-log(1/a)) - a)/a < 1e-6 ? 1 : 0;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[4]) {
|
||||
|
||||
@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
RET[programIndex] = round(a+.499999);
|
||||
RET[programIndex] = round(a+.49999);
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
|
||||
@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[]) {
|
||||
float a = aFOO[programIndex];
|
||||
RET[programIndex] = floor(a+.999999);
|
||||
RET[programIndex] = floor(a+.99999);
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
|
||||
34
tests/uniform-1.ispc
Normal file
34
tests/uniform-1.ispc
Normal file
@@ -0,0 +1,34 @@
|
||||
|
||||
export uniform int width() { return programCount; }
|
||||
|
||||
|
||||
task void f_f_task(uniform float RET[], uniform float aFOO[]) {
|
||||
uniform float val[programCount];
|
||||
for (uniform int i = 0; i < programCount; ++i)
|
||||
val[i] = 0;
|
||||
|
||||
foreach (i = 0 ... programCount)
|
||||
val[i] += aFOO[programCount*taskIndex + i] - 1;
|
||||
|
||||
uniform float sum = 0;
|
||||
for (uniform int i = 0; i < programCount; ++i)
|
||||
sum += val[i];
|
||||
|
||||
if (programIndex < 32/4)
|
||||
RET[programCount/4*taskIndex + programIndex] = sum;
|
||||
}
|
||||
|
||||
export void f_f(uniform float RET[], uniform float aFOO[])
|
||||
{
|
||||
launch[4] f_f_task(RET, aFOO);
|
||||
}
|
||||
task void result_task(uniform float RET[])
|
||||
{
|
||||
const uniform float ret = reduce_add(programIndex + programCount*taskIndex);
|
||||
if (programIndex < 32/4)
|
||||
RET[programCount/4*taskIndex + programIndex] = ret;
|
||||
}
|
||||
|
||||
export void result(uniform float RET[]) {
|
||||
launch[4] result_task(RET);
|
||||
}
|
||||
10
type.cpp
10
type.cpp
@@ -749,7 +749,7 @@ EnumType::Mangle() const {
|
||||
std::string ret;
|
||||
if (isConst) ret += "C";
|
||||
ret += variability.MangleString();
|
||||
ret += std::string("enum[") + name + std::string("]");
|
||||
ret += std::string("enum_5B_") + name + std::string("_5C_");
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -1420,7 +1420,7 @@ ArrayType::Mangle() const {
|
||||
sprintf(buf, "%d", numElements);
|
||||
else
|
||||
buf[0] = '\0';
|
||||
return s + "[" + buf + "]";
|
||||
return s + "_5B_" + buf + "_5C_";
|
||||
}
|
||||
|
||||
|
||||
@@ -2058,12 +2058,12 @@ lMangleStruct(Variability variability, bool isConst, const std::string &name) {
|
||||
Assert(variability != Variability::Unbound);
|
||||
|
||||
std::string ret;
|
||||
ret += "s[";
|
||||
ret += "s_5B_";
|
||||
if (isConst)
|
||||
ret += "_c_";
|
||||
ret += variability.MangleString();
|
||||
|
||||
ret += name + std::string("]");
|
||||
ret += name + std::string("_5C_");
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -3009,7 +3009,7 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const {
|
||||
llvmArgTypes.push_back(LLVMTypes::MaskType);
|
||||
|
||||
std::vector<llvm::Type *> callTypes;
|
||||
if (isTask) {
|
||||
if (isTask && g->target->getISA() != Target::NVPTX) {
|
||||
// Tasks take three arguments: a pointer to a struct that holds the
|
||||
// actual task arguments, the thread index, and the total number of
|
||||
// threads the tasks system has running. (Task arguments are
|
||||
|
||||
Reference in New Issue
Block a user