merged with nvptx

This commit is contained in:
Evghenii
2014-02-20 11:01:58 +01:00
68 changed files with 8181 additions and 470 deletions

View File

@@ -338,11 +338,13 @@ lSetInternalFunctions(llvm::Module *module) {
"__all",
"__any",
"__aos_to_soa3_float",
"__aos_to_soa3_float1",
"__aos_to_soa3_float16",
"__aos_to_soa3_float4",
"__aos_to_soa3_float8",
"__aos_to_soa3_int32",
"__aos_to_soa4_float",
"__aos_to_soa4_float1",
"__aos_to_soa4_float16",
"__aos_to_soa4_float4",
"__aos_to_soa4_float8",
@@ -351,10 +353,14 @@ lSetInternalFunctions(llvm::Module *module) {
"__atomic_add_int64_global",
"__atomic_add_uniform_int32_global",
"__atomic_add_uniform_int64_global",
"__atomic_add_varying_int32_global",
"__atomic_add_varying_int64_global",
"__atomic_and_int32_global",
"__atomic_and_int64_global",
"__atomic_and_uniform_int32_global",
"__atomic_and_uniform_int64_global",
"__atomic_and_varying_int32_global",
"__atomic_and_varying_int64_global",
"__atomic_compare_exchange_double_global",
"__atomic_compare_exchange_float_global",
"__atomic_compare_exchange_int32_global",
@@ -363,18 +369,30 @@ lSetInternalFunctions(llvm::Module *module) {
"__atomic_compare_exchange_uniform_float_global",
"__atomic_compare_exchange_uniform_int32_global",
"__atomic_compare_exchange_uniform_int64_global",
"__atomic_compare_exchange_varying_double_global",
"__atomic_compare_exchange_varying_float_global",
"__atomic_compare_exchange_varying_int32_global",
"__atomic_compare_exchange_varying_int64_global",
"__atomic_max_uniform_int32_global",
"__atomic_max_uniform_int64_global",
"__atomic_min_uniform_int32_global",
"__atomic_min_uniform_int64_global",
"__atomic_max_varying_int32_global",
"__atomic_max_varying_int64_global",
"__atomic_min_varying_int32_global",
"__atomic_min_varying_int64_global",
"__atomic_or_int32_global",
"__atomic_or_int64_global",
"__atomic_or_uniform_int32_global",
"__atomic_or_uniform_int64_global",
"__atomic_or_varying_int32_global",
"__atomic_or_varying_int64_global",
"__atomic_sub_int32_global",
"__atomic_sub_int64_global",
"__atomic_sub_uniform_int32_global",
"__atomic_sub_uniform_int64_global",
"__atomic_sub_varying_int32_global",
"__atomic_sub_varying_int64_global",
"__atomic_swap_double_global",
"__atomic_swap_float_global",
"__atomic_swap_int32_global",
@@ -383,14 +401,28 @@ lSetInternalFunctions(llvm::Module *module) {
"__atomic_swap_uniform_float_global",
"__atomic_swap_uniform_int32_global",
"__atomic_swap_uniform_int64_global",
"__atomic_swap_varying_double_global",
"__atomic_swap_varying_float_global",
"__atomic_swap_varying_int32_global",
"__atomic_swap_varying_int64_global",
"__atomic_umax_uniform_uint32_global",
"__atomic_umax_uniform_uint64_global",
"__atomic_umin_uniform_uint32_global",
"__atomic_umin_uniform_uint64_global",
"__atomic_umax_varying_uint32_global",
"__atomic_umax_varying_uint64_global",
"__atomic_umin_varying_uint32_global",
"__atomic_umin_varying_uint64_global",
"__atomic_xor_int32_global",
"__atomic_xor_int64_global",
"__atomic_xor_uniform_int32_global",
"__atomic_xor_uniform_int64_global",
"__atomic_xor_uniform_int32_global",
"__atomic_xor_uniform_int64_global",
"__atomic_xor_varying_int32_global",
"__atomic_xor_varying_int64_global",
"__atomic_xor_varying_int32_global",
"__atomic_xor_varying_int64_global",
"__broadcast_double",
"__broadcast_float",
"__broadcast_i16",
@@ -413,6 +445,7 @@ lSetInternalFunctions(llvm::Module *module) {
"__do_assert_uniform",
"__do_assert_varying",
"__do_print",
"__do_print_nvptx",
"__doublebits_uniform_int64",
"__doublebits_varying_int64",
"__exclusive_scan_add_double",
@@ -427,6 +460,8 @@ lSetInternalFunctions(llvm::Module *module) {
"__extract_int32",
"__extract_int64",
"__extract_int8",
"__extract_float",
"__extract_double",
"__fastmath",
"__float_to_half_uniform",
"__float_to_half_varying",
@@ -443,6 +478,8 @@ lSetInternalFunctions(llvm::Module *module) {
"__insert_int32",
"__insert_int64",
"__insert_int8",
"__insert_float",
"__insert_double",
"__intbits_uniform_double",
"__intbits_uniform_float",
"__intbits_varying_double",
@@ -479,6 +516,7 @@ lSetInternalFunctions(llvm::Module *module) {
"__min_varying_uint32",
"__min_varying_uint64",
"__movmsk",
"__movmsk_ptx",
"__new_uniform_32rt",
"__new_uniform_64rt",
"__new_varying32_32rt",
@@ -560,11 +598,13 @@ lSetInternalFunctions(llvm::Module *module) {
"__shuffle_i64",
"__shuffle_i8",
"__soa_to_aos3_float",
"__soa_to_aos3_float1",
"__soa_to_aos3_float16",
"__soa_to_aos3_float4",
"__soa_to_aos3_float8",
"__soa_to_aos3_int32",
"__soa_to_aos4_float",
"__soa_to_aos4_float1",
"__soa_to_aos4_float16",
"__soa_to_aos4_float4",
"__soa_to_aos4_float8",
@@ -622,6 +662,24 @@ lSetInternalFunctions(llvm::Module *module) {
"__vec4_add_int32",
"__vselect_float",
"__vselect_i32",
"__program_index",
"__program_count",
"__warp_index",
"__task_index0",
"__task_index1",
"__task_index2",
"__task_index",
"__task_count0",
"__task_count1",
"__task_count2",
"__task_count",
"__cvt_loc2gen",
"__cvt_loc2gen_var",
"__cvt_const2gen",
"__puts_nvptx",
"ISPCAlloc",
"ISPCLaunch",
"ISPCSync",
};
int count = sizeof(names) / sizeof(names[0]);
@@ -694,6 +752,7 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
g->target->getISA() != Target::NEON16 &&
g->target->getISA() != Target::NEON8)
#endif // !__arm__
if (g->target->getISA() != Target::NVPTX)
{
Assert(bcTriple.getArch() == llvm::Triple::UnknownArch ||
mTriple.getArch() == bcTriple.getArch());
@@ -855,7 +914,17 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
// Next, add the target's custom implementations of the various needed
// builtin functions (e.g. __masked_store_32(), etc).
switch (g->target->getISA()) {
case Target::NVPTX:
{
if (runtime32) {
fprintf(stderr, "Unforetunatly 32bit targets are supported at the moment .. \n");
assert(0);
}
else {
EXPORT_MODULE(builtins_bitcode_nvptx_64bit);
}
break;
};
#ifdef ISPC_ARM_ENABLED
case Target::NEON8: {
if (runtime32) {
@@ -1125,7 +1194,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
}
// define the 'programCount' builtin variable
lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable);
if (g->target->getISA() != Target::NVPTX)
{
lDefineConstantInt("programCount", g->target->getVectorWidth(), module, symbolTable);
}
else
{
lDefineConstantInt("programCount", 32, module, symbolTable);
}
// define the 'programIndex' builtin
lDefineProgramIndex(module, symbolTable);
@@ -1155,6 +1231,9 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
lDefineConstantInt("__have_native_rcpd", g->target->hasRcpd(),
module, symbolTable);
lDefineConstantInt("__is_nvptx_target", (int)(g->target->getISA() == Target::NVPTX),
module, symbolTable);
if (g->forceAlignment != -1) {
llvm::GlobalVariable *alignment = module->getGlobalVariable("memory_alignment", true);
alignment->setInitializer(LLVMInt32(g->forceAlignment));

View File

@@ -0,0 +1,130 @@
#include <cstdio>
#define PRINT_BUF_SIZE 4096
#define uint64_t unsigned long long
/* Device-side strlen(): count the bytes of |str| before the terminating NUL. */
static __device__ size_t d_strlen(const char *str)
{
    const char *end = str;
    while (*end != '\0')
        ++end;
    return (size_t)(end - str);
}
/* Device-side strncat(): append at most |n| bytes of |src| (plus a trailing
   NUL) to the end of |dest|; returns |dest|. */
static __device__ char* d_strncat(char *dest, const char *src, size_t n)
{
    char *tail = dest + d_strlen(dest);
    size_t copied = 0;
    while (copied < n && src[copied] != '\0') {
        tail[copied] = src[copied];
        ++copied;
    }
    tail[copied] = '\0';
    return dest;
}
/* Append the NUL-terminated string |str| at the buffer cursor |bufp|,
   advancing the cursor.  Jumps to the function-level `done' label when the
   PRINT_BUF_SIZE buffer fills up, so this macro may only be used inside a
   function that defines that label (see __do_print_nvptx below). */
#define APPEND(str) \
do { \
int offset = bufp - &printString[0]; \
*bufp = '\0'; \
d_strncat(bufp, str, PRINT_BUF_SIZE-offset); \
bufp += d_strlen(str); \
if (bufp >= &printString[PRINT_BUF_SIZE]) \
goto done; \
} while (0) /* eat semicolon */
/* Format the uniform (scalar) value at |ptr| with sprintf and append it;
   the trailing `break' ends the enclosing switch case. */
#define PRINT_SCALAR(fmt, type) \
sprintf(tmpBuf, fmt, *((type *)ptr)); \
APPEND(tmpBuf); \
break
/* Format a varying value as "[v0,v1,...]"; lanes whose mask bit is off
   print the "(( * )) " placeholder instead of a value. */
#define PRINT_VECTOR(fmt, type) \
*bufp++ = '['; \
if (bufp == &printString[PRINT_BUF_SIZE]) break; \
for (int i = 0; i < width; ++i) { \
/* only print the value if the current lane is executing */ \
type val0 = *((type*)ptr); \
/* NOTE(review): reads the same slot for every i (no [i] index) -- \
   presumably per-lane values arrive per-thread on NVPTX; confirm. */ \
type val = val0; \
if (mask & (1ull<<i)) \
sprintf(tmpBuf, fmt, val); \
else \
sprintf(tmpBuf, "(( * )) "); \
APPEND(tmpBuf); \
*bufp++ = (i != width-1 ? ',' : ']'); \
} \
break
/** Device-side implementation of ispc's print() for the NVPTX target.

    @param format  Format string; each '%' marks the position of the next
                   argument (no printf-style conversion characters).
    @param types   Encoding string, one character per argument: lower case =
                   uniform (scalar), upper case = varying; the letter selects
                   the type ('i' int, 'u' unsigned, 'f' float, 'l' int64,
                   'v' uint64, 'd' double, 'p' pointer, 'b' bool).
    @param width   Number of program instances (lanes) in a varying value.
    @param mask    Execution mask; lane i is printed only when bit i is set,
                   otherwise a placeholder is printed.
    @param args    Array of pointers to the argument values.

    Fix vs. original: removed a stray line-continuation backslash after
    `bool val = val0;` which silently spliced the following statement onto
    the same logical line (and would break if a // comment were added there).
 */
extern "C"
__device__ void __do_print_nvptx(const char *format, const char *types, int width, uint64_t mask,
                                 void **args) {
    char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
    char *bufp = &printString[0];
    char tmpBuf[256];
    const char trueBuf[] = "true";
    const char falseBuf[] = "false";
    int argCount = 0;
    while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
        // Format strings are just single percent signs.
        if (*format != '%') {
            *bufp++ = *format;
        }
        else {
            if (*types) {
                void *ptr = args[argCount++];
                // Based on the encoding in the types string, cast the
                // value appropriately and print it with a reasonable
                // printf() formatting string.
                switch (*types) {
                case 'b': {
                    const char *tmpBuf1 = *((bool *)ptr) ? trueBuf : falseBuf;
                    APPEND(tmpBuf1);
                    break;
                }
                case 'B': {
                    *bufp++ = '[';
                    if (bufp == &printString[PRINT_BUF_SIZE])
                        break;
                    for (int i = 0; i < width; ++i) {
                        // NOTE(review): reads the same slot for every lane
                        // (no [i] index) -- presumably per-lane values arrive
                        // per-thread on NVPTX; confirm against the host path.
                        bool val0 = *((bool*)ptr);
                        bool val = val0;
                        if (mask & (1ull << i)) {
                            const char *tmpBuf1 = val ? trueBuf : falseBuf;
                            APPEND(tmpBuf1);
                        }
                        else
                            APPEND("_________");
                        *bufp++ = (i != width-1) ? ',' : ']';
                    }
                    break;
                }
                case 'i': PRINT_SCALAR("%d", int);
                case 'I': PRINT_VECTOR("%d", int);
                case 'u': PRINT_SCALAR("%u", unsigned int);
                case 'U': PRINT_VECTOR("%u", unsigned int);
                case 'f': PRINT_SCALAR("%f", float);
                case 'F': PRINT_VECTOR("%f", float);
                case 'l': PRINT_SCALAR("%lld", long long);
                case 'L': PRINT_VECTOR("%lld", long long);
                case 'v': PRINT_SCALAR("%llu", unsigned long long);
                case 'V': PRINT_VECTOR("%llu", unsigned long long);
                case 'd': PRINT_SCALAR("%f", double);
                case 'D': PRINT_VECTOR("%f", double);
                case 'p': PRINT_SCALAR("%p", void *);
                case 'P': PRINT_VECTOR("%p", void *);
                default:
                    APPEND("UNKNOWN TYPE ");
                    *bufp++ = *types;
                }
                ++types;
            }
        }
        ++format;
    }
 done:
    *bufp = '\n'; bufp++;
    *bufp = '\0';
}

View File

@@ -185,6 +185,81 @@ void __do_print(const char *format, const char *types, int width, uint64_t mask,
fflush(stdout);
}
/* Host-side print routine for the PTX target only.  The full formatting
   path below is disabled (#if 0); the active path just reports that printing
   is unavailable.  Fix vs. original: corrected the grammar of the emitted
   message ("not support" -> "not supported"). */
int __puts_nvptx(const char *);
void __do_print_nvptx(const char *format, const char *types, int width, uint64_t mask,
                      void **args) {
#if 0
    char printString[PRINT_BUF_SIZE+1]; // +1 for trailing NUL
    char *bufp = &printString[0];
    char tmpBuf[256];
    int argCount = 0;
    while (*format && bufp < &printString[PRINT_BUF_SIZE]) {
        // Format strings are just single percent signs.
        if (*format != '%') {
            *bufp++ = *format;
        }
        else {
            if (*types) {
                void *ptr = args[argCount++];
                // Based on the encoding in the types string, cast the
                // value appropriately and print it with a reasonable
                // printf() formatting string.
                switch (*types) {
                case 'b': {
                    sprintf(tmpBuf, "%s", *((Bool *)ptr) ? "true" : "false");
                    APPEND(tmpBuf);
                    break;
                }
                case 'B': {
                    *bufp++ = '[';
                    if (bufp == &printString[PRINT_BUF_SIZE])
                        break;
                    for (int i = 0; i < width; ++i) {
                        if (mask & (1ull << i)) {
                            sprintf(tmpBuf, "%s", ((Bool *)ptr)[i] ? "true" : "false");
                            APPEND(tmpBuf);
                        }
                        else
                            APPEND("_________");
                        *bufp++ = (i != width-1) ? ',' : ']';
                    }
                    break;
                }
                case 'i': PRINT_SCALAR("%d", int);
                case 'I': PRINT_VECTOR("%d", int);
                case 'u': PRINT_SCALAR("%u", unsigned int);
                case 'U': PRINT_VECTOR("%u", unsigned int);
                case 'f': PRINT_SCALAR("%f", float);
                case 'F': PRINT_VECTOR("%f", float);
                case 'l': PRINT_SCALAR("%lld", long long);
                case 'L': PRINT_VECTOR("%lld", long long);
                case 'v': PRINT_SCALAR("%llu", unsigned long long);
                case 'V': PRINT_VECTOR("%llu", unsigned long long);
                case 'd': PRINT_SCALAR("%f", double);
                case 'D': PRINT_VECTOR("%f", double);
                case 'p': PRINT_SCALAR("%p", void *);
                case 'P': PRINT_VECTOR("%p", void *);
                default:
                    APPEND("UNKNOWN TYPE ");
                    *bufp++ = *types;
                }
                ++types;
            }
        }
        ++format;
    }
 done:
    *bufp = '\n'; bufp++;
    *bufp = '\0';
    __puts_nvptx(printString);
#else
    // Host-side formatting for NVPTX is not implemented; emit a fixed
    // diagnostic instead of partial output.
    __puts_nvptx("---nvptx printing is not supported---\n");
#endif
}
int __num_cores() {
#if defined(_MSC_VER) || defined(__MINGW32__)

View File

@@ -288,4 +288,5 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
;; int8/int16 builtins
define_avgs()
declare_nvptx()

View File

@@ -10,6 +10,7 @@ packed_load_and_store()
scans()
int64minmax()
aossoa()
declare_nvptx()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store

View File

@@ -392,4 +392,4 @@ declare void @__prefetch_read_uniform_nt(i8 * nocapture) nounwind
;; int8/int16 builtins
define_avgs()
declare_nvptx()

View File

@@ -344,3 +344,4 @@ packed_load_and_store(4)
;; prefetch
define_prefetches()
declare_nvptx()

2235
builtins/target-nvptx.ll Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -274,3 +274,4 @@ define i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
define_avgs()
declare_nvptx()

View File

@@ -278,3 +278,5 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
%call = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %call
}
declare_nvptx()

3417
builtins/util-nvptx.m4 Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -4541,3 +4541,60 @@ define(`rcpd_decl', `
declare double @__rcp_uniform_double(double)
declare <WIDTH x double> @__rcp_varying_double(<WIDTH x double>)
')
;; declare_nvptx() -- emits declarations of the NVPTX-specific builtins that
;; are implemented in the NVPTX builtins library: program/warp/task index and
;; count queries, pointer conversions out of the local (addrspace 3) and
;; constant (addrspace 4) address spaces into the generic one, and the
;; PTX-specific movmsk that turns an i1 mask vector into an i64 bitmask.
define(`declare_nvptx',
`
declare i32 @__program_index() nounwind readnone alwaysinline
declare i32 @__program_count() nounwind readnone alwaysinline
declare i32 @__warp_index() nounwind readnone alwaysinline
declare i32 @__task_index0() nounwind readnone alwaysinline
declare i32 @__task_index1() nounwind readnone alwaysinline
declare i32 @__task_index2() nounwind readnone alwaysinline
declare i32 @__task_index() nounwind readnone alwaysinline
declare i32 @__task_count0() nounwind readnone alwaysinline
declare i32 @__task_count1() nounwind readnone alwaysinline
declare i32 @__task_count2() nounwind readnone alwaysinline
declare i32 @__task_count() nounwind readnone alwaysinline
declare i64* @__cvt_loc2gen(i64 addrspace(3)*) nounwind readnone alwaysinline
declare i64* @__cvt_const2gen(i64 addrspace(4)*) nounwind readnone alwaysinline
declare i64* @__cvt_loc2gen_var(i64 addrspace(3)*) nounwind readnone alwaysinline
declare i64 @__movmsk_ptx(<WIDTH x i1>) nounwind readnone alwaysinline;
')
;; global_atomic_varying(width, op, LLVM type, type name) -- declares the
;; varying global-memory atomic builtin
;;   __atomic_<op>_varying_<type name>_global(ptrs, vals, mask)
;; taking a vector of 64-bit pointers, a vector of operands, and a mask.
define(`global_atomic_varying',`
declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline
')
;; global_atomic_cas_varying(width, op, LLVM type, type name) -- same, but for
;; compare-and-swap, which takes an extra comparand vector.
define(`global_atomic_cas_varying',`
declare <$1 x $3> @__atomic_$2_varying_$4_global(<$1 x i64> %ptr, <$1 x $3> %cmp, <$1 x $3> %val, <$1 x MASK> %maskv) nounwind alwaysinline
')
;; Compare-and-swap for the four supported element types.
global_atomic_cas_varying(WIDTH, compare_exchange, i32, int32)
global_atomic_cas_varying(WIDTH, compare_exchange, i64, int64)
global_atomic_cas_varying(WIDTH, compare_exchange, float, float)
global_atomic_cas_varying(WIDTH, compare_exchange, double, double)
;; Swap for the four supported element types.
global_atomic_varying(WIDTH, swap, i32, int32)
global_atomic_varying(WIDTH, swap, i64, int64)
global_atomic_varying(WIDTH, swap, float, float)
global_atomic_varying(WIDTH, swap, double, double)
;; Integer read-modify-write ops, 32-bit then 64-bit.
global_atomic_varying(WIDTH, add, i32, int32)
global_atomic_varying(WIDTH, sub, i32, int32)
global_atomic_varying(WIDTH, and, i32, int32)
global_atomic_varying(WIDTH, or, i32, int32)
global_atomic_varying(WIDTH, xor, i32, int32)
global_atomic_varying(WIDTH, min, i32, int32)
global_atomic_varying(WIDTH, max, i32, int32)
global_atomic_varying(WIDTH, umin, i32, uint32)
global_atomic_varying(WIDTH, umax, i32, uint32)
global_atomic_varying(WIDTH, add, i64, int64)
global_atomic_varying(WIDTH, sub, i64, int64)
global_atomic_varying(WIDTH, and, i64, int64)
global_atomic_varying(WIDTH, or, i64, int64)
global_atomic_varying(WIDTH, xor, i64, int64)
global_atomic_varying(WIDTH, min, i64, int64)
global_atomic_varying(WIDTH, max, i64, int64)
global_atomic_varying(WIDTH, umin, i64, uint64)
global_atomic_varying(WIDTH, umax, i64, uint64)

338
ctx.cpp
View File

@@ -57,6 +57,8 @@
#include <llvm/IR/Instructions.h>
#include <llvm/IR/DerivedTypes.h>
#endif
#include <llvm/Support/raw_ostream.h>
#include <llvm/Support/FormattedStream.h>
/** This is a small utility structure that records information related to one
level of nested control flow. It's mostly used in correctly restoring
@@ -1371,29 +1373,97 @@ FunctionEmitContext::None(llvm::Value *mask) {
llvm::Value *
FunctionEmitContext::LaneMask(llvm::Value *v)
{
    // Fixes vs. the merged text: removed the interleaved old copy of this
    // function that the merge left behind, and renamed the local away from
    // the reserved identifier `__movmsk` (double-underscore names are
    // reserved for the implementation).
#if 1 /* this makes mandelbrot example slower, why ?!? */
    // NVPTX uses its own movmsk builtin; everything else uses __movmsk.
    const char *movmskName =
        g->target->getISA() == Target::NVPTX ? "__movmsk_ptx" : "__movmsk";
#else
    const char *movmskName = "__movmsk";
#endif
    // Call the target-dependent movmsk function to turn the vector mask
    // into an i64 value
    std::vector<Symbol *> mm;
    m->symbolTable->LookupFunction(movmskName, &mm);
    if (g->target->getMaskBitCount() == 1)
        AssertPos(currentPos, mm.size() == 1);
    else
        // There should be one with signed int signature, one unsigned int.
        AssertPos(currentPos, mm.size() == 2);
    // We can actually call either one, since both are i32s as far as
    // LLVM's type system is concerned...
    llvm::Function *fmm = mm[0]->function;
    return CallInst(fmm, NULL, v, LLVMGetName(v, "_movmsk"));
}
// Append the "__insert"/"__extract" suffix that corresponds to the element
// type of |vector| ("_int8" ... "_double") to |funcName|; returns false
// (leaving |funcName| untouched) for unsupported vector types.
bool lAppendInsertExtractName(llvm::Value *vector, std::string &funcName)
{
    llvm::Type *vecType = vector->getType();
    const char *suffix = NULL;
    if (vecType == LLVMTypes::Int8VectorType)
        suffix = "_int8";
    else if (vecType == LLVMTypes::Int16VectorType)
        suffix = "_int16";
    else if (vecType == LLVMTypes::Int32VectorType)
        suffix = "_int32";
    else if (vecType == LLVMTypes::Int64VectorType)
        suffix = "_int64";
    else if (vecType == LLVMTypes::FloatVectorType)
        suffix = "_float";
    else if (vecType == LLVMTypes::DoubleVectorType)
        suffix = "_double";

    if (suffix == NULL)
        return false;
    funcName += suffix;
    return true;
}
llvm::Value*
FunctionEmitContext::Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar)
{
    // Emit a call to the __insert_<type> builtin that writes |scalar| into
    // element |lane| of |vector| and returns the resulting vector.
    std::string funcName = "__insert";
    // Bug fix: the suffix computation used to live inside assert(), whose
    // argument is not evaluated when NDEBUG is defined -- leaving funcName
    // as plain "__insert" in release builds.  Evaluate it unconditionally.
    bool haveSuffix = lAppendInsertExtractName(vector, funcName);
    assert(haveSuffix);
    (void)haveSuffix;  // silence unused warning under NDEBUG
    assert(lane->getType() == LLVMTypes::Int32Type);

    llvm::Function *func = m->module->getFunction(funcName.c_str());
    assert(func != NULL);

    std::vector<llvm::Value *> args;
    args.push_back(vector);
    args.push_back(lane);
    args.push_back(scalar);
    llvm::Value *ret = llvm::CallInst::Create(func, args, LLVMGetName(vector, funcName.c_str()), GetCurrentBasicBlock());
    return ret;
}
llvm::Value*
FunctionEmitContext::Extract(llvm::Value *vector, llvm::Value *lane)
{
    // Emit a call to the __extract_<type> builtin that reads element |lane|
    // of |vector| and returns it as a scalar.
    std::string funcName = "__extract";
    // Bug fix: the suffix computation used to live inside assert(), whose
    // argument is not evaluated when NDEBUG is defined -- leaving funcName
    // as plain "__extract" in release builds.  Evaluate it unconditionally.
    bool haveSuffix = lAppendInsertExtractName(vector, funcName);
    assert(haveSuffix);
    (void)haveSuffix;  // silence unused warning under NDEBUG
    assert(lane->getType() == LLVMTypes::Int32Type);

    llvm::Function *func = m->module->getFunction(funcName.c_str());
    assert(func != NULL);

    std::vector<llvm::Value *> args;
    args.push_back(vector);
    args.push_back(lane);
    llvm::Value *ret = llvm::CallInst::Create(func, args, LLVMGetName(vector, funcName.c_str()), GetCurrentBasicBlock());
    return ret;
}
llvm::Value *
FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
if (g->target->getISA() == Target::NVPTX)
{
// Compare the two masks to get a vector of i1s
llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
v1, v2, "v1==v2");
return ExtractInst(cmp, 0); /* this works without calling All(..) in PTX. Why ?!? */
}
else
{
#if 0
// Compare the two masks to get a vector of i1s
llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
v1, v2, "v1==v2");
v1, v2, "v1==v2");
// Turn that into a bool vector type (often i32s)
cmp = I1VecToBoolVec(cmp);
// And see if it's all on
@@ -1402,22 +1472,34 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
llvm::Value *mm1 = LaneMask(v1);
llvm::Value *mm2 = LaneMask(v2);
return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mm1, mm2,
LLVMGetName("equal", v1, v2));
LLVMGetName("equal", v1, v2));
#endif
}
}
llvm::Value *
FunctionEmitContext::ProgramIndexVector(bool is32bits) {
    // Build the constant lane-index vector <0, 1, ..., vectorWidth-1>.
    // Fix vs. the merged text: the loop body statements appeared twice
    // (interleaved old/new diff lines), which would redeclare C.
    llvm::SmallVector<llvm::Constant*, 16> array;
    for (int i = 0; i < g->target->getVectorWidth(); ++i) {
        // Lane index as a 32- or 64-bit constant, per the caller's request.
        llvm::Constant *C = is32bits ? LLVMInt32(i) : LLVMInt64(i);
        array.push_back(C);
    }
    llvm::Constant *index = llvm::ConstantVector::get(array);
    return index;
}
llvm::Value *
FunctionEmitContext::ProgramIndexVectorPTX(bool is32bits) {
    // On NVPTX the lane index comes from the __program_index() builtin;
    // splice its value into lane 0 of an otherwise-undef i32 vector.
    // Fixes vs. original: local renamed away from the reserved identifier
    // `__program_index`, and the `Int64VectandType` typo in the disabled
    // widening path corrected to `Int64VectorType`.
    llvm::Function *funcProgramIndex = m->module->getFunction("__program_index");
    llvm::Value *programIndex = CallInst(funcProgramIndex, NULL, std::vector<llvm::Value*>(), "foreach__program_indexS");
    llvm::Value *index = InsertInst(llvm::UndefValue::get(LLVMTypes::Int32VectorType), programIndex, 0, "foreach__program_indexV");
#if 0
    if (!is32bits)
        index = ZExtInst(index, LLVMTypes::Int64VectorType);
#endif
    return index;
}
llvm::Value *
@@ -1830,6 +1912,7 @@ FunctionEmitContext::PtrToIntInst(llvm::Value *value, const char *name) {
if (name == NULL)
name = LLVMGetName(value, "_ptr2int");
llvm::Type *type = LLVMTypes::PointerIntType;
llvm::Instruction *inst = new llvm::PtrToIntInst(value, type, name, bblock);
AddDebugPos(inst);
@@ -3523,98 +3606,199 @@ llvm::Value *
FunctionEmitContext::LaunchInst(llvm::Value *callee,
                                std::vector<llvm::Value *> &argVals,
                                llvm::Value *launchCount[3]) {
    // Emit the code for a `launch' statement: allocate an argument block via
    // ISPCAlloc, copy the arguments (and optionally the mask) into it, and
    // call ISPCLaunch with the task function pointer and launch counts.
    // Fix vs. the merged text: the non-NVPTX branch contained interleaved
    // old/new diff lines (duplicate statements); reconstructed clean.
    if (g->target->getISA() != Target::NVPTX)
    {
        if (callee == NULL) {
            AssertPos(currentPos, m->errorCount > 0);
            return NULL;
        }
        launchedTasks = true;

        AssertPos(currentPos, llvm::isa<llvm::Function>(callee));
        llvm::Type *argType =
            (llvm::dyn_cast<llvm::Function>(callee))->arg_begin()->getType();
        AssertPos(currentPos, llvm::PointerType::classof(argType));
        llvm::PointerType *pt =
            llvm::dyn_cast<llvm::PointerType>(argType);
        AssertPos(currentPos, llvm::StructType::classof(pt->getElementType()));
        llvm::StructType *argStructType =
            static_cast<llvm::StructType *>(pt->getElementType());

        llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
        AssertPos(currentPos, falloc != NULL);
        llvm::Value *structSize = g->target->SizeOf(argStructType, bblock);
        if (structSize->getType() != LLVMTypes::Int64Type)
            // ISPCAlloc expects the size as an uint64_t, but on 32-bit
            // targets, SizeOf returns a 32-bit value
            structSize = ZExtInst(structSize, LLVMTypes::Int64Type,
                                  "struct_size_to_64");
        int align = 4 * RoundUpPow2(g->target->getNativeVectorWidth());

        std::vector<llvm::Value *> allocArgs;
        allocArgs.push_back(launchGroupHandlePtr);
        allocArgs.push_back(structSize);
        allocArgs.push_back(LLVMInt32(align));
        llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr");
        llvm::Value *argmem = BitCastInst(voidmem, pt);

        // Copy the values of the parameters into the appropriate place in
        // the argument block
        for (unsigned int i = 0; i < argVals.size(); ++i) {
            llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg");
            // don't need to do masked store here, I think
            StoreInst(argVals[i], ptr);
        }

        if (argStructType->getNumElements() == argVals.size() + 1) {
            // copy in the mask
            llvm::Value *mask = GetFullMask();
            llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL,
                                                "funarg_mask");
            StoreInst(mask, ptr);
        }

        // And emit the call to the user-supplied task launch function,
        // passing a pointer to the task function being called and a pointer
        // to the argument block we just filled in
        llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
        llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
        AssertPos(currentPos, flaunch != NULL);
        std::vector<llvm::Value *> args;
        args.push_back(launchGroupHandlePtr);
        args.push_back(fptr);
        args.push_back(voidmem);
        args.push_back(launchCount[0]);
        args.push_back(launchCount[1]);
        args.push_back(launchCount[2]);
        return CallInst(flaunch, NULL, args, "");
    }
    else /* NVPTX */
    {
        if (callee == NULL) {
            AssertPos(currentPos, m->errorCount > 0);
            return NULL;
        }
        launchedTasks = true;

        // Build the argument-struct type from the callee's parameter types.
        AssertPos(currentPos, llvm::isa<llvm::Function>(callee));
        std::vector<llvm::Type*> argTypes;
        llvm::Function *F = llvm::dyn_cast<llvm::Function>(callee);
        const unsigned int nArgs = F->arg_size();
        llvm::Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
        for (; I != E; ++I)
            argTypes.push_back(I->getType());
        llvm::Type *st = llvm::StructType::get(*g->ctx, argTypes);
        llvm::StructType *argStructType = static_cast<llvm::StructType *>(st);
        llvm::Value *structSize = g->target->SizeOf(argStructType, bblock);
        if (structSize->getType() != LLVMTypes::Int64Type)
            structSize = ZExtInst(structSize, LLVMTypes::Int64Type,
                                  "struct_size_to_64");
        const int align = 8;

        llvm::Function *falloc = m->module->getFunction("ISPCAlloc");
        AssertPos(currentPos, falloc != NULL);
        std::vector<llvm::Value *> allocArgs;
        allocArgs.push_back(launchGroupHandlePtr);
        allocArgs.push_back(structSize);
        allocArgs.push_back(LLVMInt32(align));
        llvm::Value *voidmem = CallInst(falloc, NULL, allocArgs, "args_ptr");
        llvm::Value *voidi64 = PtrToIntInst(voidmem, "args_i64");
        llvm::BasicBlock* if_true = CreateBasicBlock("if_true");
        llvm::BasicBlock* if_false = CreateBasicBlock("if_false");

        /* check if the pointer returned by ISPCAlloc is not NULL
         * --------------
         * this is a workaround for not checking the value of programIndex
         * because ISPCAlloc will return NULL pointer for all programIndex > 0
         * of course, if ISPCAlloc fails to get parameter buffer, the pointer
         * for programIndex = 0 will also be NULL
         * This check must be added, and also rewrite the code to make it less opaque
         */
        llvm::Value* cmp1 = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, voidi64, LLVMInt64(0), "cmp1");
        BranchInst(if_true, if_false, cmp1);

        /**********************/
        bblock = if_true;
        // label_if_then block: store the arguments into the allocated block.
        llvm::Type *pt = llvm::PointerType::getUnqual(st);
        llvm::Value *argmem = BitCastInst(voidmem, pt);
        for (unsigned int i = 0; i < argVals.size(); ++i)
        {
            llvm::Value *ptr = AddElementOffset(argmem, i, NULL, "funarg");
            // don't need to do masked store here, I think
            StoreInst(argVals[i], ptr);
        }
        if (nArgs == argVals.size() + 1) {
            // copy in the mask
            llvm::Value *mask = GetFullMask();
            llvm::Value *ptr = AddElementOffset(argmem, argVals.size(), NULL,
                                                "funarg_mask");
            StoreInst(mask, ptr);
        }
        BranchInst(if_false);

        /**********************/
        bblock = if_false;
        llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType);
        llvm::Function *flaunch = m->module->getFunction("ISPCLaunch");
        AssertPos(currentPos, flaunch != NULL);
        std::vector<llvm::Value *> args;
        args.push_back(launchGroupHandlePtr);
        args.push_back(fptr);
        args.push_back(voidmem);
        args.push_back(launchCount[0]);
        args.push_back(launchCount[1]);
        args.push_back(launchCount[2]);
        llvm::Value *ret = CallInst(flaunch, NULL, args, "");
        return ret;
    }
}
void
FunctionEmitContext::SyncInst() {
llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr);
llvm::Value *nullPtrValue =
if (g->target->getISA() != Target::NVPTX)
{
llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr);
llvm::Value *nullPtrValue =
llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
llvm::Value *nonNull = CmpInst(llvm::Instruction::ICmp,
llvm::CmpInst::ICMP_NE,
launchGroupHandle, nullPtrValue);
llvm::BasicBlock *bSync = CreateBasicBlock("call_sync");
llvm::BasicBlock *bPostSync = CreateBasicBlock("post_sync");
BranchInst(bSync, bPostSync, nonNull);
llvm::Value *nonNull = CmpInst(llvm::Instruction::ICmp,
llvm::CmpInst::ICMP_NE,
launchGroupHandle, nullPtrValue);
llvm::BasicBlock *bSync = CreateBasicBlock("call_sync");
llvm::BasicBlock *bPostSync = CreateBasicBlock("post_sync");
BranchInst(bSync, bPostSync, nonNull);
SetCurrentBasicBlock(bSync);
llvm::Function *fsync = m->module->getFunction("ISPCSync");
if (fsync == NULL)
SetCurrentBasicBlock(bSync);
llvm::Function *fsync = m->module->getFunction("ISPCSync");
if (fsync == NULL)
FATAL("Couldn't find ISPCSync declaration?!");
CallInst(fsync, NULL, launchGroupHandle, "");
CallInst(fsync, NULL, launchGroupHandle, "");
// zero out the handle so that if ISPCLaunch is called again in this
// function, it knows it's starting out from scratch
StoreInst(nullPtrValue, launchGroupHandlePtr);
// zero out the handle so that if ISPCLaunch is called again in this
// function, it knows it's starting out from scratch
StoreInst(nullPtrValue, launchGroupHandlePtr);
BranchInst(bPostSync);
BranchInst(bPostSync);
SetCurrentBasicBlock(bPostSync);
SetCurrentBasicBlock(bPostSync);
}
else /* NVPTX: don't do test, just call sync */
{
llvm::Value *launchGroupHandle = LoadInst(launchGroupHandlePtr);
llvm::Value *nullPtrValue =
llvm::Constant::getNullValue(LLVMTypes::VoidPointerType);
llvm::Function *fsync = m->module->getFunction("ISPCSync");
if (fsync == NULL)
FATAL("Couldn't find ISPCSync declaration?!");
CallInst(fsync, NULL, launchGroupHandle, "");
StoreInst(nullPtrValue, launchGroupHandlePtr);
}
}

8
ctx.h
View File

@@ -291,6 +291,13 @@ public:
of the mask is on. */
llvm::Value *LaneMask(llvm::Value *mask);
/** Issues a call to __insert_int8/int16/int32/int64/float/double */
llvm::Value* Insert(llvm::Value *vector, llvm::Value *lane, llvm::Value *scalar);
/** Issues a call to __extract_int8/int16/int32/int64/float/double */
llvm::Value* Extract(llvm::Value *vector, llvm::Value *lane);
/** Given two masks of type LLVMTypes::MaskType, return an i1 value
that indicates whether the two masks are equal. */
llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2);
@@ -298,6 +305,7 @@ public:
/** Generate ConstantVector, which contains ProgramIndex, i.e.
< i32 0, i32 1, i32 2, i32 3> */
llvm::Value *ProgramIndexVector(bool is32bits = true);
llvm::Value *ProgramIndexVectorPTX(bool is32bits = true);
/** Given a string, create an anonymous global variable to hold its
value and return the pointer to the string. */

View File

@@ -168,6 +168,13 @@ DeclSpecs::GetBaseType(SourcePos pos) const {
retType = lApplyTypeQualifiers(typeQualifiers, retType, pos);
if (soaWidth > 0) {
#if 0 /* see stmt.cpp in DeclStmt::EmitCode for work-around of SOAType Declaration */
if (g->target->getISA() == Target::NVPTX)
{
Error(pos, "\"soa\" data types are currently not supported with \"nvptx\" target.");
return NULL;
}
#endif
const StructType *st = CastType<StructType>(retType);
if (st == NULL) {
@@ -402,6 +409,13 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
return;
}
#if 0 /* NVPTX */
if (baseType->IsUniformType())
{
fprintf(stderr, " detected uniform array of size= %d array= %s\n" ,arraySize,
baseType->IsArrayType() ? " true " : " false ");
}
#endif
const Type *arrayType = new ArrayType(baseType, arraySize);
if (child != NULL) {
child->InitFromType(arrayType, ds);
@@ -530,9 +544,9 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
returnType = returnType->ResolveUnboundVariability(Variability::Varying);
bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
bool isExternC = ds && (ds->storageClass == SC_EXTERN_C);
bool isExported = ds && ((ds->typeQualifiers & TYPEQUAL_EXPORT) != 0);
bool isTask = ds && ((ds->typeQualifiers & TYPEQUAL_TASK) != 0);
bool isUnmasked = ds && ((ds->typeQualifiers & TYPEQUAL_UNMASKED) != 0);
if (isExported && isTask) {
@@ -541,9 +555,9 @@ Declarator::InitFromType(const Type *baseType, DeclSpecs *ds) {
return;
}
if (isExternC && isTask) {
Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" "
"qualifiers");
return;
Error(pos, "Function can't have both \"extern \"C\"\" and \"task\" "
"qualifiers");
return;
}
if (isExternC && isExported) {
Error(pos, "Function can't have both \"extern \"C\"\" and \"export\" "

View File

@@ -7867,6 +7867,12 @@ SizeOfExpr::TypeCheck() {
"struct type \"%s\".", type->GetString().c_str());
return NULL;
}
if (type != NULL)
if (g->target->getISA() == Target::NVPTX && type->IsVaryingType())
{
Error(pos, "\"sizeof\" with varying data types is not yet supported with \"nvptx\" target.");
return NULL;
}
return this;
}
@@ -8661,6 +8667,11 @@ NewExpr::TypeCheck() {
AssertPos(pos, m->errorCount > 0);
return NULL;
}
if (g->target->getISA() == Target::NVPTX && allocType->IsVaryingType())
{
Error(pos, "\"new\" with varying data types is not yet supported with \"nvptx\" target.");
return NULL;
}
if (CastType<UndefinedStructType>(allocType) != NULL) {
Error(pos, "Can't dynamically allocate storage for declared "
"but not defined type \"%s\".", allocType->GetString().c_str());

View File

@@ -47,6 +47,7 @@
#include <stdio.h>
#if defined(LLVM_3_1) || defined(LLVM_3_2)
#include <llvm/Metadata.h>
#include <llvm/LLVMContext.h>
#include <llvm/Module.h>
#include <llvm/Type.h>
@@ -54,6 +55,7 @@
#include <llvm/Intrinsics.h>
#include <llvm/DerivedTypes.h>
#else
#include <llvm/IR/Metadata.h>
#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/IR/Type.h>
@@ -128,7 +130,7 @@ Function::Function(Symbol *s, Stmt *c) {
sym->parentFunction = this;
}
if (type->isTask) {
if (type->isTask && g->target->getISA() != Target::NVPTX) {
threadIndexSym = m->symbolTable->LookupVariable("threadIndex");
Assert(threadIndexSym);
threadCountSym = m->symbolTable->LookupVariable("threadCount");
@@ -239,7 +241,7 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
#endif
const FunctionType *type = CastType<FunctionType>(sym->type);
Assert(type != NULL);
if (type->isTask == true) {
if (type->isTask == true && g->target->getISA() != Target::NVPTX) {
// For tasks, there should always be three parameters: the
// pointer to the structure that holds all of the arguments, the
// thread index, and the thread count variables.
@@ -337,6 +339,16 @@ Function::emitCode(FunctionEmitContext *ctx, llvm::Function *function,
ctx->SetFunctionMask(argIter);
Assert(++argIter == function->arg_end());
}
if (type->isTask == true && g->target->getISA() == Target::NVPTX)
{
llvm::NamedMDNode* annotations =
m->module->getOrInsertNamedMetadata("nvvm.annotations");
llvm::SmallVector<llvm::Value*, 3> av;
av.push_back(function);
av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
av.push_back(LLVMInt32(1));
annotations->addOperand(llvm::MDNode::get(*g->ctx, av));
}
}
// Finally, we can generate code for the function
@@ -492,13 +504,28 @@ Function::GenerateIR() {
// the application can call it
const FunctionType *type = CastType<FunctionType>(sym->type);
Assert(type != NULL);
if (type->isExported) {
if (type->isExported) {
if (!type->isTask) {
llvm::FunctionType *ftype = type->LLVMFunctionType(g->ctx, true);
llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::ExternalLinkage;
std::string functionName = sym->name;
if (g->mangleFunctionsWithTarget)
functionName += std::string("_") + g->target->GetISAString();
if (g->target->getISA() == Target::NVPTX)
{
functionName += std::string("___export"); /* add ___export to the end, for ptxcc to recognize it is exported */
#if 0
llvm::NamedMDNode* annotations =
m->module->getOrInsertNamedMetadata("nvvm.annotations");
llvm::SmallVector<llvm::Value*, 3> av;
av.push_back(function);
av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1));
annotations->addOperand(llvm::MDNode::get(*g->ctx, av));
#endif
}
llvm::Function *appFunction =
llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module);
#if defined(LLVM_3_1)
@@ -538,6 +565,16 @@ Function::GenerateIR() {
FATAL("Function verificication failed");
}
}
if (g->target->getISA() == Target::NVPTX)
{
llvm::NamedMDNode* annotations =
m->module->getOrInsertNamedMetadata("nvvm.annotations");
llvm::SmallVector<llvm::Value*, 3> av;
av.push_back(appFunction);
av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1));
annotations->addOperand(llvm::MDNode::get(*g->ctx, av));
}
}
}
}

View File

@@ -280,6 +280,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
arch = "arm";
else
#endif
if(!strncmp(isa, "nvptx", 5))
arch = "nvptx64";
else
arch = "x86-64";
}
@@ -707,6 +710,19 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
this->m_maskBitCount = 32;
}
#endif
else if (!strcasecmp(isa, "nvptx"))
{
this->m_isa = Target::NVPTX;
this->m_cpu = "sm_35";
this->m_nativeVectorWidth = 32;
this->m_nativeVectorAlignment = 32;
this->m_vectorWidth = 1;
this->m_hasHalf = true;
this->m_maskingIsFree = true;
this->m_maskBitCount = 1;
this->m_hasTranscendentals = false;
this->m_hasGather = this->m_hasScatter = false;
}
else {
Error(SourcePos(), "Target \"%s\" is unknown. Choices are: %s.",
isa, SupportedTargets());
@@ -784,7 +800,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
// Initialize target-specific "target-feature" attribute.
if (!m_attributes.empty()) {
llvm::AttrBuilder attrBuilder;
attrBuilder.addAttribute("target-cpu", this->m_cpu);
if (m_isa != Target::NVPTX)
attrBuilder.addAttribute("target-cpu", this->m_cpu);
attrBuilder.addAttribute("target-features", this->m_attributes);
this->m_tf_attributes = new llvm::AttributeSet(
llvm::AttributeSet::get(
@@ -839,7 +856,7 @@ Target::SupportedTargets() {
"avx1.1-i32x8, avx1.1-i32x16, avx1.1-i64x4 "
"avx2-i32x8, avx2-i32x16, avx2-i64x4, "
"generic-x1, generic-x4, generic-x8, generic-x16, "
"generic-x32, generic-x64";
"generic-x32, generic-x64, nvptx";
}
@@ -866,6 +883,8 @@ Target::GetTripleString() const {
triple.setArchName("i386");
else if (m_arch == "x86-64")
triple.setArchName("x86_64");
else if (m_arch == "nvptx64")
triple = llvm::Triple("nvptx64", "nvidia", "cuda");
else
triple.setArchName(m_arch);
}
@@ -898,6 +917,8 @@ Target::ISAToString(ISA isa) {
return "avx2";
case Target::GENERIC:
return "generic";
case Target::NVPTX:
return "nvptx";
default:
FATAL("Unhandled target in ISAToString()");
}
@@ -936,6 +957,8 @@ Target::ISAToTargetString(ISA isa) {
return "avx2-i32x8";
case Target::GENERIC:
return "generic-4";
case Target::NVPTX:
return "nvptx";
default:
FATAL("Unhandled target in ISAToTargetString()");
}

3
ispc.h
View File

@@ -179,7 +179,7 @@ public:
flexible/performant of them will appear last in the enumerant. Note
also that __best_available_isa() needs to be updated if ISAs are
added or the enumerant values are reordered. */
enum ISA {
enum ISA { NVPTX,
#ifdef ISPC_ARM_ENABLED
NEON32, NEON16, NEON8,
#endif
@@ -606,6 +606,7 @@ struct Globals {
/** Indicates that alignment in memory allocation routines should be
forced to have given value. -1 value means natural alignment for the platforms. */
int forceAlignment;
std::string PtxString;
};
enum {

View File

@@ -320,6 +320,11 @@ int main(int Argc, char *Argv[]) {
LLVMInitializeARMTargetMC();
#endif
LLVMInitializeNVPTXTargetInfo();
LLVMInitializeNVPTXTarget();
LLVMInitializeNVPTXAsmPrinter();
LLVMInitializeNVPTXTargetMC();
char *file = NULL;
const char *headerFileName = NULL;
const char *outFileName = NULL;

View File

@@ -444,6 +444,38 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE
return;
}
if (g->target->getISA() == Target::NVPTX &&
#if 0
!type->IsConstType() &&
#endif
#if 1
at != NULL &&
#endif
type->IsVaryingType())
{
Error(pos, "Global \"varying\" variables are not yet supported in \"nvptx\" target.");
return;
#if 0
int nel = 32; /* warp-size */
if (type->IsArrayType())
{
const ArrayType *at = CastType<ArrayType>(type);
/* we must scale # elements by 4, because a thread-block will run 4 warps
* or 128 threads.
* ***note-to-me***:please define these value (128threads/4warps)
* in nvptx-target definition
* instead of compile-time constants
*/
nel *= at->GetElementCount();
assert (!type->IsSOAType());
type = new ArrayType(at->GetElementType()->GetAsUniformType(), nel);
}
else
type = new ArrayType(type->GetAsUniformType(), nel);
#endif
}
llvm::Type *llvmType = type->LLVMType(g->ctx);
if (llvmType == NULL)
return;
@@ -643,6 +675,21 @@ lCheckExportedParameterTypes(const Type *type, const std::string &name,
}
}
/** Validate the type of a single parameter of a "task"-qualified
    function.  This check is a no-op for every target except NVPTX.  For
    the NVPTX target it reports an error when
    lRecursiveCheckValidParamType() rejects the type: vector types get a
    dedicated message, anything else is reported as an illegal varying
    parameter.  (Mirrors lCheckExportedParameterTypes(), which performs
    the analogous check for "export"-qualified functions.)

    @param type  Type of the parameter being checked.
    @param name  Parameter name, used only for the error message.
    @param pos   Source position at which to report any error. */
static void
lCheckTaskParameterTypes(const Type *type, const std::string &name,
SourcePos pos) {
// Only the NVPTX target restricts task parameter types.
if (g->target->getISA() != Target::NVPTX)
return;
if (lRecursiveCheckValidParamType(type, false) == false) {
if (CastType<VectorType>(type))
Error(pos, "Vector-typed parameter \"%s\" is illegal in a task "
"function with \"nvptx\" target.", name.c_str());
else
Error(pos, "Varying parameter \"%s\" is illegal in a task function with \"nvptx\" target.",
name.c_str());
}
}
/** Given a function type, loop through the function parameters and see if
any are StructTypes. If so, issue an error; this is currently broken
@@ -801,7 +848,8 @@ Module::AddFunctionDeclaration(const std::string &name,
#else // LLVM 3.1 and 3.3+
function->addFnAttr(llvm::Attribute::AlwaysInline);
#endif
if (functionType->isTask)
/* evghenii: fails function verification when "if" executed in nvptx target */
if (functionType->isTask && g->target->getISA() != Target::NVPTX)
// This also applies transitively to members I think?
#if defined(LLVM_3_1)
function->setDoesNotAlias(1, true);
@@ -822,6 +870,13 @@ Module::AddFunctionDeclaration(const std::string &name,
Type::Equal(functionType->GetReturnType(), AtomicType::Void) == false)
Error(pos, "Task-qualified functions must have void return type.");
if (g->target->getISA() == Target::NVPTX &&
Type::Equal(functionType->GetReturnType(), AtomicType::Void) == false &&
functionType->isExported)
{
Error(pos, "Export-qualified functions must have void return type with \"nvptx\" target.");
}
if (functionType->isExported || functionType->isExternC)
lCheckForStructParameters(functionType, pos);
@@ -841,6 +896,9 @@ Module::AddFunctionDeclaration(const std::string &name,
if (functionType->isExported) {
lCheckExportedParameterTypes(argType, argName, argPos);
}
if (functionType->isTask) {
lCheckTaskParameterTypes(argType, argName, argPos);
}
// ISPC assumes that no pointers alias. (It should be possible to
// specify when this is not the case, but this should be the
@@ -959,7 +1017,13 @@ Module::writeOutput(OutputType outputType, const char *outFileName,
const char *fileType = NULL;
switch (outputType) {
case Asm:
if (strcasecmp(suffix, "s"))
if (g->target->getISA() != Target::NVPTX)
{
if (strcasecmp(suffix, "s"))
fileType = "assembly";
}
else
if (strcasecmp(suffix, "ptx"))
fileType = "assembly";
break;
case Bitcode:
@@ -1057,6 +1121,11 @@ Module::writeBitcode(llvm::Module *module, const char *outFileName) {
}
llvm::raw_fd_ostream fos(fd, (fd != 1), false);
if (g->target->getISA() == Target::NVPTX)
{
const std::string dl_string = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64";
module->setDataLayout(dl_string);
}
llvm::WriteBitcodeToFile(module, fos);
return true;
}
@@ -2095,6 +2164,24 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre
opts.addMacroDef(g->cppArgs[i].substr(2));
}
}
if (g->target->getISA() == Target::NVPTX)
{
opts.addMacroDef("__NVPTX__");
opts.addMacroDef("programIndex=__programIndex()");
opts.addMacroDef("cif=if");
opts.addMacroDef("cfor=for");
opts.addMacroDef("cwhile=while");
opts.addMacroDef("ccontinue=continue");
opts.addMacroDef("cdo=do");
opts.addMacroDef("taskIndex0=__taskIndex0()");
opts.addMacroDef("taskIndex1=__taskIndex1()");
opts.addMacroDef("taskIndex2=__taskIndex2()");
opts.addMacroDef("taskIndex=__taskIndex()");
opts.addMacroDef("taskCount0=__taskCount0()");
opts.addMacroDef("taskCount1=__taskCount1()");
opts.addMacroDef("taskCount2=__taskCount2()");
opts.addMacroDef("taskCount=__taskCount()");
}
#if defined(LLVM_3_1)
inst.getLangOpts().BCPLComment = 1;
@@ -2540,6 +2627,29 @@ lCreateDispatchModule(std::map<std::string, FunctionTargetVariants> &functions)
return module;
}
/** Append the escaped encoding of character \p c to \p result.  The
    encoding is "_XY_", where X and Y are letters ('A'-'P') derived from
    the low and high nibbles of the character's value.  Keeping this in
    one place avoids the duplicated four-statement sequence the original
    code carried in two branches. */
static void
lCBEMangleEscape(std::string &result, char c) {
    result += '_';
    result += 'A'+(c&15);
    result += 'A'+((c>>4)&15);
    result += '_';
}

/** Mangle a string into a form usable as a PTX identifier
    ([a-zA-Z$_][a-zA-Z$_0-9]*): alphanumerics and '_' pass through,
    "<<"/">>" pairs collapse to one escape, and every other character is
    escaped via lCBEMangleEscape().  Note that a *single* '<' or '>' is
    intentionally passed through unescaped (matching the original
    behavior); only adjacent pairs are collapsed.

    @param S  String (e.g. an LLVM global/function name) to mangle.
    @return   The mangled identifier. */
static std::string lCBEMangle(const std::string &S) {
    std::string Result;
    for (unsigned i = 0, e = S.size(); i != e; ++i) {
        if (i+1 != e && ((S[i] == '>' && S[i+1] == '>') ||
                         (S[i] == '<' && S[i+1] == '<'))) {
            // "<<" or ">>": emit one escape for the pair and skip the
            // second character.
            lCBEMangleEscape(Result, S[i]);
            i++;
        } else if (isalnum((unsigned char)S[i]) || S[i] == '_' ||
                   S[i] == '<' || S[i] == '>') {
            // Legal identifier character (or lone angle bracket): keep as-is.
            // The cast to unsigned char matters: passing a negative char
            // (e.g. a byte >= 0x80 on a signed-char platform) to isalnum()
            // is undefined behavior per the C standard.
            Result += S[i];
        } else {
            lCBEMangleEscape(Result, S[i]);
        }
    }
    return Result;
}
int
Module::CompileAndOutput(const char *srcFile,
@@ -2555,7 +2665,7 @@ Module::CompileAndOutput(const char *srcFile,
const char *hostStubFileName,
const char *devStubFileName)
{
if (target == NULL || strchr(target, ',') == NULL) {
if (target == NULL || strchr(target, ',') == NULL) {
// We're only compiling to a single target
g->target = new Target(arch, cpu, target, generatePIC);
if (!g->target->isValid())
@@ -2563,6 +2673,32 @@ Module::CompileAndOutput(const char *srcFile,
m = new Module(srcFile);
if (m->CompileFile() == 0) {
/* NVPTX:
* for PTX target replace '.' with '_' in all global variables
* a PTX identifier name must match [a-zA-Z$_][a-zA-Z$_0-9]*
*/
if (g->target->getISA() == Target::NVPTX)
{
/* mangle global variables names */
{
llvm::Module::global_iterator I = m->module->global_begin(), E = m->module->global_end();
for (; I != E; I++)
I->setName(lCBEMangle(I->getName()));
}
/* mangle functions names */
{
llvm::Module::iterator I = m->module->begin(), E = m->module->end();
for (; I != E; I++)
{
std::string str = I->getName();
if (str.find("operator") != std::string::npos)
I->setName(lCBEMangle(str));
}
}
}
if (outputType == CXX) {
if (target == NULL || strncmp(target, "generic-", 8) != 0) {
Error(SourcePos(), "When generating C++ output, one of the \"generic-*\" "
@@ -2765,4 +2901,5 @@ Module::CompileAndOutput(const char *srcFile,
return errorCount > 0;
}
return true;
}

20
nvptxcc Executable file
View File

@@ -0,0 +1,20 @@
#!/bin/sh
# nvptxcc: link an ISPC-produced PTX file (or, with NVVM=1, an LLVM
# bitcode file) against the CUDA test harness to build a test executable.
#   usage: nvptxcc <input> [extra nvcc args...]
PATH=$ISPC_HOME/examples_ptx/ptxcc:$ISPC_HOME/examples_ptx/ptxgen:$PATH
PTXCC=ptxcc
# Capture the input file, then forward the remaining arguments to nvcc.
# (The original used ${@:2}, a bash-only slice that plain /bin/sh
# implementations such as dash reject.)
INPUT=$1
shift
ARGS="$@"
# POSIX test(1) uses '=' for string comparison; '==' is a bashism.
if [ "$NVVM" = "1" ];
then
    # NVVM path: disassemble the bitcode, run it through ptxgen, then
    # compile/link the resulting PTX.
    LLVM32=$HOME/usr/local/llvm/bin-3.2
    LLVMDIS=$LLVM32/bin/llvm-dis
    PTXGEN=$ISPC_HOME/examples_ptx/ptxgen/ptxgen
    $($LLVMDIS $INPUT -o $INPUT.ll) && $($PTXGEN $INPUT.ll > $INPUT.ptx) && \
    $($PTXCC $INPUT.ptx -o $INPUT.o -Xnvcc="-G") && \
    $(nvcc test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $INPUT.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt $ARGS);
else
    # Direct path: the input is already PTX.
    $($PTXCC $INPUT -o $INPUT.o -Xnvcc="-G") && \
    $(nvcc test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $INPUT.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt $ARGS);
fi

178
opt.cpp
View File

@@ -133,6 +133,7 @@ static llvm::Pass *CreateDebugPass(char * output);
static llvm::Pass *CreateReplaceStdlibShiftPass();
static llvm::Pass *CreateFixBooleanSelectPass();
static llvm::Pass *CreatePromoteLocalToPrivatePass();
#define DEBUG_START_PASS(NAME) \
if (g->debugPrint && \
@@ -496,7 +497,11 @@ Optimize(llvm::Module *module, int optLevel) {
// run absolutely no optimizations, since the front-end needs us to
// take the various __pseudo_* functions it has emitted and turn
// them into something that can actually execute.
optPM.add(CreateImproveMemoryOpsPass(), 100);
if (g->opt.disableGatherScatterOptimizations == false &&
g->target->getVectorWidth() > 1)
optPM.add(CreateImproveMemoryOpsPass(), 100);
if (g->opt.disableHandlePseudoMemoryOps == false)
optPM.add(CreateReplacePseudoMemoryOpsPass());
@@ -519,6 +524,8 @@ Optimize(llvm::Module *module, int optLevel) {
llvm::initializeInstrumentation(*registry);
llvm::initializeTarget(*registry);
if (g->target->getISA() == Target::NVPTX)
optPM.add(CreatePromoteLocalToPrivatePass());
optPM.add(llvm::createGlobalDCEPass(), 185);
// Setup to use LLVM default AliasAnalysis
@@ -577,7 +584,10 @@ Optimize(llvm::Module *module, int optLevel) {
optPM.add(llvm::createGlobalOptimizerPass());
optPM.add(llvm::createReassociatePass());
optPM.add(llvm::createIPConstantPropagationPass());
optPM.add(CreateReplaceStdlibShiftPass(),229);
if (g->target->getISA() != Target::NVPTX)
optPM.add(CreateReplaceStdlibShiftPass(),229);
optPM.add(llvm::createDeadArgEliminationPass(),230);
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createCFGSimplificationPass());
@@ -689,6 +699,111 @@ Optimize(llvm::Module *module, int optLevel) {
// Should be the last
optPM.add(CreateFixBooleanSelectPass(), 400);
if (g->target->getISA() == Target::NVPTX)
{
optPM.add(llvm::createGlobalDCEPass());
optPM.add(llvm::createTypeBasedAliasAnalysisPass());
optPM.add(llvm::createBasicAliasAnalysisPass());
optPM.add(llvm::createCFGSimplificationPass());
// Here clang has an experimental pass SROAPass instead of
// ScalarReplAggregatesPass. We should add it in the future.
optPM.add(llvm::createScalarReplAggregatesPass());
optPM.add(llvm::createEarlyCSEPass());
optPM.add(llvm::createLowerExpectIntrinsicPass());
optPM.add(llvm::createTypeBasedAliasAnalysisPass());
optPM.add(llvm::createBasicAliasAnalysisPass());
// Early optimizations to try to reduce the total amount of code to
// work with if we can
optPM.add(llvm::createReassociatePass());
optPM.add(llvm::createConstantPropagationPass());
optPM.add(llvm::createDeadInstEliminationPass());
optPM.add(llvm::createCFGSimplificationPass());
optPM.add(llvm::createPromoteMemoryToRegisterPass());
optPM.add(llvm::createAggressiveDCEPass());
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createDeadInstEliminationPass());
// On to more serious optimizations
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createCFGSimplificationPass());
optPM.add(llvm::createPromoteMemoryToRegisterPass());
optPM.add(llvm::createGlobalOptimizerPass());
optPM.add(llvm::createReassociatePass());
optPM.add(llvm::createIPConstantPropagationPass());
optPM.add(llvm::createDeadArgEliminationPass());
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createCFGSimplificationPass());
optPM.add(llvm::createPruneEHPass());
optPM.add(llvm::createFunctionAttrsPass());
optPM.add(llvm::createFunctionInliningPass());
optPM.add(llvm::createConstantPropagationPass());
optPM.add(llvm::createDeadInstEliminationPass());
optPM.add(llvm::createCFGSimplificationPass());
optPM.add(llvm::createArgumentPromotionPass());
#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3)
// Starting from 3.4 this functionality was moved to
// InstructionCombiningPass. See r184459 for details.
optPM.add(llvm::createSimplifyLibCallsPass());
#endif
optPM.add(llvm::createAggressiveDCEPass());
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createJumpThreadingPass());
optPM.add(llvm::createCFGSimplificationPass());
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createTailCallEliminationPass());
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createFunctionInliningPass());
optPM.add(llvm::createConstantPropagationPass());
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createIPSCCPPass());
optPM.add(llvm::createDeadArgEliminationPass());
optPM.add(llvm::createAggressiveDCEPass());
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createCFGSimplificationPass());
optPM.add(llvm::createFunctionInliningPass());
optPM.add(llvm::createArgumentPromotionPass());
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createCFGSimplificationPass());
optPM.add(llvm::createReassociatePass());
optPM.add(llvm::createLoopRotatePass());
optPM.add(llvm::createLICMPass());
// optPM.add(llvm::createLoopUnswitchPass(false));
#if 1
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createIndVarSimplifyPass());
optPM.add(llvm::createLoopIdiomPass());
optPM.add(llvm::createLoopDeletionPass());
optPM.add(llvm::createLoopUnrollPass());
optPM.add(llvm::createGVNPass());
optPM.add(llvm::createMemCpyOptPass());
optPM.add(llvm::createSCCPPass());
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createJumpThreadingPass());
optPM.add(llvm::createCorrelatedValuePropagationPass());
optPM.add(llvm::createDeadStoreEliminationPass());
optPM.add(llvm::createAggressiveDCEPass());
optPM.add(llvm::createCFGSimplificationPass());
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createFunctionInliningPass());
optPM.add(llvm::createAggressiveDCEPass());
optPM.add(llvm::createStripDeadPrototypesPass());
optPM.add(llvm::createGlobalDCEPass());
optPM.add(llvm::createConstantMergePass());
#endif
}
}
// Finish up by making sure we didn't mess anything up in the IR along
@@ -5267,4 +5382,63 @@ CreateFixBooleanSelectPass() {
return new FixBooleanSelectPass();
}
///////////////////////////////////////////////////////////////////////////////
// Detect addrspace(3)
///////////////////////////////////////////////////////////////////////////////
/** Basic-block pass used by the NVPTX target (scheduled from Optimize()
    when the ISA is Target::NVPTX).  It scans a basic block for calls to
    the "__cvt_loc2gen_var" builtin and replaces their uses with a fresh
    private alloca — see runOnBasicBlock() for details.  The section
    header above ("Detect addrspace(3)") indicates this relates to
    local/shared address-space handling; the pass name suggests it
    promotes such local storage to per-thread private storage. */
class PromoteLocalToPrivatePass: public llvm::BasicBlockPass
{
public:
static char ID; // Pass identification, replacement for typeid
PromoteLocalToPrivatePass() : BasicBlockPass(ID) {}
bool runOnBasicBlock(llvm::BasicBlock &BB);
};
// Address of this variable serves as the pass's unique identity; the
// value itself is unused by LLVM's pass machinery.
char PromoteLocalToPrivatePass::ID = 0;
/** Replace every use of a "__cvt_loc2gen_var" call in this block with a
    newly created i64 alloca ("opt_loc2var") inserted just before the
    call.  Returns true if anything was rewritten.

    NOTE(review): the replaced call instructions are left in place (only
    their uses are redirected) — presumably a later DCE pass removes
    them; confirm.  Also note the alloca is always of Int64Type
    regardless of the call's own type — looks intentional for this
    builtin, but verify against the "__cvt_loc2gen_var" declaration. */
bool
PromoteLocalToPrivatePass::runOnBasicBlock(llvm::BasicBlock &BB)
{
// NOTE(review): this vector is declared but never used in this function.
std::vector<llvm::AllocaInst*> Allocas;
bool modifiedAny = false;
// The builtin whose calls we rewrite; may be NULL if the module never
// references it (the loop below then matches nothing).
llvm::Function *cvtFunc = m->module->getFunction("__cvt_loc2gen_var");
// Walk the block's instructions, stopping one short of end() so the
// final instruction (the block terminator) is never examined.
// (Original comment said "entry node", but this pass runs on every
// basic block, not just the entry block.)
for (llvm::BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
{
llvm::Instruction *inst = &*I;
if (llvm::CallInst *ci = llvm::dyn_cast<llvm::CallInst>(inst))
{
// Direct calls only: getCalledFunction() is NULL for indirect calls,
// which then fail the equality test below.
llvm::Function *func = ci->getCalledFunction();
if (cvtFunc && (cvtFunc == func))
{
#if 0
fprintf(stderr , "--found cvt-- name= %s \n",
I->getName().str().c_str());
#endif
// Materialize a private i64 stack slot immediately before the call.
llvm::AllocaInst *alloca = new llvm::AllocaInst(LLVMTypes::Int64Type, "opt_loc2var", ci);
assert(alloca != NULL);
#if 0
const int align = 8; // g->target->getNativeVectorAlignment();
alloca->setAlignment(align);
#endif
// Redirect all consumers of the call's result to the alloca.
ci->replaceAllUsesWith(alloca);
modifiedAny = true;
}
}
}
return modifiedAny;
}
/** Factory helper: allocate a fresh PromoteLocalToPrivatePass for the
    optimization pass manager (which takes ownership of the pointer). */
static llvm::Pass *
CreatePromoteLocalToPrivatePass() {
    llvm::Pass *pass = new PromoteLocalToPrivatePass();
    return pass;
}

14
ptxtestcc.sh Executable file
View File

@@ -0,0 +1,14 @@
#!/bin/sh
# ptxtestcc.sh: compile an ISPC test file ($1) for the nvptx target and
# link it into ./test_nvptx with the CUDA harness.  $2 is the TEST_SIG
# value forwarded to the harness.
LLC=$HOME/usr/local/llvm/bin-trunk/bin/llc
DIS=$HOME/usr/local/llvm/bin-3.2/bin/llvm-dis
ISPC=ispc
PTXCC=ptxcc
PTXGEN=~/ptxgen
# Alternative NVVM route, kept for reference.  It previously sat inside
# the "&& \" continuation chain below as a commented-out line, which only
# worked through a fragile comment/line-continuation interaction:
#   cat $1 | grep -v 'width' | $ISPC --target=nvptx --emit-llvm -o - | $DIS -o $1_32_ptx.ll && $PTXGEN $1_32_ptx.ll > $1.ptx
$(cat $1 |grep -v 'width'|$ISPC --target=nvptx --emit-llvm -o -|$LLC -march=nvptx64 -mcpu=sm_35 -o $1.ptx) && \
$($PTXCC $1.ptx -Xptxas=-v -o $1.ptx.o) && \
nvcc -o test_nvptx test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $1.ptx.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt -DTEST_SIG=$2

View File

@@ -204,6 +204,8 @@ def run_test(testname):
return (1, 0)
else:
global is_generic_target
global is_nvptx_target
global is_nvptx_nvvm
if is_windows:
if is_generic_target:
obj_name = "%s.cpp" % os.path.basename(filename)
@@ -218,6 +220,13 @@ def run_test(testname):
else:
if is_generic_target:
obj_name = "%s.cpp" % testname
elif is_nvptx_target:
if os.environ.get("NVVM") == "1":
is_nvptx_nvvm = True
obj_name = "%s.bc" % testname
else:
obj_name = "%s.ptx" % testname
is_nvptx_nvvm = False
else:
obj_name = "%s.o" % testname
exe_name = "%s.run" % testname
@@ -248,13 +257,32 @@ def run_test(testname):
cc_cmd += ' -Wl,-no_pie'
if should_fail:
cc_cmd += " -DEXPECT_FAILURE"
if is_nvptx_target:
nvptxcc_exe = "nvptxcc"
nvptxcc_exe_rel = add_prefix(nvptxcc_exe)
cc_cmd = "%s %s -DTEST_SIG=%d -o %s" % \
(nvptxcc_exe_rel, obj_name, match, exe_name)
ispc_cmd = ispc_exe_rel + " --woff %s -o %s --arch=%s --target=%s" % \
ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --arch=%s --target=%s" % \
(filename, obj_name, options.arch, options.target)
if options.no_opt:
ispc_cmd += " -O0"
if is_generic_target:
ispc_cmd += " --emit-c++ --c++-include-file=%s" % add_prefix(options.include_file)
if is_nvptx_target:
filename4ptx = filename+".ptx.parsed_ispc"
grep_cmd = "grep -v 'export uniform int width' %s > %s " % \
(filename, filename4ptx)
if options.verbose:
print "Grepping: %s" % grep_cmd
sp = subprocess.Popen(grep_cmd, shell=True)
sp.communicate()
if is_nvptx_nvvm:
ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --emit-llvm --target=%s" % \
(filename4ptx, obj_name, options.target)
else:
ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --emit-asm --target=%s" % \
(filename4ptx, obj_name, options.target)
# compile the ispc code, make the executable, and run it...
(compile_error, run_error) = run_cmds([ispc_cmd, cc_cmd],
@@ -269,7 +297,7 @@ def run_test(testname):
basename = os.path.basename(filename)
os.unlink("%s.pdb" % basename)
os.unlink("%s.ilk" % basename)
os.unlink(obj_name)
# os.unlink(obj_name)
except:
None
@@ -290,6 +318,7 @@ def run_tasks_from_queue(queue, queue_ret, queue_skip, total_tests_arg, max_test
ispc_exe = glob_var[3]
global is_generic_target
is_generic_target = glob_var[4]
global is_nvptx_target
global run_tests_log
run_tests_log = glob_var[5]
@@ -505,6 +534,8 @@ def run_tests(options1, args, print_version):
if options.target == 'neon':
options.arch = 'arm'
if options.target == "nvptx":
options.arch = "nvptx64"
# use relative path to not depend on host directory, which may possibly
# have white spaces and unicode characters.
@@ -530,9 +561,11 @@ def run_tests(options1, args, print_version):
print_debug("Testing ispc: " + ispc_exe + "\n", s, run_tests_log)
ispc_exe += " " + options.ispc_flags
global is_generic_target
global is_generic_target
global is_nvptx_target
is_generic_target = (options.target.find("generic-") != -1 and
options.target != "generic-1" and options.target != "generic-x1")
is_nvptx_target = (options.target.find("nvptx") != -1)
if is_generic_target and options.include_file == None:
if options.target == "generic-4" or options.target == "generic-x4":
error("No generics #include specified; using examples/intrinsics/sse4.h\n", 2)

View File

@@ -57,6 +57,31 @@
#error Unknown value of ISPC_MASK_BITS
#endif
///////////////////////////////////////////////////////////////////////////
// CUDA Specific primitives
//
/***************/
__declspec(safe,cost0) static inline varying int __programIndex() { return __program_index(); }
__declspec(safe,cost0) static inline uniform int __programCount() { return __program_count(); }
__declspec(safe,cost0) static inline uniform int __warpIndex() { return __warp_index(); }
/***************/
__declspec(safe,cost0) static inline uniform int __taskIndex0() { return __task_index0(); }
__declspec(safe,cost0) static inline uniform int __taskIndex1() { return __task_index1(); }
__declspec(safe,cost0) static inline uniform int __taskIndex2() { return __task_index2(); }
__declspec(safe,cost0) static inline uniform int __taskIndex () { return __task_index (); }
/***************/
__declspec(safe,cost0) static inline uniform int __taskCount0() { return __task_count0(); }
__declspec(safe,cost0) static inline uniform int __taskCount1() { return __task_count1(); }
__declspec(safe,cost0) static inline uniform int __taskCount2() { return __task_count2(); }
__declspec(safe,cost0) static inline uniform int __taskCount () { return __task_count (); }
///////////////////////////////////////////////////////////////////////////
// Low level primitives
@@ -464,7 +489,10 @@ __declspec(safe)
static inline uniform int popcnt(bool v) {
// As with any() and all(), only count across the active lanes
#if (ISPC_MASK_BITS == 1)
return __popcnt_int64(__movmsk(v & __mask));
if (__is_nvptx_target)
return __popcnt_int64(__movmsk_ptx(v & __mask));
else
return __popcnt_int64(__movmsk(v & __mask));
#else
return __popcnt_int64(__movmsk((UIntMaskType)__sext_varying_bool(v) & __mask));
#endif
@@ -1226,6 +1254,11 @@ packed_store_active(uniform int a[], int vals) {
return __packed_store_active(a, vals, (IntMaskType)__mask);
}
static inline uniform int
packed_store_active(bool active, uniform int a[], int vals) {
return __packed_store_active(a, vals, (IntMaskType)(-(int)active));
}
static inline uniform int
packed_store_active2(uniform int a[], int vals) {
return __packed_store_active2(a, vals, (IntMaskType)__mask);
@@ -1236,6 +1269,9 @@ packed_store_active2(uniform int a[], int vals) {
// System information
static inline uniform int num_cores() {
if (__is_nvptx_target)
return 15*32; // K20/K20X/K40 - 15SMX x 32 warps/smx (max is 64 warps/smx)
else
return __num_cores();
}
@@ -1783,7 +1819,7 @@ static inline void memory_barrier() {
__memory_barrier();
}
#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE) \
#define DEFINE_ATOMIC_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
TA ret = __atomic_##OPB##_##TB##_global(ptr, value, (MASKTYPE)__mask); \
return ret; \
@@ -1794,6 +1830,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
return ret; \
} \
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
if (__is_nvptx_target) { \
TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
return ret; \
} else { \
uniform TA * uniform ptrArray[programCount]; \
ptrArray[programIndex] = ptr; \
TA ret; \
@@ -1804,10 +1844,15 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, TA value) { \
ret = insert(ret, i, r); \
} \
return ret; \
} \
} \
#define DEFINE_ATOMIC_SWAP(TA,TB) \
#define DEFINE_ATOMIC_SWAP(TA,TB,MASKTYPE,TC) \
static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
if (__is_nvptx_target) { \
TA ret = __atomic_swap_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
return ret; \
} else { \
uniform int i = 0; \
TA ret[programCount]; \
TA memVal; \
@@ -1838,6 +1883,7 @@ static inline TA atomic_swap_global(uniform TA * uniform ptr, TA value) { \
originally got back from memory... */ \
ret[lastSwap] = memVal; \
return ret[programIndex]; \
}\
} \
static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \
uniform TA value) { \
@@ -1845,6 +1891,10 @@ static inline uniform TA atomic_swap_global(uniform TA * uniform ptr, \
return ret; \
} \
static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
if (__is_nvptx_target) { \
TA ret = __atomic_swap_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
return ret; \
} else { \
uniform TA * uniform ptrArray[programCount]; \
ptrArray[programIndex] = ptr; \
TA ret; \
@@ -1855,9 +1905,10 @@ static inline TA atomic_swap_global(uniform TA * varying ptr, TA value) { \
ret = insert(ret, i, r); \
} \
return ret; \
}\
} \
#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB) \
#define DEFINE_ATOMIC_MINMAX_OP(TA,TB,OPA,OPB,MASKTYPE,TC) \
static inline TA atomic_##OPA##_global(uniform TA * uniform ptr, TA value) { \
uniform TA oneval = reduce_##OPA(value); \
TA ret; \
@@ -1872,6 +1923,10 @@ static inline uniform TA atomic_##OPA##_global(uniform TA * uniform ptr, \
} \
static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
TA value) { \
if (__is_nvptx_target) { \
TA ret = __atomic_##OPB##_varying_##TB##_global((TC)ptr, value, (MASKTYPE)__mask); \
return ret; \
} else { \
uniform TA * uniform ptrArray[programCount]; \
ptrArray[programIndex] = ptr; \
TA ret; \
@@ -1882,57 +1937,58 @@ static inline TA atomic_##OPA##_global(uniform TA * varying ptr, \
ret = insert(ret, i, r); \
} \
return ret; \
} \
}
DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max)
DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType)
DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType)
DEFINE_ATOMIC_SWAP(int32,int32)
DEFINE_ATOMIC_OP(int32,int32,add,add,IntMaskType,int64)
DEFINE_ATOMIC_OP(int32,int32,subtract,sub,IntMaskType,int64)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,min,min,IntMaskType,int64)
DEFINE_ATOMIC_MINMAX_OP(int32,int32,max,max,IntMaskType,int64)
DEFINE_ATOMIC_OP(int32,int32,and,and,IntMaskType,int64)
DEFINE_ATOMIC_OP(int32,int32,or,or,IntMaskType,int64)
DEFINE_ATOMIC_OP(int32,int32,xor,xor,IntMaskType,int64)
DEFINE_ATOMIC_SWAP(int32,int32,IntMaskType,int64)
// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax)
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType)
DEFINE_ATOMIC_SWAP(unsigned int32,int32)
DEFINE_ATOMIC_OP(unsigned int32,int32,add,add,UIntMaskType, unsigned int64)
DEFINE_ATOMIC_OP(unsigned int32,int32,subtract,sub,UIntMaskType, unsigned int64)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,min,umin,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_MINMAX_OP(unsigned int32,uint32,max,umax,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_OP(unsigned int32,int32,and,and,UIntMaskType, unsigned int64)
DEFINE_ATOMIC_OP(unsigned int32,int32,or,or,UIntMaskType, unsigned int64)
DEFINE_ATOMIC_OP(unsigned int32,int32,xor,xor,UIntMaskType, unsigned int64)
DEFINE_ATOMIC_SWAP(unsigned int32,int32,UIntMaskType, unsigned int64)
DEFINE_ATOMIC_SWAP(float,float)
DEFINE_ATOMIC_SWAP(float,float,IntMaskType,int64)
DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max)
DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType)
DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType)
DEFINE_ATOMIC_SWAP(int64,int64)
DEFINE_ATOMIC_OP(int64,int64,add,add,IntMaskType,int64)
DEFINE_ATOMIC_OP(int64,int64,subtract,sub,IntMaskType,int64)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,min,min,IntMaskType,int64)
DEFINE_ATOMIC_MINMAX_OP(int64,int64,max,max,IntMaskType,int64)
DEFINE_ATOMIC_OP(int64,int64,and,and,IntMaskType,int64)
DEFINE_ATOMIC_OP(int64,int64,or,or,IntMaskType,int64)
DEFINE_ATOMIC_OP(int64,int64,xor,xor,IntMaskType,int64)
DEFINE_ATOMIC_SWAP(int64,int64,IntMaskType, int64)
// For everything but atomic min and max, we can use the same
// implementations for unsigned as for signed.
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax)
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType)
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType)
DEFINE_ATOMIC_SWAP(unsigned int64,int64)
DEFINE_ATOMIC_OP(unsigned int64,int64,add,add,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_OP(unsigned int64,int64,subtract,sub,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,min,umin,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_MINMAX_OP(unsigned int64,uint64,max,umax,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_OP(unsigned int64,int64,and,and,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_OP(unsigned int64,int64,or,or,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_OP(unsigned int64,int64,xor,xor,UIntMaskType,unsigned int64)
DEFINE_ATOMIC_SWAP(unsigned int64,int64,UIntMaskType, unsigned int64)
DEFINE_ATOMIC_SWAP(double,double)
DEFINE_ATOMIC_SWAP(double,double,IntMaskType, int64)
#undef DEFINE_ATOMIC_OP
#undef DEFINE_ATOMIC_MINMAX_OP
#undef DEFINE_ATOMIC_SWAP
#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE) \
#define ATOMIC_DECL_CMPXCHG(TA, TB, MASKTYPE, TC) \
static inline uniform TA atomic_compare_exchange_global( \
uniform TA * uniform ptr, uniform TA oldval, uniform TA newval) { \
uniform TA ret = \
@@ -1947,6 +2003,10 @@ static inline TA atomic_compare_exchange_global( \
} \
static inline TA atomic_compare_exchange_global( \
uniform TA * varying ptr, TA oldval, TA newval) { \
if (__is_nvptx_target) { \
TA ret = __atomic_compare_exchange_varying_##TB##_global((TC)ptr, oldval, newval, (MASKTYPE)__mask); \
return ret; \
} else { \
uniform TA * uniform ptrArray[programCount]; \
ptrArray[programIndex] = ptr; \
TA ret; \
@@ -1958,14 +2018,15 @@ static inline TA atomic_compare_exchange_global( \
ret = insert(ret, i, r); \
} \
return ret; \
} \
}
ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType)
ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType)
ATOMIC_DECL_CMPXCHG(float, float, IntMaskType)
ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType)
ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType)
ATOMIC_DECL_CMPXCHG(double, double, IntMaskType)
ATOMIC_DECL_CMPXCHG(int32, int32, IntMaskType,int64)
ATOMIC_DECL_CMPXCHG(unsigned int32, int32, UIntMaskType,unsigned int64)
ATOMIC_DECL_CMPXCHG(float, float, IntMaskType,int64)
ATOMIC_DECL_CMPXCHG(int64, int64, IntMaskType,int64)
ATOMIC_DECL_CMPXCHG(unsigned int64, int64, UIntMaskType,unsigned int64)
ATOMIC_DECL_CMPXCHG(double, double, IntMaskType,int64)
#undef ATOMIC_DECL_CMPXCHG
@@ -2032,12 +2093,20 @@ static inline TYPE atomic_##NAME##_local(uniform TYPE * uniform ptr, TYPE value)
} \
static inline TYPE atomic_##NAME##_local(uniform TYPE * p, TYPE value) { \
TYPE ret; \
if (__is_nvptx_target) { \
foreach_active (i) { \
uniform TYPE * uniform ptr = (uniform TYPE * uniform)extract((int64)p, i); \
ret = insert(ret, i, *ptr); \
*ptr = OPFUNC(*ptr, extract(value, i)); \
} \
} else { \
uniform TYPE * uniform ptrs[programCount]; \
ptrs[programIndex] = p; \
foreach_active (i) { \
ret = insert(ret, i, *ptrs[i]); \
*ptrs[i] = OPFUNC(*ptrs[i], extract(value, i)); \
} \
} \
return ret; \
}

727
stmt.cpp
View File

@@ -142,6 +142,62 @@ lHasUnsizedArrays(const Type *type) {
return lHasUnsizedArrays(at->GetElementType());
}
// NVPTX only: rewrite a pointer that lives in the CUDA __shared__
// (addrspace 3) or __constant__ (addrspace 4) address space into a generic
// (addrspace 0) pointer, by calling the __cvt_loc2gen[_var] / __cvt_const2gen
// helper functions from the target's builtins module.
// For shared memory (addrspace 3) the result is additionally offset by the
// current warp's slice of the array: the caller over-allocates the array by
// 4x so that each of the 4 warps in a thread block gets its own quarter
// (see the "scale # elements by 4" logic in the shared-memory allocation
// path of DeclStmt::EmitCode).
// Non-pointer values, non-NVPTX targets, and pointers in any other address
// space are returned unchanged.
//
// ctx        - emission context; supplies the current basic block for the
//              raw llvm::*Inst::Create calls made here
// value      - the value to convert (only converted if it is a pointer)
// currentPos - source position, used only by the #if 0'd error path
// variable   - selects the "__cvt_loc2gen_var" builtin instead of
//              "__cvt_loc2gen" for addrspace 3; callers pass true when the
//              underlying object is not an array -- TODO confirm exact
//              semantics against the builtins definitions
static llvm::Value* lConvertToGenericPtr(FunctionEmitContext *ctx, llvm::Value *value, const SourcePos &currentPos, const bool variable = false)
{
if (!value->getType()->isPointerTy() || g->target->getISA() != Target::NVPTX)
return value;
llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(value->getType());
const int addressSpace = pt->getAddressSpace();
// Only shared (3) and constant (4) pointers need conversion to generic.
if (addressSpace != 3 && addressSpace != 4)
return value;
llvm::Type *elTy = pt->getElementType();
/* convert elTy addrspace(3)* to i64* addrspace(3)* */
// The __cvt_* builtins take/return i64*, so bitcast to i64* in the same
// address space before the call, and recover the real type afterwards.
llvm::PointerType *Int64Ptr3 = llvm::PointerType::get(LLVMTypes::Int64Type, addressSpace);
value = ctx->BitCastInst(value, Int64Ptr3, "gep2gen_cast1");
/* convert i64* addrspace(3) to i64* */
llvm::Function *__cvt2gen = m->module->getFunction(
addressSpace == 3 ? (variable ? "__cvt_loc2gen_var" : "__cvt_loc2gen") : "__cvt_const2gen");
std::vector<llvm::Value *> __cvt2gen_args;
__cvt2gen_args.push_back(value);
value = llvm::CallInst::Create(__cvt2gen, __cvt2gen_args, variable ? "gep2gen_cvt_var" : "gep2gen_cvt", ctx->GetCurrentBasicBlock());
/* compute offset */
// Shared memory is partitioned per warp: step the generic pointer to this
// warp's quarter of the (4x over-allocated) array.
if (addressSpace == 3)
{
assert(elTy->isArrayTy());
const int numElTot = elTy->getArrayNumElements();
// Per-warp slice size: total was scaled by 4 at allocation time,
// one slice for each of the 4 warps in a thread block.
const int numEl = numElTot/4;
#if 0
fprintf(stderr, " --- detected addrspace(3) sz= %d --- \n", numEl);
#endif
// NOTE(review): getArrayElementType() is queried on the PointerType
// here, but elTy (the pointee, asserted isArrayTy above) looks like
// the intended operand -- verify against the LLVM Type API in use.
llvm::ArrayType *arrTy = llvm::dyn_cast<llvm::ArrayType>(pt->getArrayElementType());
assert(arrTy != NULL);
llvm::Type *arrElTy = arrTy->getElementType();
#if 0
if (arrElTy->isArrayTy())
Error(currentPos, "Currently \"nvptx\" target doesn't support array-of-array");
#endif
/* convert i64* to errElTy* */
// Bitcast to element-type* so the GEP below advances in whole
// elements (warpId * numEl elements into the array).
llvm::PointerType *arrElTyPt0 = llvm::PointerType::get(arrElTy, 0);
value = ctx->BitCastInst(value, arrElTyPt0, "gep2gen_cast2");
llvm::Function *func_warp_index = m->module->getFunction("__warp_index");
llvm::Value *warpId = ctx->CallInst(func_warp_index, NULL, std::vector<llvm::Value*>(), "gep2gen_warp_index");
llvm::Value *offset = ctx->BinaryOperator(llvm::Instruction::Mul, warpId, LLVMInt32(numEl), "gep2gen_offset");
value = llvm::GetElementPtrInst::Create(value, offset, "gep2gen_offset", ctx->GetCurrentBasicBlock());
}
/* convert arrElTy* to elTy* */
// Final cast back to the original pointee type, now in the generic
// (addrspace 0) address space.
llvm::PointerType *elTyPt0 = llvm::PointerType::get(elTy, 0);
value = ctx->BitCastInst(value, elTyPt0, "gep2gen_cast3");
return value;
}
void
DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
@@ -205,7 +261,22 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
return;
}
if (sym->storageClass == SC_STATIC) {
if (g->target->getISA() == Target::NVPTX && !sym->type->IsConstType())
PerformanceWarning(sym->pos,
"Non-constant static variable ""\"%s\" is stored in __global address sace with ""\"nvptx\" target.",
sym->name.c_str());
if (g->target->getISA() == Target::NVPTX && sym->type->IsVaryingType())
PerformanceWarning(sym->pos,
"\"const static varying\" variable ""\"%s\" is stored in __global address space with ""\"nvptx\" target.",
sym->name.c_str());
if (g->target->getISA() == Target::NVPTX && sym->type->IsUniformType())
PerformanceWarning(sym->pos,
"\"const static uniform\" variable ""\"%s\" is stored in __constant address space with ""\"nvptx\" target.",
sym->name.c_str());
// For static variables, we need a compile-time constant value
// for its initializer; if there's no initializer, we use a
// zero value.
@@ -233,19 +304,97 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
if (cinit == NULL)
cinit = llvm::Constant::getNullValue(llvmType);
int addressSpace = 0;
if (g->target->getISA() == Target::NVPTX &&
sym->type->IsConstType() &&
sym->type->IsUniformType())
addressSpace = 4;
// Allocate space for the static variable in global scope, so
// that it persists across function calls
sym->storagePtr =
new llvm::GlobalVariable(*m->module, llvmType,
sym->type->IsConstType(),
llvm::GlobalValue::InternalLinkage, cinit,
llvm::Twine("static.") +
llvm::Twine("static_") +
llvm::Twine(sym->pos.first_line) +
llvm::Twine(".") + sym->name.c_str());
llvm::Twine("_") + sym->name.c_str(),
NULL,
llvm::GlobalVariable::NotThreadLocal,
addressSpace);
sym->storagePtr = lConvertToGenericPtr(ctx, sym->storagePtr, sym->pos);
// Tell the FunctionEmitContext about the variable
ctx->EmitVariableDebugInfo(sym);
}
else {
else if ((sym->type->IsUniformType() || sym->type->IsSOAType()) &&
/* NVPTX:
* only non-constant uniform data types are stored in shared memory
* constant uniform are automatically promoted to varying
*/
!sym->type->IsConstType() &&
#if 1
sym->type->IsArrayType() &&
#endif
g->target->getISA() == Target::NVPTX)
{
PerformanceWarning(sym->pos,
"Non-constant \"uniform\" data types might be slow with \"nvptx\" target. "
"Unless data sharing between program instances is desired, try \"const [static] uniform\", \"varying\" or \"uniform new uniform \"+\"delete\" if possible.");
/* with __shared__ memory everything must be an array */
int nel = 4;
ArrayType *nat;
bool variable = true;
if (sym->type->IsArrayType())
{
const ArrayType *at = CastType<ArrayType>(sym->type);
/* we must scale # elements by 4, because a thread-block will run 4 warps
* or 128 threads.
* ***note-to-me***:please define these value (128threads/4warps)
* in nvptx-target definition
* instead of compile-time constants
*/
nel *= at->GetElementCount();
if (sym->type->IsSOAType())
nel *= sym->type->GetSOAWidth();
nat = new ArrayType(at->GetElementType(), nel);
variable = false;
}
else
nat = new ArrayType(sym->type, nel);
llvm::Type *llvmTypeUn = nat->LLVMType(g->ctx);
llvm::Constant *cinit = llvm::UndefValue::get(llvmTypeUn);
sym->storagePtr =
new llvm::GlobalVariable(*m->module, llvmTypeUn,
sym->type->IsConstType(),
llvm::GlobalValue::InternalLinkage,
cinit,
llvm::Twine("local_") +
llvm::Twine(sym->pos.first_line) +
llvm::Twine("_") + sym->name.c_str(),
NULL,
llvm::GlobalVariable::NotThreadLocal,
/*AddressSpace=*/3);
sym->storagePtr = lConvertToGenericPtr(ctx, sym->storagePtr, sym->pos, variable);
llvm::PointerType *ptrTy = llvm::PointerType::get(sym->type->LLVMType(g->ctx),0);
sym->storagePtr = ctx->BitCastInst(sym->storagePtr, ptrTy, "uniform_decl");
// Tell the FunctionEmitContext about the variable; must do
// this before the initializer stuff.
ctx->EmitVariableDebugInfo(sym);
if (initExpr == 0 && sym->type->IsConstType())
Error(sym->pos, "Missing initializer for const variable "
"\"%s\".", sym->name.c_str());
// And then get it initialized...
sym->parentFunction = ctx->GetFunction();
InitSymbol(sym->storagePtr, sym->type, initExpr, ctx, sym->pos);
}
else
{
// For non-static variables, allocate storage on the stack
sym->storagePtr = ctx->AllocaInst(llvmType, sym->name.c_str());
@@ -253,10 +402,14 @@ DeclStmt::EmitCode(FunctionEmitContext *ctx) const {
// this before the initializer stuff.
ctx->EmitVariableDebugInfo(sym);
if (initExpr == 0 && sym->type->IsConstType())
Error(sym->pos, "Missing initializer for const variable "
"\"%s\".", sym->name.c_str());
// And then get it initialized...
sym->parentFunction = ctx->GetFunction();
InitSymbol(sym->storagePtr, sym->type, initExpr, ctx, sym->pos);
}
}
}
}
@@ -415,6 +568,19 @@ IfStmt::EmitCode(FunctionEmitContext *ctx) const {
if (testValue == NULL)
return;
#if 0
if (!isUniform && g->target->getISA() == Target::NVPTX)
{
/* With "nvptx" target, SIMT hardware takes care of non-uniform
* control flow. We trick ISPC to generate uniform control flow.
*/
testValue = ctx->ExtractInst(testValue, 0);
isUniform = true;
}
#endif
if (isUniform) {
ctx->StartUniformIf();
if (doAllCheck)
@@ -696,7 +862,11 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask,
// Do any of the program instances want to run the 'true'
// block? If not, jump ahead to bNext.
#if 1
llvm::Value *maskAnyTrueQ = ctx->Any(ctx->GetFullMask());
#else
llvm::Value *maskAnyTrueQ = ctx->ExtractInst(ctx->GetFullMask(),0);
#endif
ctx->BranchInst(bRunTrue, bNext, maskAnyTrueQ);
// Emit statements for true
@@ -713,7 +883,11 @@ IfStmt::emitMaskMixed(FunctionEmitContext *ctx, llvm::Value *oldMask,
// Similarly, check to see if any of the instances want to
// run the 'false' block...
#if 1
llvm::Value *maskAnyFalseQ = ctx->Any(ctx->GetFullMask());
#else
llvm::Value *maskAnyFalseQ = ctx->ExtractInst(ctx->GetFullMask(),0);
#endif
ctx->BranchInst(bRunFalse, bDone, maskAnyFalseQ);
// Emit code for false
@@ -1273,7 +1447,10 @@ static llvm::Value *
lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx,
llvm::Value *uniformCounterPtr,
llvm::Value *varyingCounterPtr,
const std::vector<int> &spans) {
const std::vector<int> &spans)
{
if (g->target->getISA() != Target::NVPTX)
{
// Smear the uniform counter value out to be varying
llvm::Value *counter = ctx->LoadInst(uniformCounterPtr);
llvm::Value *smearCounter = ctx->BroadcastValue(
@@ -1306,6 +1483,93 @@ lUpdateVaryingCounter(int dim, int nDims, FunctionEmitContext *ctx,
LLVMInt32Vector(delta), "iter_val");
ctx->StoreInst(varyingCounter, varyingCounterPtr);
return varyingCounter;
}
else /* NVPTX == true */
{
// Smear the uniform counter value out to be varying
llvm::Value *counter = ctx->LoadInst(uniformCounterPtr);
llvm::Value *smearCounter = ctx->BroadcastValue(
counter, LLVMTypes::Int32VectorType, "smear_counter");
// Figure out the offsets; this is a little bit tricky. As an example,
// consider a 2D tiled foreach loop, where we're running 8-wide and
// where the inner dimension has a stride of 4 and the outer dimension
// has a stride of 2. For the inner dimension, we want the offsets
// (0,1,2,3,0,1,2,3), and for the outer dimension we want
// (0,0,0,0,1,1,1,1).
int32_t delta[ISPC_MAX_NVEC];
const int vecWidth = 32;
std::vector<llvm::Constant*> constDeltaList;
for (int i = 0; i < vecWidth; ++i)
{
int d = i;
// First, account for the effect of any dimensions at deeper
// nesting levels than the current one.
int prevDimSpanCount = 1;
for (int j = dim; j < nDims-1; ++j)
prevDimSpanCount *= spans[j+1];
d /= prevDimSpanCount;
// And now with what's left, figure out our own offset
delta[i] = d % spans[dim];
constDeltaList.push_back(LLVMInt8(delta[i]));
}
llvm::ArrayType* ArrayDelta = llvm::ArrayType::get(LLVMTypes::Int8Type, 32);
// llvm::PointerType::get(ArrayDelta, 4); /* constant memory */
llvm::GlobalVariable* globalDelta = new llvm::GlobalVariable(
/*Module=*/*m->module,
/*Type=*/ArrayDelta,
/*isConstant=*/true,
/*Linkage=*/llvm::GlobalValue::PrivateLinkage,
/*Initializer=*/0, // has initializer, specified below
/*Name=*/"constDeltaForeach");
#if 0
/*ThreadLocalMode=*/llvm::GlobalVariable::NotThreadLocal,
/*unsigned AddressSpace=*/4 /*constant*/);
#endif
llvm::Constant* constDelta = llvm::ConstantArray::get(ArrayDelta, constDeltaList);
globalDelta->setInitializer(constDelta);
llvm::Function *func_program_index = m->module->getFunction("__program_index");
llvm::Value *laneIdx = ctx->CallInst(func_program_index, NULL, std::vector<llvm::Value*>(), "foreach__programIndex");
std::vector<llvm::Value*> ptr_arrayidx_indices;
ptr_arrayidx_indices.push_back(LLVMInt32(0));
ptr_arrayidx_indices.push_back(laneIdx);
#if 1
llvm::Instruction* ptr_arrayidx = llvm::GetElementPtrInst::Create(globalDelta, ptr_arrayidx_indices, "arrayidx", ctx->GetCurrentBasicBlock());
llvm::LoadInst* int8_39 = new llvm::LoadInst(ptr_arrayidx, "", false, ctx->GetCurrentBasicBlock());
llvm::Value * int32_39 = ctx->ZExtInst(int8_39, LLVMTypes::Int32Type);
llvm::VectorType* VectorTy_2 = llvm::VectorType::get(llvm::IntegerType::get(*g->ctx, 32), 1);
llvm::UndefValue* const_packed_41 = llvm::UndefValue::get(VectorTy_2);
llvm::InsertElementInst* packed_43 = llvm::InsertElementInst::Create(
// llvm::UndefValue(LLVMInt32Vector),
const_packed_41,
int32_39, LLVMInt32(0), "", ctx->GetCurrentBasicBlock());
#endif
// Add the deltas to compute the varying counter values; store the
// result to memory and then return it directly as well.
#if 0
llvm::Value *varyingCounter =
ctx->BinaryOperator(llvm::Instruction::Add, smearCounter,
LLVMInt32Vector(delta), "iter_val");
#else
llvm::Value *varyingCounter =
ctx->BinaryOperator(llvm::Instruction::Add, smearCounter,
packed_43, "iter_val");
#endif
ctx->StoreInst(varyingCounter, varyingCounterPtr);
return varyingCounter;
}
}
@@ -1383,7 +1647,7 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
// This should be caught during typechecking
AssertPos(pos, startExprs.size() == dimVariables.size() &&
endExprs.size() == dimVariables.size());
endExprs.size() == dimVariables.size());
int nDims = (int)dimVariables.size();
///////////////////////////////////////////////////////////////////////
@@ -1394,64 +1658,66 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
std::vector<llvm::Value *> nExtras, alignedEnd, extrasMaskPtrs;
std::vector<int> span(nDims, 0);
lGetSpans(nDims-1, nDims, g->target->getVectorWidth(), isTiled, &span[0]);
const int vectorWidth =
g->target->getISA() == Target::NVPTX ? 32 : g->target->getVectorWidth();
lGetSpans(nDims-1, nDims, vectorWidth, isTiled, &span[0]);
for (int i = 0; i < nDims; ++i) {
// Basic blocks that we'll fill in later with the looping logic for
// this dimension.
bbReset.push_back(ctx->CreateBasicBlock("foreach_reset"));
if (i < nDims-1)
// stepping for the innermost dimension is handled specially
bbStep.push_back(ctx->CreateBasicBlock("foreach_step"));
bbTest.push_back(ctx->CreateBasicBlock("foreach_test"));
// Basic blocks that we'll fill in later with the looping logic for
// this dimension.
bbReset.push_back(ctx->CreateBasicBlock("foreach_reset"));
if (i < nDims-1)
// stepping for the innermost dimension is handled specially
bbStep.push_back(ctx->CreateBasicBlock("foreach_step"));
bbTest.push_back(ctx->CreateBasicBlock("foreach_test"));
// Start and end value for this loop dimension
llvm::Value *sv = startExprs[i]->GetValue(ctx);
llvm::Value *ev = endExprs[i]->GetValue(ctx);
if (sv == NULL || ev == NULL)
return;
startVals.push_back(sv);
endVals.push_back(ev);
// Start and end value for this loop dimension
llvm::Value *sv = startExprs[i]->GetValue(ctx);
llvm::Value *ev = endExprs[i]->GetValue(ctx);
if (sv == NULL || ev == NULL)
return;
startVals.push_back(sv);
endVals.push_back(ev);
// nItems = endVal - startVal
llvm::Value *nItems =
ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems");
// nItems = endVal - startVal
llvm::Value *nItems =
ctx->BinaryOperator(llvm::Instruction::Sub, ev, sv, "nitems");
// nExtras = nItems % (span for this dimension)
// This gives us the number of extra elements we need to deal with
// at the end of the loop for this dimension that don't fit cleanly
// into a vector width.
nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems,
LLVMInt32(span[i]), "nextras"));
// nExtras = nItems % (span for this dimension)
// This gives us the number of extra elements we need to deal with
// at the end of the loop for this dimension that don't fit cleanly
// into a vector width.
nExtras.push_back(ctx->BinaryOperator(llvm::Instruction::SRem, nItems,
LLVMInt32(span[i]), "nextras"));
// alignedEnd = endVal - nExtras
alignedEnd.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev,
nExtras[i], "aligned_end"));
// alignedEnd = endVal - nExtras
alignedEnd.push_back(ctx->BinaryOperator(llvm::Instruction::Sub, ev,
nExtras[i], "aligned_end"));
///////////////////////////////////////////////////////////////////////
// Each dimension has a loop counter that is a uniform value that
// goes from startVal to endVal, in steps of the span for this
// dimension. Its value is only used internally here for looping
// logic and isn't directly available in the user's program code.
uniformCounterPtrs.push_back(ctx->AllocaInst(LLVMTypes::Int32Type,
"counter"));
ctx->StoreInst(startVals[i], uniformCounterPtrs[i]);
///////////////////////////////////////////////////////////////////////
// Each dimension has a loop counter that is a uniform value that
// goes from startVal to endVal, in steps of the span for this
// dimension. Its value is only used internally here for looping
// logic and isn't directly available in the user's program code.
uniformCounterPtrs.push_back(ctx->AllocaInst(LLVMTypes::Int32Type,
"counter"));
ctx->StoreInst(startVals[i], uniformCounterPtrs[i]);
// There is also a varying variable that holds the set of index
// values for each dimension in the current loop iteration; this is
// the value that is program-visible.
dimVariables[i]->storagePtr =
ctx->AllocaInst(LLVMTypes::Int32VectorType,
dimVariables[i]->name.c_str());
dimVariables[i]->parentFunction = ctx->GetFunction();
ctx->EmitVariableDebugInfo(dimVariables[i]);
// There is also a varying variable that holds the set of index
// values for each dimension in the current loop iteration; this is
// the value that is program-visible.
dimVariables[i]->storagePtr =
ctx->AllocaInst(LLVMTypes::Int32VectorType,
dimVariables[i]->name.c_str());
dimVariables[i]->parentFunction = ctx->GetFunction();
ctx->EmitVariableDebugInfo(dimVariables[i]);
// Each dimension also maintains a mask that represents which of
// the varying elements in the current iteration should be
// processed. (i.e. this is used to disable the lanes that have
// out-of-bounds offsets.)
extrasMaskPtrs.push_back(ctx->AllocaInst(LLVMTypes::MaskType, "extras mask"));
ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]);
// Each dimension also maintains a mask that represents which of
// the varying elements in the current iteration should be
// processed. (i.e. this is used to disable the lanes that have
// out-of-bounds offsets.)
extrasMaskPtrs.push_back(ctx->AllocaInst(LLVMTypes::MaskType, "extras mask"));
ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]);
}
ctx->StartForeach(FunctionEmitContext::FOREACH_REGULAR);
@@ -1464,14 +1730,14 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
// a given dimension in preparation for running through its loop again,
// after the enclosing level advances its counter.
for (int i = 0; i < nDims; ++i) {
ctx->SetCurrentBasicBlock(bbReset[i]);
if (i == 0)
ctx->BranchInst(bbExit);
else {
ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]);
ctx->StoreInst(startVals[i], uniformCounterPtrs[i]);
ctx->BranchInst(bbStep[i-1]);
}
ctx->SetCurrentBasicBlock(bbReset[i]);
if (i == 0)
ctx->BranchInst(bbExit);
else {
ctx->StoreInst(LLVMMaskAllOn, extrasMaskPtrs[i]);
ctx->StoreInst(startVals[i], uniformCounterPtrs[i]);
ctx->BranchInst(bbStep[i-1]);
}
}
///////////////////////////////////////////////////////////////////////////
@@ -1481,67 +1747,67 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
// this for the innermost dimension, which has a more complex stepping
// structure..
for (int i = 0; i < nDims-1; ++i) {
ctx->SetCurrentBasicBlock(bbStep[i]);
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]);
llvm::Value *newCounter =
ctx->BinaryOperator(llvm::Instruction::Add, counter,
LLVMInt32(span[i]), "new_counter");
ctx->StoreInst(newCounter, uniformCounterPtrs[i]);
ctx->BranchInst(bbTest[i]);
ctx->SetCurrentBasicBlock(bbStep[i]);
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i]);
llvm::Value *newCounter =
ctx->BinaryOperator(llvm::Instruction::Add, counter,
LLVMInt32(span[i]), "new_counter");
ctx->StoreInst(newCounter, uniformCounterPtrs[i]);
ctx->BranchInst(bbTest[i]);
}
///////////////////////////////////////////////////////////////////////////
// foreach_test (for all dimensions other than the innermost...)
std::vector<llvm::Value *> inExtras;
for (int i = 0; i < nDims-1; ++i) {
ctx->SetCurrentBasicBlock(bbTest[i]);
ctx->SetCurrentBasicBlock(bbTest[i]);
llvm::Value *haveExtras =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT,
endVals[i], alignedEnd[i], "have_extras");
llvm::Value *haveExtras =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SGT,
endVals[i], alignedEnd[i], "have_extras");
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i], "counter");
llvm::Value *atAlignedEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
counter, alignedEnd[i], "at_aligned_end");
llvm::Value *inEx =
ctx->BinaryOperator(llvm::Instruction::And, haveExtras,
atAlignedEnd, "in_extras");
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[i], "counter");
llvm::Value *atAlignedEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ,
counter, alignedEnd[i], "at_aligned_end");
llvm::Value *inEx =
ctx->BinaryOperator(llvm::Instruction::And, haveExtras,
atAlignedEnd, "in_extras");
if (i == 0)
inExtras.push_back(inEx);
else
inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx,
inExtras[i-1], "in_extras_all"));
if (i == 0)
inExtras.push_back(inEx);
else
inExtras.push_back(ctx->BinaryOperator(llvm::Instruction::Or, inEx,
inExtras[i-1], "in_extras_all"));
llvm::Value *varyingCounter =
lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i],
dimVariables[i]->storagePtr, span);
llvm::Value *varyingCounter =
lUpdateVaryingCounter(i, nDims, ctx, uniformCounterPtrs[i],
dimVariables[i]->storagePtr, span);
llvm::Value *smearEnd = ctx->BroadcastValue(
endVals[i], LLVMTypes::Int32VectorType, "smear_end");
llvm::Value *smearEnd = ctx->BroadcastValue(
endVals[i], LLVMTypes::Int32VectorType, "smear_end");
// Do a vector compare of its value to the end value to generate a
// mask for this last bit of work.
llvm::Value *emask =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
varyingCounter, smearEnd);
emask = ctx->I1VecToBoolVec(emask);
// Do a vector compare of its value to the end value to generate a
// mask for this last bit of work.
llvm::Value *emask =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
varyingCounter, smearEnd);
emask = ctx->I1VecToBoolVec(emask);
if (i == 0)
ctx->StoreInst(emask, extrasMaskPtrs[i]);
else {
llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]);
llvm::Value *newMask =
ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask,
"extras_mask");
ctx->StoreInst(newMask, extrasMaskPtrs[i]);
}
if (i == 0)
ctx->StoreInst(emask, extrasMaskPtrs[i]);
else {
llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[i-1]);
llvm::Value *newMask =
ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask,
"extras_mask");
ctx->StoreInst(newMask, extrasMaskPtrs[i]);
}
llvm::Value *notAtEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
counter, endVals[i]);
ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd);
llvm::Value *notAtEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
counter, endVals[i]);
ctx->BranchInst(bbTest[i+1], bbReset[i], notAtEnd);
}
///////////////////////////////////////////////////////////////////////////
@@ -1578,18 +1844,18 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
// (i.e. processing extra elements that don't exactly fit into a
// vector).
llvm::BasicBlock *bbOuterInExtras =
ctx->CreateBasicBlock("outer_in_extras");
ctx->CreateBasicBlock("outer_in_extras");
llvm::BasicBlock *bbOuterNotInExtras =
ctx->CreateBasicBlock("outer_not_in_extras");
ctx->CreateBasicBlock("outer_not_in_extras");
ctx->SetCurrentBasicBlock(bbTest[nDims-1]);
if (inExtras.size())
ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras,
inExtras.back());
ctx->BranchInst(bbOuterInExtras, bbOuterNotInExtras,
inExtras.back());
else
// for a 1D iteration domain, we certainly don't have any enclosing
// dimensions that are processing extra elements.
ctx->BranchInst(bbOuterNotInExtras);
// for a 1D iteration domain, we certainly don't have any enclosing
// dimensions that are processing extra elements.
ctx->BranchInst(bbOuterNotInExtras);
///////////////////////////////////////////////////////////////////////////
// One or more outer dimensions in extras, so we need to mask for the loop
@@ -1604,21 +1870,21 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
// // run loop body with mask
// }
llvm::BasicBlock *bbAllInnerPartialOuter =
ctx->CreateBasicBlock("all_inner_partial_outer");
ctx->CreateBasicBlock("all_inner_partial_outer");
llvm::BasicBlock *bbPartial =
ctx->CreateBasicBlock("both_partial");
ctx->CreateBasicBlock("both_partial");
ctx->SetCurrentBasicBlock(bbOuterInExtras); {
// Update the varying counter value here, since all subsequent
// blocks along this path need it.
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
dimVariables[nDims-1]->storagePtr, span);
// Update the varying counter value here, since all subsequent
// blocks along this path need it.
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
dimVariables[nDims-1]->storagePtr, span);
// here we just check to see if counter < alignedEnd
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
llvm::Value *beforeAlignedEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
counter, alignedEnd[nDims-1], "before_aligned_end");
ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd);
// here we just check to see if counter < alignedEnd
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
llvm::Value *beforeAlignedEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
counter, alignedEnd[nDims-1], "before_aligned_end");
ctx->BranchInst(bbAllInnerPartialOuter, bbPartial, beforeAlignedEnd);
}
// Below we have a basic block that runs the loop body code for the
@@ -1637,53 +1903,53 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
// should step the loop counter for the next enclosing dimension
// instead.
llvm::Value *stepIndexAfterMaskedBodyPtr =
ctx->AllocaInst(LLVMTypes::BoolType, "step_index");
ctx->AllocaInst(LLVMTypes::BoolType, "step_index");
///////////////////////////////////////////////////////////////////////////
// We're in the inner loop part where the only masking is due to outer
// dimensions but the innermost dimension fits fully into a vector's
// width. Set the mask and jump to the masked loop body.
ctx->SetCurrentBasicBlock(bbAllInnerPartialOuter); {
llvm::Value *mask;
if (nDims == 1)
// 1D loop; we shouldn't ever get here anyway
mask = LLVMMaskAllOff;
else
mask = ctx->LoadInst(extrasMaskPtrs[nDims-2]);
llvm::Value *mask;
if (nDims == 1)
// 1D loop; we shouldn't ever get here anyway
mask = LLVMMaskAllOff;
else
mask = ctx->LoadInst(extrasMaskPtrs[nDims-2]);
ctx->SetInternalMask(mask);
ctx->SetInternalMask(mask);
ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr);
ctx->BranchInst(bbMaskedBody);
ctx->StoreInst(LLVMTrue, stepIndexAfterMaskedBodyPtr);
ctx->BranchInst(bbMaskedBody);
}
///////////////////////////////////////////////////////////////////////////
// We need to include the effect of the innermost dimension in the mask
// for the final bits here
ctx->SetCurrentBasicBlock(bbPartial); {
llvm::Value *varyingCounter =
ctx->LoadInst(dimVariables[nDims-1]->storagePtr);
llvm::Value *smearEnd = ctx->BroadcastValue(
endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end");
llvm::Value *varyingCounter =
ctx->LoadInst(dimVariables[nDims-1]->storagePtr);
llvm::Value *smearEnd = ctx->BroadcastValue(
endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end");
llvm::Value *emask =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
varyingCounter, smearEnd);
emask = ctx->I1VecToBoolVec(emask);
llvm::Value *emask =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
varyingCounter, smearEnd);
emask = ctx->I1VecToBoolVec(emask);
if (nDims == 1) {
ctx->SetInternalMask(emask);
}
else {
llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]);
llvm::Value *newMask =
ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask,
"extras_mask");
ctx->SetInternalMask(newMask);
}
if (nDims == 1) {
ctx->SetInternalMask(emask);
}
else {
llvm::Value *oldMask = ctx->LoadInst(extrasMaskPtrs[nDims-2]);
llvm::Value *newMask =
ctx->BinaryOperator(llvm::Instruction::And, oldMask, emask,
"extras_mask");
ctx->SetInternalMask(newMask);
}
ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr);
ctx->BranchInst(bbMaskedBody);
ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr);
ctx->BranchInst(bbMaskedBody);
}
///////////////////////////////////////////////////////////////////////////
@@ -1699,14 +1965,14 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
// // run loop body with mask
// }
llvm::BasicBlock *bbPartialInnerAllOuter =
ctx->CreateBasicBlock("partial_inner_all_outer");
ctx->CreateBasicBlock("partial_inner_all_outer");
ctx->SetCurrentBasicBlock(bbOuterNotInExtras); {
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
llvm::Value *beforeAlignedEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
counter, alignedEnd[nDims-1], "before_aligned_end");
ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter,
beforeAlignedEnd);
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
llvm::Value *beforeAlignedEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
counter, alignedEnd[nDims-1], "before_aligned_end");
ctx->BranchInst(bbFullBody, bbPartialInnerAllOuter,
beforeAlignedEnd);
}
///////////////////////////////////////////////////////////////////////////
@@ -1716,26 +1982,26 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
// value of the varying loop counter and have the statements in the
// loop body emit their code.
llvm::BasicBlock *bbFullBodyContinue =
ctx->CreateBasicBlock("foreach_full_continue");
ctx->CreateBasicBlock("foreach_full_continue");
ctx->SetCurrentBasicBlock(bbFullBody); {
ctx->SetInternalMask(LLVMMaskAllOn);
ctx->SetBlockEntryMask(LLVMMaskAllOn);
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
dimVariables[nDims-1]->storagePtr, span);
ctx->SetContinueTarget(bbFullBodyContinue);
ctx->AddInstrumentationPoint("foreach loop body (all on)");
stmts->EmitCode(ctx);
AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL);
ctx->BranchInst(bbFullBodyContinue);
ctx->SetInternalMask(LLVMMaskAllOn);
ctx->SetBlockEntryMask(LLVMMaskAllOn);
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
dimVariables[nDims-1]->storagePtr, span);
ctx->SetContinueTarget(bbFullBodyContinue);
ctx->AddInstrumentationPoint("foreach loop body (all on)");
stmts->EmitCode(ctx);
AssertPos(pos, ctx->GetCurrentBasicBlock() != NULL);
ctx->BranchInst(bbFullBodyContinue);
}
ctx->SetCurrentBasicBlock(bbFullBodyContinue); {
ctx->RestoreContinuedLanes();
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]);
llvm::Value *newCounter =
ctx->BinaryOperator(llvm::Instruction::Add, counter,
LLVMInt32(span[nDims-1]), "new_counter");
ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]);
ctx->BranchInst(bbOuterNotInExtras);
ctx->RestoreContinuedLanes();
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]);
llvm::Value *newCounter =
ctx->BinaryOperator(llvm::Instruction::Add, counter,
LLVMInt32(span[nDims-1]), "new_counter");
ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]);
ctx->BranchInst(bbOuterNotInExtras);
}
///////////////////////////////////////////////////////////////////////////
@@ -1743,33 +2009,33 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
// less than the end value, in which case we need to run the body one
// more time to get the extra bits.
llvm::BasicBlock *bbSetInnerMask =
ctx->CreateBasicBlock("partial_inner_only");
ctx->CreateBasicBlock("partial_inner_only");
ctx->SetCurrentBasicBlock(bbPartialInnerAllOuter); {
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
llvm::Value *beforeFullEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
counter, endVals[nDims-1], "before_full_end");
ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd);
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1], "counter");
llvm::Value *beforeFullEnd =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
counter, endVals[nDims-1], "before_full_end");
ctx->BranchInst(bbSetInnerMask, bbReset[nDims-1], beforeFullEnd);
}
///////////////////////////////////////////////////////////////////////////
// The outer dimensions are all on, so the mask is just given by the
// mask for the innermost dimension
ctx->SetCurrentBasicBlock(bbSetInnerMask); {
llvm::Value *varyingCounter =
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
dimVariables[nDims-1]->storagePtr, span);
llvm::Value *smearEnd = ctx->BroadcastValue(
endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end");
llvm::Value *emask =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
varyingCounter, smearEnd);
emask = ctx->I1VecToBoolVec(emask);
ctx->SetInternalMask(emask);
ctx->SetBlockEntryMask(emask);
llvm::Value *varyingCounter =
lUpdateVaryingCounter(nDims-1, nDims, ctx, uniformCounterPtrs[nDims-1],
dimVariables[nDims-1]->storagePtr, span);
llvm::Value *smearEnd = ctx->BroadcastValue(
endVals[nDims-1], LLVMTypes::Int32VectorType, "smear_end");
llvm::Value *emask =
ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_SLT,
varyingCounter, smearEnd);
emask = ctx->I1VecToBoolVec(emask);
ctx->SetInternalMask(emask);
ctx->SetBlockEntryMask(emask);
ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr);
ctx->BranchInst(bbMaskedBody);
ctx->StoreInst(LLVMFalse, stepIndexAfterMaskedBodyPtr);
ctx->BranchInst(bbMaskedBody);
}
///////////////////////////////////////////////////////////////////////////
@@ -1779,34 +2045,34 @@ ForeachStmt::EmitCode(FunctionEmitContext *ctx) const {
// mask known to be all-on, which in turn leads to more efficient code
// for that case.
llvm::BasicBlock *bbStepInnerIndex =
ctx->CreateBasicBlock("step_inner_index");
ctx->CreateBasicBlock("step_inner_index");
llvm::BasicBlock *bbMaskedBodyContinue =
ctx->CreateBasicBlock("foreach_masked_continue");
ctx->CreateBasicBlock("foreach_masked_continue");
ctx->SetCurrentBasicBlock(bbMaskedBody); {
ctx->AddInstrumentationPoint("foreach loop body (masked)");
ctx->SetContinueTarget(bbMaskedBodyContinue);
ctx->DisableGatherScatterWarnings();
ctx->SetBlockEntryMask(ctx->GetFullMask());
stmts->EmitCode(ctx);
ctx->EnableGatherScatterWarnings();
ctx->BranchInst(bbMaskedBodyContinue);
ctx->AddInstrumentationPoint("foreach loop body (masked)");
ctx->SetContinueTarget(bbMaskedBodyContinue);
ctx->DisableGatherScatterWarnings();
ctx->SetBlockEntryMask(ctx->GetFullMask());
stmts->EmitCode(ctx);
ctx->EnableGatherScatterWarnings();
ctx->BranchInst(bbMaskedBodyContinue);
}
ctx->SetCurrentBasicBlock(bbMaskedBodyContinue); {
ctx->RestoreContinuedLanes();
llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr);
ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex);
ctx->RestoreContinuedLanes();
llvm::Value *stepIndex = ctx->LoadInst(stepIndexAfterMaskedBodyPtr);
ctx->BranchInst(bbStepInnerIndex, bbReset[nDims-1], stepIndex);
}
///////////////////////////////////////////////////////////////////////////
// step the innermost index, for the case where we're doing the
// innermost for loop over full vectors.
ctx->SetCurrentBasicBlock(bbStepInnerIndex); {
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]);
llvm::Value *newCounter =
ctx->BinaryOperator(llvm::Instruction::Add, counter,
LLVMInt32(span[nDims-1]), "new_counter");
ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]);
ctx->BranchInst(bbOuterInExtras);
llvm::Value *counter = ctx->LoadInst(uniformCounterPtrs[nDims-1]);
llvm::Value *newCounter =
ctx->BinaryOperator(llvm::Instruction::Add, counter,
LLVMInt32(span[nDims-1]), "new_counter");
ctx->StoreInst(newCounter, uniformCounterPtrs[nDims-1]);
ctx->BranchInst(bbOuterInExtras);
}
///////////////////////////////////////////////////////////////////////////
@@ -1993,7 +2259,8 @@ ForeachActiveStmt::EmitCode(FunctionEmitContext *ctx) const {
// math...)
// Get the "program index" vector value
llvm::Value *programIndex = ctx->ProgramIndexVector();
llvm::Value *programIndex = g->target->getISA() == Target::NVPTX ?
ctx->ProgramIndexVectorPTX() : ctx->ProgramIndexVector();
// And smear the current lane out to a vector
llvm::Value *firstSet32 =
@@ -2189,10 +2456,19 @@ ForeachUniqueStmt::EmitCode(FunctionEmitContext *ctx) const {
// And load the corresponding element value from the temporary
// memory storing the value of the varying expr.
llvm::Value *uniqueValuePtr =
llvm::Value *uniqueValue;
if (g->target->getISA() != Target::NVPTX)
{
llvm::Value *uniqueValuePtr =
ctx->GetElementPtrInst(exprMem, LLVMInt64(0), firstSet, exprPtrType,
"unique_index_ptr");
llvm::Value *uniqueValue = ctx->LoadInst(uniqueValuePtr, "unique_value");
"unique_index_ptr");
uniqueValue = ctx->LoadInst(uniqueValuePtr, "unique_value");
}
else /* in case of PTX target, use __shfl PTX intrinsics via __insert/__extract function */
{
llvm::Value *firstSet32 = ctx->TruncInst(firstSet, LLVMTypes::Int32Type);
uniqueValue = ctx->Extract(exprValue, firstSet32);
}
// If it's a varying pointer type, need to convert from the int
// type we store in the vector to the actual pointer type
@@ -3100,7 +3376,8 @@ PrintStmt::EmitCode(FunctionEmitContext *ctx) const {
}
// Now we can emit code to call __do_print()
llvm::Function *printFunc = m->module->getFunction("__do_print");
llvm::Function *printFunc = g->target->getISA() != Target::NVPTX ?
m->module->getFunction("__do_print") : m->module->getFunction("__do_print_nvptx");
AssertPos(pos, printFunc);
llvm::Value *mask = ctx->GetFullMask();

440
test_static_cuda.cpp Normal file
View File

@@ -0,0 +1,440 @@
/*
Copyright (c) 2010-2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if defined(_WIN32) || defined(_WIN64)
#define ISPC_IS_WINDOWS
#elif defined(__linux__)
#define ISPC_IS_LINUX
#elif defined(__APPLE__)
#define ISPC_IS_APPLE
#endif
#ifdef ISPC_IS_WINDOWS
#include <windows.h>
#endif // ISPC_IS_WINDOWS
#include <cassert>
#include <cstring>
#include <cstdio>
#include <cstdint>
#ifdef ISPC_IS_LINUX
#include <malloc.h>
#endif
/******************************/
#include <cassert>
#include <iostream>
#include <cuda.h>
#include "drvapi_error_string.h"
#include "ispc_malloc.h"
#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__)
// These are the inline versions for all of the SDK helper functions
// Checks a CUDA driver API result code. On failure, reports the numeric
// error, its human-readable string, and the call site to stderr, then
// terminates the process with a non-zero exit status.
// (Fixes: message previously named a nonexistent "checkCudeErrors()" and
// left the "<file" bracket unbalanced.)
void __checkCudaErrors(CUresult err, const char *file, const int line) {
    if (CUDA_SUCCESS != err) {
        std::cerr << "checkCudaErrors() Driver API error = " << err << " \""
                  << getCudaDrvErrorString(err) << "\" from file <" << file
                  << ">, line " << line << "\n";
        exit(-1);
    }
}
/******************************/
/**** Basic CUDriver API ****/
/******************************/
CUcontext context;
// Initializes the CUDA driver API and creates a context on the requested
// device, falling back to device 0 when deviceId is out of range.
// Exits the process if no device exists or the selected device is older
// than SM 2.0. The context is stored in the global `context`.
//   deviceId -- zero-based device index to use
//   verbose  -- when set, report the chosen device and its capability
static void createContext(const int deviceId = 0, const bool verbose = true)
{
    CUdevice device;
    int devCount;
    checkCudaErrors(cuInit(0));
    checkCudaErrors(cuDeviceGetCount(&devCount));
    assert(devCount > 0);
    // Fall back to device 0 when the requested index is out of range.
    const int useId = deviceId < devCount ? deviceId : 0;
    checkCudaErrors(cuDeviceGet(&device, useId));

    char name[128];
    checkCudaErrors(cuDeviceGetName(name, 128, device));
    if (verbose)
        // Report the device actually selected (was hard-coded "[0]").
        std::cout << "Using CUDA Device [" << useId << "]: " << name << "\n";

    int devMajor, devMinor;
    checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device));
    if (verbose)
        std::cout << "Device Compute Capability: "
                  << devMajor << "." << devMinor << "\n";
    if (devMajor < 2) {
        // Fatal: report unconditionally -- previously the message was
        // suppressed in quiet mode even though we still exited.
        std::cerr << "ERROR: Device " << useId << " is not SM 2.0 or greater\n";
        exit(1);
    }

    // Create driver context
    checkCudaErrors(cuCtxCreate(&context, 0, device));
}
// Destroys the global CUDA driver context created by createContext().
// Must be called after all device work and frees are complete.
static void destroyContext()
{
    checkCudaErrors(cuCtxDestroy(context));
}
// JIT-links a PTX image (the NUL-terminated string `module` -- PTX text,
// not a filename) against the CUDA device runtime library and loads the
// resulting cubin, returning the loaded CUmodule.
//   module        -- PTX source text
//   maxrregcount  -- per-thread register cap passed to the JIT
//   cudadevrt_lib -- path to libcudadevrt.a for device-side linking
//   log_size      -- size of the JIT info/error log buffers
//   print_log     -- when set, echo progress, walltime and the info log
// Aborts (assert) on link failure, printing the JIT error log.
static CUmodule loadModule(
    const char * module,
    const int maxrregcount = 64,
    const char cudadevrt_lib[] = "libcudadevrt.a",
    const size_t log_size = 32768,
    const bool print_log = true
    )
{
    CUmodule cudaModule;

    // in this branch we use compilation with parameters
    CUlinkState CUState;
    CUlinkState *lState = &CUState;
    const int nOptions = 8;
    CUjit_option options[nOptions];
    void* optionVals[nOptions];
    float walltime;
    size_t logSize = log_size;
    // NOTE(review): runtime-sized stack arrays are a VLA compiler
    // extension, not standard C++ -- relies on gcc/clang behavior.
    char error_log[logSize],
         info_log[logSize];
    void *cuOut;
    size_t outSize;
    int myErr = 0;

    // Setup linker options. The driver API passes JIT option *values*
    // through the void* slots: buffer pointers as-is, integers cast to
    // void*.
    // Return walltime from JIT compilation
    options[0] = CU_JIT_WALL_TIME;
    optionVals[0] = (void*) &walltime;
    // Pass a buffer for info messages
    options[1] = CU_JIT_INFO_LOG_BUFFER;
    optionVals[1] = (void*) info_log;
    // Pass the size of the info buffer
    options[2] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
    optionVals[2] = (void*) logSize;
    // Pass a buffer for error message
    options[3] = CU_JIT_ERROR_LOG_BUFFER;
    optionVals[3] = (void*) error_log;
    // Pass the size of the error buffer
    options[4] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES;
    optionVals[4] = (void*) logSize;
    // Make the linker verbose
    options[5] = CU_JIT_LOG_VERBOSE;
    optionVals[5] = (void*) 1;
    // Max # of registers/pthread
    options[6] = CU_JIT_MAX_REGISTERS;
    int jitRegCount = maxrregcount;
    optionVals[6] = (void *)(size_t)jitRegCount;
    // Caching
    options[7] = CU_JIT_CACHE_MODE;
    optionVals[7] = (void *)CU_JIT_CACHE_OPTION_CA;

    // Create a pending linker invocation
    checkCudaErrors(cuLinkCreate(nOptions,options, optionVals, lState));

#if 0
    if (sizeof(void *)==4)
    {
        // Load the PTX from the string myPtx32
        printf("Loading myPtx32[] program\n");
        // PTX May also be loaded from file, as per below.
        myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)myPtx32, strlen(myPtx32)+1, 0, 0, 0, 0);
    }
    else
#endif
    {
        // Load the PTX from the string myPtx (64-bit)
        if (print_log)
            fprintf(stderr, "Loading ptx..\n");
        // NOTE(review): the return codes of these two calls overwrite
        // each other and are never checked -- failures surface only at
        // cuLinkComplete below.
        myErr = cuLinkAddData(*lState, CU_JIT_INPUT_PTX, (void*)module, strlen(module)+1, 0, 0, 0, 0);
        myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_LIBRARY, cudadevrt_lib, 0,0,0);
        // PTX May also be loaded from file, as per below.
        // myErr = cuLinkAddFile(*lState, CU_JIT_INPUT_PTX, "myPtx64.ptx",0,0,0);
    }

    // Complete the linker step
    myErr = cuLinkComplete(*lState, &cuOut, &outSize);
    if ( myErr != CUDA_SUCCESS )
    {
        // Errors will be put in error_log, per CU_JIT_ERROR_LOG_BUFFER option above.
        fprintf(stderr,"PTX Linker Error:\n%s\n",error_log);
        assert(0);
    }

    // Linker walltime and info_log were requested in options above.
    if (print_log)
        fprintf(stderr, "CUDA Link Completed in %fms. Linker Output:\n%s\n",walltime,info_log);

    // Load resulting cuBin into module
    checkCudaErrors(cuModuleLoadData(&cudaModule, cuOut));

    // Destroy the linker invocation
    checkCudaErrors(cuLinkDestroy(*lState));
    return cudaModule;
}
// Unloads a CUDA module previously loaded by loadModule(), releasing its
// device-side resources.
static void unloadModule(CUmodule &cudaModule)
{
    checkCudaErrors(cuModuleUnload(cudaModule));
}
// Looks up a kernel entry point named `function` in a loaded CUDA module.
// Aborts (via checkCudaErrors) if the symbol is not present.
static CUfunction getFunction(CUmodule &cudaModule, const char * function)
{
    CUfunction fn;
    checkCudaErrors(cuModuleGetFunction(&fn, cudaModule, function));
    return fn;
}
// Allocates `size` bytes of device memory and returns its device address.
// Allocation failure aborts via checkCudaErrors.
static CUdeviceptr deviceMalloc(const size_t size)
{
    CUdeviceptr ptr;
    checkCudaErrors(cuMemAlloc(&ptr, size));
    return ptr;
}
// Releases device memory previously obtained from deviceMalloc().
static void deviceFree(CUdeviceptr d_buf)
{
    checkCudaErrors(cuMemFree(d_buf));
}
// Copies `size` bytes from device buffer d_buf to host buffer h_buf
// (synchronous device-to-host transfer).
static void memcpyD2H(void * h_buf, CUdeviceptr d_buf, const size_t size)
{
    checkCudaErrors(cuMemcpyDtoH(h_buf, d_buf, size));
}
// Copies `size` bytes from host buffer h_buf to device buffer d_buf
// (synchronous host-to-device transfer).
static void memcpyH2D(CUdeviceptr d_buf, void * h_buf, const size_t size)
{
    checkCudaErrors(cuMemcpyHtoD(d_buf, h_buf, size));
}
// Launches `func` as a single block of 32 threads (one warp) with the
// given kernel-argument array, preferring L1 cache over shared memory.
// Wrapped in do { } while (0) so this multi-statement macro expands
// safely inside unbraced if/else bodies; call sites written as
// `deviceLaunch(f, a);` are unchanged.
#define deviceLaunch(func,params) \
  do { \
    checkCudaErrors(cuFuncSetCacheConfig((func), CU_FUNC_CACHE_PREFER_L1)); \
    checkCudaErrors( \
        cuLaunchKernel( \
            (func), \
            1,1,1, \
            32, 1, 1, \
            0, NULL, (params), NULL \
            )); \
  } while (0)
typedef CUdeviceptr devicePtr;
/**************/
#include <vector>
// Reads the entire contents of `filename` into a byte buffer.
// Aborts (assert) if the file cannot be opened and exits if the read
// comes up short. When print_size is set, reports the byte count on
// stderr. Returns the file's bytes (empty vector for an empty file).
// (Fixes: the stream was never fclose'd -- leaked on every call -- and a
// redundant fp == NULL check appeared *after* fp had already been used.)
static std::vector<char> readBinary(const char * filename, const bool print_size = false)
{
    std::vector<char> buffer;
    FILE *fp = fopen(filename, "rb");
    if (!fp)
    {
        fprintf(stderr, "file %s not found\n", filename);
        assert(0);
    }

    // Determine the size by seeking to the end, then rewind.
    fseek(fp, 0, SEEK_END);
    const unsigned long long size = ftell(fp);
    fseek(fp, 0, SEEK_SET);
    buffer.resize(size);

    // Read the whole file in one call; a short read is fatal. Guard
    // size > 0 so we never take &buffer[0] of an empty vector.
    if (size > 0 && fread(&buffer[0], sizeof(char), size, fp) != size)
    {
        fprintf(stderr, "Error: There was an Error reading the file %s \n", filename);
        fclose(fp);
        exit(1);
    }
    fclose(fp);

    if (print_size)
        fprintf(stderr, " read buffer of size= %d bytes \n", (int)buffer.size());
    return buffer;
}
static double CUDALaunch(
void **handlePtr,
const char * func_name,
void **func_args,
const bool print_log = true,
const int maxrregcount = 64,
const char kernel_file[] = "__kernels.ptx",
const char cudadevrt_lib[] = "libcudadevrt.a",
const int log_size = 32768)
{
fprintf(stderr, " launching kernel: %s \n", func_name);
const std::vector<char> module_str = readBinary(kernel_file, print_log);
const char * module = &module_str[0];
CUmodule cudaModule = loadModule(module, maxrregcount, cudadevrt_lib, log_size, print_log);
CUfunction cudaFunction = getFunction(cudaModule, func_name);
deviceLaunch(cudaFunction, func_args);
checkCudaErrors(cuStreamSynchronize(0));
unloadModule(cudaModule);
return 0.0;
}
/******************************/
// C-linkage interface shared with the ISPC test kernels. width() reports
// the gang size this driver assumes (one CUDA warp). The f_* declarations
// mirror the kernel signatures; in this variant main() invokes them by
// name through the driver API (CUDALaunch) rather than calling these
// symbols directly.
extern "C" {
    // extern int width();
    int width() { return 32; }
    extern void f_v(float *result);
    extern void f_f(float *result, float *a);
    extern void f_fu(float *result, float *a, float b);
    extern void f_fi(float *result, float *a, int *b);
    extern void f_du(float *result, double *a, double b);
    extern void f_duf(float *result, double *a, float b);
    extern void f_di(float *result, double *a, int *b);
    extern void result(float *val);
}
#if defined(_WIN32) || defined(_WIN64)
#define ALIGN
#else
#define ALIGN __attribute__((aligned(64)))
#endif
// Test driver: uploads the input arrays, launches the kernel selected by
// TEST_SIG, launches the reference "result" kernel, downloads both
// outputs and compares them lane by lane. Returns 0 on success (or, with
// EXPECT_FAILURE, returns 1 when a mismatch is found, as expected).
// Fixes versus the original:
//   TEST_SIG==3 passed the *host* array &vint instead of the device
//     buffer &d_vint; TEST_SIG==6 referenced the undeclared `v_int2`
//     (compile error) instead of &d_vint2; TEST_SIG==4 passed an int
//     where the kernel parameter is a double.
int main(int argc, char *argv[]) {
    int w = width();
    assert(w <= 64);

    float returned_result[64] ALIGN;
    float vfloat[64] ALIGN;
    double vdouble[64] ALIGN;
    int vint[64] ALIGN;
    int vint2[64] ALIGN;

    const int device = 0;
#if 0
    const bool verbose = true;
#else
    const bool verbose = false;
#endif
    /*******************/
    createContext(device, verbose);
    /*******************/

    // Device-side mirrors of the host arrays above.
    devicePtr d_returned_result = deviceMalloc(64*sizeof(float));
    devicePtr d_vfloat = deviceMalloc(64*sizeof(float));
    devicePtr d_vdouble = deviceMalloc(64*sizeof(double));
    devicePtr d_vint = deviceMalloc(64*sizeof(int));
    devicePtr d_vint2 = deviceMalloc(64*sizeof(int));

    // Seed inputs; -1e20 marks result lanes the kernel never wrote.
    for (int i = 0; i < 64; ++i) {
        returned_result[i] = -1e20;
        vfloat[i] = i+1;
        vdouble[i] = i+1;
        vint[i] = 2*(i+1);
        vint2[i] = i+5;
    }

    memcpyH2D(d_returned_result, returned_result, 64*sizeof(float));
    memcpyH2D(d_vfloat , vfloat, 64*sizeof(float));
    memcpyH2D(d_vdouble , vdouble, 64*sizeof(double));
    memcpyH2D(d_vint , vint, 64*sizeof(int));
    memcpyH2D(d_vint2 , vint2, 64*sizeof(int));

    float b = 5.;
    const bool print_log = false;
    const int nreg = 64;
#if (TEST_SIG == 0)
    void *args[] = {&d_returned_result};
    CUDALaunch(NULL, "f_v", args, print_log, nreg);
#elif (TEST_SIG == 1)
    void *args[] = {&d_returned_result, &d_vfloat};
    CUDALaunch(NULL, "f_f", args, print_log, nreg);
#elif (TEST_SIG == 2)
    void *args[] = {&d_returned_result, &d_vfloat, &b};
    CUDALaunch(NULL, "f_fu", args, print_log, nreg);
#elif (TEST_SIG == 3)
    // Pass the device copy of vint (the kernel cannot dereference a
    // host pointer).
    void *args[] = {&d_returned_result, &d_vfloat, &d_vint};
    CUDALaunch(NULL, "f_fi", args, print_log, nreg);
#elif (TEST_SIG == 4)
    // f_du takes a uniform double; marshal a double, not an int.
    double num = 5.0;
    void *args[] = {&d_returned_result, &d_vdouble, &num};
    CUDALaunch(NULL, "f_du", args, print_log, nreg);
#elif (TEST_SIG == 5)
    float num = 5.0f;
    void *args[] = {&d_returned_result, &d_vdouble, &num};
    CUDALaunch(NULL, "f_duf", args, print_log, nreg);
#elif (TEST_SIG == 6)
    // Was the undeclared `v_int2`; use the device buffer d_vint2.
    void *args[] = {&d_returned_result, &d_vdouble, &d_vint2};
    CUDALaunch(NULL, "f_di", args, print_log, nreg);
#else
#error "Unknown or unset TEST_SIG value"
#endif

    // Run the reference kernel to obtain the expected lane values.
    float expected_result[64];
    memset(expected_result, 0, 64*sizeof(float));
    devicePtr d_expected_result = deviceMalloc(64*sizeof(float));
    memcpyH2D(d_expected_result, expected_result, 64*sizeof(float));
    void *res_args[] = {&d_expected_result};
    CUDALaunch(NULL, "result", res_args, print_log, nreg);

    memcpyD2H(expected_result, d_expected_result, 64*sizeof(float));
    memcpyD2H(returned_result, d_returned_result, 64*sizeof(float));

    deviceFree(d_returned_result);
    deviceFree(d_vfloat);
    deviceFree(d_vdouble);
    deviceFree(d_vint);
    deviceFree(d_vint2);
    deviceFree(d_expected_result);

    /*******************/
    destroyContext();
    /*******************/

    int errors = 0;
    for (int i = 0; i < w; ++i) {
        if (returned_result[i] != expected_result[i]) {
#ifdef EXPECT_FAILURE
            // bingo, failed
            return 1;
#else
            printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n",
                   argv[0], i, returned_result[i], returned_result[i],
                   expected_result[i], expected_result[i]);
            ++errors;
#endif // EXPECT_FAILURE
        }
    }

#ifdef EXPECT_FAILURE
    // Don't expect to get here
    return 0;
#else
    return errors > 0;
#endif
}

133
test_static_nvptx.cpp Normal file
View File

@@ -0,0 +1,133 @@
/*
Copyright (c) 2010-2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if defined(_WIN32) || defined(_WIN64)
#define ISPC_IS_WINDOWS
#elif defined(__linux__)
#define ISPC_IS_LINUX
#elif defined(__APPLE__)
#define ISPC_IS_APPLE
#endif
#ifdef ISPC_IS_WINDOWS
#include <windows.h>
#endif // ISPC_IS_WINDOWS
#include <cassert>
#include <cstring>
#include <cstdio>
#include <stdint.h>
#ifdef ISPC_IS_LINUX
#include <malloc.h>
#endif
#include "ispc_malloc.h"
// Gang width this test binary assumes; main() asserts width() <= N and
// sizes its buffers from it.
#define N 32

// C-linkage interface to the ISPC test kernels; in this variant main()
// calls these symbols directly (no driver-API launch).
extern "C" {
    int width() { return N; }
    extern void f_v(float *result);
    extern void f_f(float *result, float *a);
    extern void f_fu(float *result, float *a, float b);
    extern void f_fi(float *result, float *a, int *b);
    extern void f_du(float *result, double *a, double b);
    extern void f_duf(float *result, double *a, float b);
    extern void f_di(float *result, double *a, int *b);
    extern void result(float *val);
}
// Test driver for the direct-call NVPTX path: fills the input arrays,
// invokes the kernel selected by TEST_SIG, then compares its output
// against the reference result() lane by lane. Returns 0 on success (or,
// with EXPECT_FAILURE, 1 when the expected mismatch occurs).
// Fix versus the original: all six new[] allocations were leaked on
// every return path; they are now released before returning.
int main(int argc, char *argv[]) {
    int w = width();
    assert(w <= N);

    // 4x the gang size, matching the CUDA variant of this driver.
    float *returned_result = new float[N*4];
    float *vfloat = new float[N*4];
    double *vdouble = new double[N*4];
    int *vint = new int[N*4];
    int *vint2 = new int[N*4];

    // Seed inputs; -1e20 marks result lanes the kernel never wrote.
    for (int i = 0; i < N*4; ++i) {
        returned_result[i] = -1e20;
        vfloat[i] = i+1;
        vdouble[i] = i+1;
        vint[i] = 2*(i+1);
        vint2[i] = i+5;
    }

    float b = 5.;
#if (TEST_SIG == 0)
    f_v(returned_result);
#elif (TEST_SIG == 1)
    f_f(returned_result, vfloat);
#elif (TEST_SIG == 2)
    f_fu(returned_result, vfloat, b);
#elif (TEST_SIG == 3)
    f_fi(returned_result, vfloat, vint);
#elif (TEST_SIG == 4)
    f_du(returned_result, vdouble, 5.);
#elif (TEST_SIG == 5)
    f_duf(returned_result, vdouble, 5.f);
#elif (TEST_SIG == 6)
    f_di(returned_result, vdouble, vint2);
#else
#error "Unknown or unset TEST_SIG value"
#endif

    float *expected_result = new float[N];
    memset(expected_result, 0, N*sizeof(float));
    result(expected_result);

    // Count mismatches. Under EXPECT_FAILURE the first mismatch is the
    // expected outcome, so stop early.
    int errors = 0;
    for (int i = 0; i < w; ++i) {
        if (returned_result[i] != expected_result[i])
        {
#ifdef EXPECT_FAILURE
            // bingo, failed
            ++errors;
            break;
#else
            printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n",
                   argv[0], i, returned_result[i], returned_result[i],
                   expected_result[i], expected_result[i]);
            ++errors;
#endif // EXPECT_FAILURE
        }
    }

    // Release all heap buffers on every exit path.
    delete[] returned_result;
    delete[] vfloat;
    delete[] vdouble;
    delete[] vint;
    delete[] vint2;
    delete[] expected_result;

    // With EXPECT_FAILURE a mismatch yields 1 (expected) and a clean run
    // yields 0; without it, non-zero signals the number of errors > 0.
    // Both conventions reduce to the same expression.
    return errors > 0;
}

View File

@@ -5,7 +5,13 @@ export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float a = aFOO[programIndex];
assert(programCount <= 64);
#ifdef __NVPTX__
uniform float * uniform xarr = uniform new uniform float[70*70];
uniform float (* uniform x)[70] = (uniform float (* uniform)[70])xarr;
#define _SHMALLOC
#else
uniform float x[70][70];
#endif
for (uniform int i = 0; i < 70; ++i)
for (uniform int j = 0; j < 70; ++j)
x[i][j] = 2+b-5;
@@ -16,6 +22,10 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
else
x[b-1][a-1] = 1;
RET[programIndex] = x[4][a];
#ifdef _SHMALLOC
delete xarr;
#endif
}
export void result(uniform float RET[]) {

View File

@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
float b = (programCount == 1) ? 3 : broadcast(a, 2);
float b = (programCount == 1) ? 4 : broadcast(a, 2);
RET[programIndex] = b;
}

View File

@@ -19,8 +19,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
export void result(uniform float RET[]) {
RET[0] = RET[4] = RET[8] = RET[12] = 2;
RET[1] = RET[5] = RET[9] = RET[13] = 3;
RET[2] = RET[6] = RET[10] = RET[14] = 5;
RET[3] = RET[7] = RET[11] = RET[15] = 6;
for (int i = 0; i < programCount; i += 4)
{
RET[i+0] = 2;
RET[i+1] = 3;
RET[i+2] = 5;
RET[i+3] = 6;
}
}

View File

@@ -18,6 +18,9 @@ export void f_fu(uniform float RET[4], uniform float aFOO[4], uniform float b) {
export void result(uniform float RET[]) {
RET[programIndex] = 3;
RET[0] = RET[4] = RET[8] = RET[12] = 1;
RET[3] = RET[7] = RET[11] = RET[15] = 29;
for (int i = 0; i < programCount; i += 4)
{
RET[i+0] = 1;
RET[i+3] = 29;
}
}

View File

@@ -19,6 +19,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
export void result(uniform float RET[]) {
RET[programIndex] = 32;
RET[2] = RET[6] = RET[10] = RET[14] = 38;
RET[3] = RET[7] = RET[11] = RET[15] = 39;
for (int i = 0; i < programCount; i += 4)
{
RET[i+2] = 38;
RET[i+3] = 39;
}
}

View File

@@ -4,14 +4,14 @@ export uniform int width() { return programCount; }
struct Foo {
uniform float x[17];
uniform float x[programCount+1];
};
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float a = aFOO[programIndex];
uniform Foo foo;
uniform int i;
cfor (i = 0; i < 17; ++i)
cfor (i = 0; i < programCount+1; ++i)
foo.x[i] = i;
if ((int)a & 1)

View File

@@ -4,9 +4,9 @@ export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float a = aFOO[programIndex];
uniform double udx[25][25];
cfor (uniform int i = 0; i < 25; ++i)
cfor (uniform int j = 0; j < 25; ++j)
uniform double udx[programCount+1][programCount+1];
cfor (uniform int i = 0; i < programCount+1; ++i)
cfor (uniform int j = 0; j < programCount+1; ++j)
udx[i][j] = 10*i+j;
int x = 1;

View File

@@ -5,9 +5,9 @@ export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float a = aFOO[programIndex];
uniform float udx[20][20];
cfor (uniform int i = 0; i < 20; ++i)
cfor (uniform int j = 0; j < 20; ++j)
uniform float udx[programCount+1][programCount+1];
cfor (uniform int i = 0; i < programCount+1; ++i)
cfor (uniform int j = 0; j < programCount+1; ++j)
udx[i][j] = 100*i+j;
int x = 1;

View File

@@ -4,19 +4,27 @@ export uniform int width() { return programCount; }
struct Foo {
uniform float udx[25][25];
uniform float udx[32][32];
};
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float a = aFOO[programIndex];
#ifndef __NVPTX__
uniform Foo f[5];
#else /* too much shared memory allocated, nvcc fails to link */
uniform Foo * uniform f = uniform new uniform Foo[5];
#define _UNMALLOC
#endif
cfor (uniform int i = 0; i < 5; ++i)
cfor (uniform int j = 0; j < 25; ++j)
cfor (uniform int k = 0; k < 25; ++k)
cfor (uniform int j = 0; j < 32; ++j)
cfor (uniform int k = 0; k < 32; ++k)
f[i].udx[j][k] = 1000*i+100*j+k;
int x = 1;
RET[programIndex] = f[x+1].udx[b-4][programIndex];
#ifdef _UNMALLOC
delete f;
#endif
}
export void result(uniform float RET[]) { RET[programIndex] = 2100 +programIndex; }

View File

@@ -13,9 +13,9 @@ float func(Foo foo[], int offset) {
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float a = aFOO[programIndex];
Foo foo[17];
Foo foo[programCount+1];
uniform int i;
cfor (i = 0; i < 17; ++i)
cfor (i = 0; i < programCount+1; ++i)
foo[i].f = i*a;
RET[programIndex] = func(foo, (int)a);
}

View File

@@ -13,9 +13,9 @@ float func(Foo foo[], int offset) {
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float a = aFOO[programIndex];
Foo foo[17];
Foo foo[programCount+1];
uniform int i;
cfor (i = 0; i < 17; ++i)
cfor (i = 0; i < programCount+1; ++i)
foo[i].f = i*a;
RET[programIndex] = func(foo, (int)a);
}

View File

@@ -9,9 +9,9 @@ struct Foo {
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float a = aFOO[programIndex];
Foo foo[17];
Foo foo[programCount+1];
uniform int i;
cfor (i = 0; i < 17; ++i)
cfor (i = 0; i < programCount+1; ++i)
foo[i].f = i*a;
RET[programIndex] = foo[(int)a].f;
}

View File

@@ -10,9 +10,9 @@ struct Foo {
export void f_fi(uniform float RET[], uniform float aFOO[], uniform int bFOO[]) {
float a = aFOO[programIndex];
int b = bFOO[programIndex];
varying Foo myFoo[17];
varying Foo myFoo[programCount+1];
uniform int i;
cfor (i = 0; i < 17; ++i) {
cfor (i = 0; i < programCount+1; ++i) {
myFoo[i].x = i;
myFoo[i].f = 2*i;
}

View File

@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
export void result(uniform float RET[]) {
RET[0] = RET[4] = RET[8] = RET[12] = 1;
RET[1] = RET[5] = RET[9] = RET[13] = 3;
RET[2] = RET[6] = RET[10] = RET[14] = 3;
RET[3] = RET[7] = RET[11] = RET[15] = 29;
for (int i = 0; i < programCount; i += 4)
{
RET[i+0] = 1;
RET[i+1] = 3;
RET[i+2] = 3;
RET[i+3] = 29;
}
}

View File

@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
export void result(uniform float RET[]) {
RET[0] = RET[4] = RET[8] = RET[12] = 1;
RET[1] = RET[5] = RET[9] = RET[13] = 3;
RET[2] = RET[6] = RET[10] = RET[14] = 3;
RET[3] = RET[7] = RET[11] = RET[15] = 29;
for (int i = 0; i < programCount; i += 4)
{
RET[i+0] = 1;
RET[i+1] = 3;
RET[i+2] = 3;
RET[i+3] = 29;
}
}

View File

@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
export void result(uniform float RET[]) {
RET[0] = RET[4] = RET[8] = RET[12] = 1;
RET[1] = RET[5] = RET[9] = RET[13] = 3;
RET[2] = RET[6] = RET[10] = RET[14] = 3;
RET[3] = RET[7] = RET[11] = RET[15] = 29;
for (int i = 0; i < programCount; i += 4)
{
RET[i+0] = 1;
RET[i+1] = 3;
RET[i+2] = 3;
RET[i+3] = 29;
}
}

View File

@@ -19,8 +19,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
export void result(uniform float RET[]) {
RET[0] = RET[4] = RET[8] = RET[12] = 2;
RET[1] = RET[5] = RET[9] = RET[13] = 3;
RET[2] = RET[6] = RET[10] = RET[14] = 5;
RET[3] = RET[7] = RET[11] = RET[15] = 6;
for (int i = 0; i < programCount; i += 4)
{
RET[i+0] = 2;
RET[i+1] = 3;
RET[i+2] = 5;
RET[i+3] = 6;
}
}

View File

@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
export void result(uniform float RET[]) {
RET[0] = RET[4] = RET[8] = RET[12] = 1;
RET[1] = RET[5] = RET[9] = RET[13] = 3;
RET[2] = RET[6] = RET[10] = RET[14] = 3;
RET[3] = RET[7] = RET[11] = RET[15] = 29;
for (int i = 0; i < programCount; i += 4)
{
RET[i+0] = 1;
RET[i+1] = 3;
RET[i+2] = 3;
RET[i+3] = 29;
}
}

View File

@@ -18,8 +18,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
export void result(uniform float RET[]) {
RET[0] = RET[4] = RET[8] = RET[12] = 32;
RET[1] = RET[5] = RET[9] = RET[13] = 32;
RET[2] = RET[6] = RET[10] = RET[14] = 38;
RET[3] = RET[7] = RET[11] = RET[15] = 39;
for (int i = 0; i < programCount; i += 4)
{
RET[i+0] = 32;
RET[i+1] = 32;
RET[i+2] = 38;
RET[i+3] = 39;
}
}

View File

@@ -8,9 +8,9 @@ struct Foo {
};
export void f_fi(uniform float RET[], uniform float a[], uniform int bFOO[]) {
int b = bFOO[programIndex];
uniform struct Foo myFoo[17];
uniform struct Foo myFoo[programCount+1];
uniform int i;
cfor (i = 0; i < 17; ++i) {
cfor (i = 0; i < programCount+1; ++i) {
myFoo[i].x = i;
myFoo[i].f = 2*i;
}

View File

@@ -6,7 +6,7 @@ export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
uniform int x = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2);
static uniform int y = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2);
const static uniform int y = (1 << 4) - ~0xf0f0f0f0 + (2 * 8 / 2);
RET[programIndex] = (x == y) ? 1. : 0.;
}

View File

@@ -6,7 +6,7 @@ export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
uniform int x = (170 >> 4) % 5;
static uniform int y = (170 >> 4) % 5;
const static uniform int y = (170 >> 4) % 5;
RET[programIndex] = (x == y) ? 1. : 0.;
}

View File

@@ -6,7 +6,7 @@ export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
uniform int x = (17 < 2) || (6 >= 5) && (20 >= 20);
static uniform int y = (17 < 2) || (6 >= 5) && (20 >= 20);
const static uniform int y = (17 < 2) || (6 >= 5) && (20 >= 20);
RET[programIndex] = ((x!=0) == (y!=0)) ? 1. : 0.;
}

View File

@@ -2,22 +2,23 @@
export uniform int width() { return programCount; }
#define N0 10
#define N0 12
#define N1 20
#define N2 50
static uniform float array[N2][N1][N0];
task void x(const float f) {
task void x(const uniform float farray[]) {
const float f = farray[programIndex];
uniform int j;
assert(taskCount == (int32)N0*N1*N2);
assert(taskCount0 == (int32)N0);
assert(taskCount1 == (int32)N1);
assert(taskCount2 == (int32)N2);
assert(taskIndex == (int32)taskIndex0 + (int32)N0*(taskIndex1 +(int32) N1*taskIndex2));
assert(taskIndex0 < (int32)N0);
assert(taskIndex1 < (int32)N1);
assert(taskIndex2 < (int32)N2);
assert(taskCount == (uniform int32)N0*N1*N2);
assert(taskCount0 == (uniform int32)N0);
assert(taskCount1 == (uniform int32)N1);
assert(taskCount2 == (uniform int32)N2);
assert(taskIndex == (uniform int32)taskIndex0 + (uniform int32)N0*(taskIndex1 +(uniform int32) N1*taskIndex2));
assert(taskIndex0 < (uniform int32)N0);
assert(taskIndex1 < (uniform int32)N1);
assert(taskIndex2 < (uniform int32)N2);
const uniform int i0 = taskIndex0;
const uniform int i1 = taskIndex1;
@@ -30,7 +31,7 @@ task void x(const float f) {
array[i2][i1][i0] = i;
}
export void f_f(uniform float RET[], uniform float fFOO[]) {
float f = fFOO[programIndex];
uniform float * uniform f = fFOO;
launch[N2][N1][N0] x(f);
sync;
RET[programIndex] = array[N2-1][N1-1][N0-1];
@@ -38,5 +39,5 @@ export void f_f(uniform float RET[], uniform float fFOO[]) {
export void result(uniform float RET[]) {
RET[programIndex] = 9999.000000;
RET[programIndex] = 11999.000000;
}

View File

@@ -2,12 +2,13 @@
export uniform int width() { return programCount; }
#define N0 10
#define N0 12
#define N1 20
#define N2 50
static uniform float array[N2][N1][N0];
task void x(const float f) {
task void x(const uniform float farray[]) {
const float f = farray[programIndex];
uniform int j;
assert(taskCount == (int32)N0*N1*N2);
@@ -30,13 +31,13 @@ task void x(const float f) {
array[i2][i1][i0] = i;
}
export void f_f(uniform float RET[], uniform float fFOO[]) {
float f = fFOO[programIndex];
launch[N0,N1,N2] x(f);
uniform float * uniform f = fFOO;
launch[N2][N1][N0] x(f);
sync;
RET[programIndex] = array[N2-1][N1-1][N0-1];
}
export void result(uniform float RET[]) {
RET[programIndex] = 9999.000000;
RET[programIndex] = 11999.000000;
}

View File

@@ -1,4 +1,9 @@
#ifdef __NVPTX__
uniform int _off[programCount];
#define off _off[programIndex]
#else /* global varying data types are not yet supported with "nvptx" target */
int off;
#endif
export uniform int width() { return programCount; }
@@ -22,11 +27,11 @@ struct S operator/(struct S rr, struct S rv) {
return c;
}
struct S a;
struct S b;
struct S d;
export void f_f(uniform float RET[], uniform float aFOO[]) {
struct S a;
struct S b;
struct S d;
int T = programIndex;
a.a = aFOO[programIndex];
b.a = -aFOO[programIndex];

View File

@@ -15,6 +15,16 @@ static void p(uniform float *uniform ptr) {
}
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
#ifdef __NVPTX__ /* soa is converted to shared memory story for now, use smaller amount to check the test */
soa<4> Point pts[10];
for (uniform int i = 0; i < 40; ++i) {
pts[i].x = b*i;
pts[i].y[0] = 2*b*i;
pts[i].y[1] = 2*b*i+1;
pts[i].y[2] = 2*b*i+2;
pts[i].z = 3*b*i;
}
#else
soa<4> Point pts[30];
for (uniform int i = 0; i < 120; ++i) {
pts[i].x = b*i;
@@ -23,6 +33,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
pts[i].y[2] = 2*b*i+2;
pts[i].z = 3*b*i;
}
#endif
float a = aFOO[programIndex];
a *= -1;

View File

@@ -16,6 +16,16 @@ static void p(uniform float *uniform ptr) {
}
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
#ifdef __NVPTX__ /* soa is converted to shared memory story for now, use smaller amount to check the test */
soa<4> Point pts[15];
for (uniform int i = 0; i < 60; ++i) {
pts[i].x = b*i;
pts[i].y[0] = 2*b*i;
pts[i].y[1] = 2*b*i+1;
pts[i].y[2] = 2*b*i+2;
pts[i].z = 3*b*i;
}
#else
soa<4> Point pts[40];
for (uniform int i = 0; i < 160; ++i) {
pts[i].x = b*i;
@@ -24,6 +34,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
pts[i].y[2] = 2*b*i+2;
pts[i].z = 3*b*i;
}
#endif
float a = aFOO[programIndex];
a *= -1;

View File

@@ -25,7 +25,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
}
}
}
assert(programIndex < 80);
RET[programIndex] = pts[programIndex].pts[programIndex % 3][programIndex % 4].z;
}

View File

@@ -6,6 +6,17 @@ export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float a = aFOO[programIndex];
#ifdef __NVPTX__ /* soa is converted to shared memory story for now, use smaller amount to check the test */
soa<8> Point pts[4];
//CO uniform Point pts[80];
foreach (i = 0 ... 40) {
pts[i].x = b*i;
pts[i].y[0] = 2*b*i;
pts[i].y[1] = 2*b*i+1;
pts[i].y[2] = 2*b*i+2;
pts[i].z = 3*b*i;
}
#else
soa<8> Point pts[10];
//CO uniform Point pts[80];
foreach (i = 0 ... 80) {
@@ -15,6 +26,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
pts[i].y[2] = 2*b*i+2;
pts[i].z = 3*b*i;
}
#endif
assert(programCount < 80);
RET[programIndex] = pts[programIndex].y[2];

View File

@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
export void result(uniform float RET[]) {
RET[0] = RET[4] = RET[8] = RET[12] = 1;
RET[1] = RET[5] = RET[9] = RET[13] = 3;
RET[2] = RET[6] = RET[10] = RET[14] = 3;
RET[3] = RET[7] = RET[11] = RET[15] = 29;
for (int i = 0; i < programCount; i += 4)
{
RET[i+0] = 1;
RET[i+1] = 3;
RET[i+2] = 3;
RET[i+3] = 29;
}
}

View File

@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
export void result(uniform float RET[]) {
RET[0] = RET[4] = RET[8] = RET[12] = 1;
RET[1] = RET[5] = RET[9] = RET[13] = 3;
RET[2] = RET[6] = RET[10] = RET[14] = 3;
RET[3] = RET[7] = RET[11] = RET[15] = 29;
for (int i = 0; i < programCount; i += 4)
{
RET[i+0] = 1;
RET[i+1] = 3;
RET[i+2] = 3;
RET[i+3] = 29;
}
}

View File

@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
export void result(uniform float RET[]) {
RET[0] = RET[4] = RET[8] = RET[12] = 1;
RET[1] = RET[5] = RET[9] = RET[13] = 3;
RET[2] = RET[6] = RET[10] = RET[14] = 3;
RET[3] = RET[7] = RET[11] = RET[15] = 29;
for (int i = 0; i < programCount; i += 4)
{
RET[i+0] = 1;
RET[i+1] = 3;
RET[i+2] = 3;
RET[i+3] = 29;
}
}

View File

@@ -8,8 +8,11 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
}
export void result(uniform float RET[]) {
RET[0] = RET[4] = RET[8] = RET[12] = 0x0.0p+0;
RET[1] = RET[5] = RET[9] = RET[13] = 0x1.62e43p-1;
RET[2] = RET[6] = RET[10] = RET[14] = 0x1.193ea8p+0;
RET[3] = RET[7] = RET[11] = RET[15] = 0x1.62e43p+0;
for (int i = 0; i < programCount; i += 4)
{
RET[i+0] = 0x0.0p+0;
RET[i+1] = 0x1.62e43p-1;
RET[i+2] = 0x1.193ea8p+0;
RET[i+3] = 0x1.62e43p+0;
}
}

View File

@@ -5,7 +5,7 @@ export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
// calculation error 1e-6 is the same as in icc
RET[programIndex] = (exp(-log(1/a)) - a) < 1e-6 ? 1 : 0;
RET[programIndex] = (exp(-log(1/a)) - a)/a < 1e-6 ? 1 : 0;
}
export void result(uniform float RET[4]) {

View File

@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
RET[programIndex] = round(a+.499999);
RET[programIndex] = round(a+.49999);
}
export void result(uniform float RET[]) {

View File

@@ -4,7 +4,7 @@ export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
RET[programIndex] = floor(a+.999999);
RET[programIndex] = floor(a+.99999);
}
export void result(uniform float RET[]) {

34
tests/uniform-1.ispc Normal file
View File

@@ -0,0 +1,34 @@
export uniform int width() { return programCount; }
task void f_f_task(uniform float RET[], uniform float aFOO[]) {
uniform float val[programCount];
for (uniform int i = 0; i < programCount; ++i)
val[i] = 0;
foreach (i = 0 ... programCount)
val[i] += aFOO[programCount*taskIndex + i] - 1;
uniform float sum = 0;
for (uniform int i = 0; i < programCount; ++i)
sum += val[i];
if (programIndex < 32/4)
RET[programCount/4*taskIndex + programIndex] = sum;
}
export void f_f(uniform float RET[], uniform float aFOO[])
{
launch[4] f_f_task(RET, aFOO);
}
task void result_task(uniform float RET[])
{
const uniform float ret = reduce_add(programIndex + programCount*taskIndex);
if (programIndex < 32/4)
RET[programCount/4*taskIndex + programIndex] = ret;
}
export void result(uniform float RET[]) {
launch[4] result_task(RET);
}

View File

@@ -749,7 +749,7 @@ EnumType::Mangle() const {
std::string ret;
if (isConst) ret += "C";
ret += variability.MangleString();
ret += std::string("enum[") + name + std::string("]");
ret += std::string("enum_5B_") + name + std::string("_5C_");
return ret;
}
@@ -1420,7 +1420,7 @@ ArrayType::Mangle() const {
sprintf(buf, "%d", numElements);
else
buf[0] = '\0';
return s + "[" + buf + "]";
return s + "_5B_" + buf + "_5C_";
}
@@ -2058,12 +2058,12 @@ lMangleStruct(Variability variability, bool isConst, const std::string &name) {
Assert(variability != Variability::Unbound);
std::string ret;
ret += "s[";
ret += "s_5B_";
if (isConst)
ret += "_c_";
ret += variability.MangleString();
ret += name + std::string("]");
ret += name + std::string("_5C_");
return ret;
}
@@ -3009,7 +3009,7 @@ FunctionType::LLVMFunctionType(llvm::LLVMContext *ctx, bool removeMask) const {
llvmArgTypes.push_back(LLVMTypes::MaskType);
std::vector<llvm::Type *> callTypes;
if (isTask) {
if (isTask && g->target->getISA() != Target::NVPTX) {
// Tasks take three arguments: a pointer to a struct that holds the
// actual task arguments, the thread index, and the total number of
// threads the tasks system has running. (Task arguments are