+fixed some examples, found some bugs, and also found bugs in ptxas/cuda

This commit is contained in:
Evghenii
2014-01-21 14:51:27 +01:00
parent 5a773ed62a
commit bc99897fbb
22 changed files with 98 additions and 58 deletions

View File

@@ -36,7 +36,7 @@
# If you have your own special version of llvm and/or clang, change
# these variables to match.
LLVM_CONFIG=$(shell which $(HOME)/usr/local/llvm/bin-trunk/bin/llvm-config)
LLVM_CONFIG=$(shell which $(HOME)/usr/local/llvm/bin-3.4/bin/llvm-config)
CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir)
# Enable ARM by request

View File

@@ -653,9 +653,19 @@ define i1 @__none(<1 x i1>) nounwind readnone alwaysinline {
declare i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone
declare i32 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone
define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline {
%r = extractelement <1 x float> %v, i32 0
ret float %r
define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline {
%value = extractelement <1 x float> %v, i32 0
%call = tail call float @__shfl_xor_float_nvptx(float %value, i32 16)
%call1 = fadd float %call, %value
%call.1 = tail call float @__shfl_xor_float_nvptx(float %call1, i32 8)
%call1.1 = fadd float %call1, %call.1
%call.2 = tail call float @__shfl_xor_float_nvptx(float %call1.1, i32 4)
%call1.2 = fadd float %call1.1, %call.2
%call.3 = tail call float @__shfl_xor_float_nvptx(float %call1.2, i32 2)
%call1.3 = fadd float %call1.2, %call.3
%call.4 = tail call float @__shfl_xor_float_nvptx(float %call1.3, i32 1)
%call1.4 = fadd float %call1.3, %call.4
ret float %call1.4
}
define float @__reduce_min_float(<1 x float>) nounwind readnone {

View File

@@ -1410,7 +1410,7 @@ FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) {
llvm::Value *
FunctionEmitContext::ProgramIndexVector(bool is32bits) {
if (g->target->getISA() != Target::NVPTX)
if (1 || g->target->getISA() != Target::NVPTX)
{
llvm::SmallVector<llvm::Constant*, 16> array;
for (int i = 0; i < g->target->getVectorWidth() ; ++i) {

View File

@@ -10,7 +10,7 @@ DEPTX=dePTX
NVCC=nvcc
$DEPTX < $PTXSRC > $PTXCU &&
$NVCC -arch=sm_35 -dc $NVCCPARM -dryrun $PTXCU 2>&1 | \
$NVCC -arch=sm_35 -dc $NVCCPARM -dryrun $PTXCU 2>&1 | \
sed 's/\#\$//g'| \
awk '{ if ($1 == "LIBRARIES=") print $1$2; else if ($1 == "cicc") print "cp '$PTXSRC'", $NF; else print $0 }' > $PTXSH &&
sh $PTXSH

View File

@@ -512,6 +512,7 @@ Function::GenerateIR() {
if (g->target->getISA() == Target::NVPTX)
{
functionName += std::string("___export"); /* add ___export to the end, for ptxcc to recognize it is exported */
#if 0
llvm::NamedMDNode* annotations =
m->module->getOrInsertNamedMetadata("nvvm.annotations");
llvm::SmallVector<llvm::Value*, 3> av;
@@ -519,6 +520,7 @@ Function::GenerateIR() {
av.push_back(llvm::MDString::get(*g->ctx, "kernel"));
av.push_back(llvm::ConstantInt::get(llvm::IntegerType::get(*g->ctx,32), 1));
annotations->addOperand(llvm::MDNode::get(*g->ctx, av));
#endif
}
llvm::Function *appFunction =
llvm::Function::Create(ftype, linkage, functionName.c_str(), m->module);

View File

@@ -427,15 +427,6 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE
return;
}
#if 0
if (g->target->getISA() == Target::NVPTX &&
type->IsVaryingType())
{
Error(pos, "Global \"varying\" variables are not yet supported in \"nvptx\" target.");
return;
}
#endif
if (Type::Equal(type, AtomicType::Void)) {
Error(pos, "\"void\" type global variable is illegal.");
return;
@@ -453,6 +444,17 @@ Module::AddGlobalVariable(const std::string &name, const Type *type, Expr *initE
return;
}
#if 1
if (g->target->getISA() == Target::NVPTX &&
at != NULL &&
type->IsVaryingType())
{
Error(pos, "Global \"varying\" variables are not yet supported in \"nvptx\" target.");
return;
}
#endif
llvm::Type *llvmType = type->LLVMType(g->ctx);
if (llvmType == NULL)
return;
@@ -2130,6 +2132,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre
opts.addMacroDef("cif=if");
opts.addMacroDef("cfor=for");
opts.addMacroDef("cwhile=while");
opts.addMacroDef("ccontinue=continue");
opts.addMacroDef("cdo=do");
opts.addMacroDef("taskIndex=blockIndex0()");
opts.addMacroDef("taskCount=blockCount0()");

View File

@@ -497,7 +497,11 @@ Optimize(llvm::Module *module, int optLevel) {
// run absolutely no optimizations, since the front-end needs us to
// take the various __pseudo_* functions it has emitted and turn
// them into something that can actually execute.
optPM.add(CreateImproveMemoryOpsPass(), 100);
if (g->opt.disableGatherScatterOptimizations == false &&
g->target->getVectorWidth() > 1)
optPM.add(CreateImproveMemoryOpsPass(), 100);
if (g->opt.disableHandlePseudoMemoryOps == false)
optPM.add(CreateReplacePseudoMemoryOpsPass());

View File

@@ -257,7 +257,7 @@ def run_test(testname):
cc_cmd = "%s %s -DTEST_SIG=%d -o %s" % \
(nvptxcc_exe_rel, obj_name, match, exe_name)
ispc_cmd = ispc_exe_rel + " --woff %s -o %s --arch=%s --target=%s" % \
ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --arch=%s --target=%s" % \
(filename, obj_name, options.arch, options.target)
if options.no_opt:
ispc_cmd += " -O0"
@@ -271,7 +271,7 @@ def run_test(testname):
print "Grepping: %s" % grep_cmd
sp = subprocess.Popen(grep_cmd, shell=True)
sp.communicate()
ispc_cmd = ispc_exe_rel + " --woff %s -o %s --emit-asm --target=%s" % \
ispc_cmd = ispc_exe_rel + " --woff %s -o %s -O3 --emit-asm --target=%s" % \
(filename4ptx, obj_name, options.target)
# compile the ispc code, make the executable, and run it...
@@ -287,7 +287,7 @@ def run_test(testname):
basename = os.path.basename(filename)
os.unlink("%s.pdb" % basename)
os.unlink("%s.ilk" % basename)
os.unlink(obj_name)
# os.unlink(obj_name)
except:
None

View File

@@ -3,7 +3,7 @@ export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = aFOO[programIndex];
float b = (programCount == 1) ? 3 : broadcast(a, 2);
float b = (programCount == 1) ? 4 : broadcast(a, 2);
RET[programIndex] = b;
}

View File

@@ -19,8 +19,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
export void result(uniform float RET[]) {
RET[0] = RET[4] = RET[8] = RET[12] = 2;
RET[1] = RET[5] = RET[9] = RET[13] = 3;
RET[2] = RET[6] = RET[10] = RET[14] = 5;
RET[3] = RET[7] = RET[11] = RET[15] = 6;
for (int i = 0; i < programCount; i += 4)
{
RET[i+0] = 2;
RET[i+1] = 3;
RET[i+2] = 5;
RET[i+3] = 6;
}
}

View File

@@ -18,6 +18,9 @@ export void f_fu(uniform float RET[4], uniform float aFOO[4], uniform float b) {
export void result(uniform float RET[]) {
RET[programIndex] = 3;
RET[0] = RET[4] = RET[8] = RET[12] = 1;
RET[3] = RET[7] = RET[11] = RET[15] = 29;
for (int i = 0; i < programCount; i += 4)
{
RET[i+0] = 1;
RET[i+3] = 29;
}
}

View File

@@ -19,6 +19,9 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
export void result(uniform float RET[]) {
RET[programIndex] = 32;
RET[2] = RET[6] = RET[10] = RET[14] = 38;
RET[3] = RET[7] = RET[11] = RET[15] = 39;
for (int i = 0; i < programCount; i += 4)
{
RET[i+2] = 38;
RET[i+3] = 39;
}
}

View File

@@ -4,14 +4,14 @@ export uniform int width() { return programCount; }
struct Foo {
uniform float x[17];
uniform float x[programCount+1];
};
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float a = aFOO[programIndex];
uniform Foo foo;
uniform int i;
cfor (i = 0; i < 17; ++i)
cfor (i = 0; i < programCount+1; ++i)
foo.x[i] = i;
if ((int)a & 1)

View File

@@ -4,9 +4,9 @@ export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float a = aFOO[programIndex];
uniform double udx[25][25];
cfor (uniform int i = 0; i < 25; ++i)
cfor (uniform int j = 0; j < 25; ++j)
uniform double udx[programCount+1][programCount+1];
cfor (uniform int i = 0; i < programCount+1; ++i)
cfor (uniform int j = 0; j < programCount+1; ++j)
udx[i][j] = 10*i+j;
int x = 1;

View File

@@ -5,9 +5,9 @@ export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float a = aFOO[programIndex];
uniform float udx[20][20];
cfor (uniform int i = 0; i < 20; ++i)
cfor (uniform int j = 0; j < 20; ++j)
uniform float udx[programCount+1][programCount+1];
cfor (uniform int i = 0; i < programCount+1; ++i)
cfor (uniform int j = 0; j < programCount+1; ++j)
udx[i][j] = 100*i+j;
int x = 1;

View File

@@ -13,9 +13,9 @@ float func(Foo foo[], int offset) {
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float a = aFOO[programIndex];
Foo foo[17];
Foo foo[programCount+1];
uniform int i;
cfor (i = 0; i < 17; ++i)
cfor (i = 0; i < programCount+1; ++i)
foo[i].f = i*a;
RET[programIndex] = func(foo, (int)a);
}

View File

@@ -13,9 +13,9 @@ float func(Foo foo[], int offset) {
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float a = aFOO[programIndex];
Foo foo[17];
Foo foo[programCount+1];
uniform int i;
cfor (i = 0; i < 17; ++i)
cfor (i = 0; i < programCount+1; ++i)
foo[i].f = i*a;
RET[programIndex] = func(foo, (int)a);
}

View File

@@ -9,9 +9,9 @@ struct Foo {
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float a = aFOO[programIndex];
Foo foo[17];
Foo foo[programCount+1];
uniform int i;
cfor (i = 0; i < 17; ++i)
cfor (i = 0; i < programCount+1; ++i)
foo[i].f = i*a;
RET[programIndex] = foo[(int)a].f;
}

View File

@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
export void result(uniform float RET[]) {
RET[0] = RET[4] = RET[8] = RET[12] = 1;
RET[1] = RET[5] = RET[9] = RET[13] = 3;
RET[2] = RET[6] = RET[10] = RET[14] = 3;
RET[3] = RET[7] = RET[11] = RET[15] = 29;
for (int i = 0; i < programCount; i += 4)
{
RET[i+0] = 1;
RET[i+1] = 3;
RET[i+2] = 3;
RET[i+3] = 29;
}
}

View File

@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
export void result(uniform float RET[]) {
RET[0] = RET[4] = RET[8] = RET[12] = 1;
RET[1] = RET[5] = RET[9] = RET[13] = 3;
RET[2] = RET[6] = RET[10] = RET[14] = 3;
RET[3] = RET[7] = RET[11] = RET[15] = 29;
for (int i = 0; i < programCount; i += 4)
{
RET[i+0] = 1;
RET[i+1] = 3;
RET[i+2] = 3;
RET[i+3] = 29;
}
}

View File

@@ -17,8 +17,11 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
export void result(uniform float RET[]) {
RET[0] = RET[4] = RET[8] = RET[12] = 1;
RET[1] = RET[5] = RET[9] = RET[13] = 3;
RET[2] = RET[6] = RET[10] = RET[14] = 3;
RET[3] = RET[7] = RET[11] = RET[15] = 29;
for (int i = 0; i < programCount; i += 4)
{
RET[i+0] = 1;
RET[i+1] = 3;
RET[i+2] = 3;
RET[i+3] = 29;
}
}

View File

@@ -8,8 +8,11 @@ export void f_f(uniform float RET[], uniform float aFOO[]) {
}
export void result(uniform float RET[]) {
RET[0] = RET[4] = RET[8] = RET[12] = 0x0.0p+0;
RET[1] = RET[5] = RET[9] = RET[13] = 0x1.62e43p-1;
RET[2] = RET[6] = RET[10] = RET[14] = 0x1.193ea8p+0;
RET[3] = RET[7] = RET[11] = RET[15] = 0x1.62e43p+0;
for (int i = 0; i < programCount; i += 4)
{
RET[i+0] = 0x0.0p+0;
RET[i+1] = 0x1.62e43p-1;
RET[i+2] = 0x1.193ea8p+0;
RET[i+3] = 0x1.62e43p+0;
}
}