diff --git a/builtins/target-nvptx.ll b/builtins/target-nvptx.ll index 43a16987..1901ba5e 100644 --- a/builtins/target-nvptx.ll +++ b/builtins/target-nvptx.ll @@ -705,6 +705,19 @@ define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline ret float %ret } +define double @__rsqrt_uniform_double(double) nounwind readonly alwaysinline +{ + %ret1 = call double @llvm.sqrt.f64(double %0) + %ret = fdiv double 1., %ret1 + ret double %ret +} +define double @__rcp_uniform_double(double) nounwind readonly alwaysinline +{ + %ret = fdiv double 1., %0 + ret double %ret +} + + define @__rcp_varying_float() nounwind readnone alwaysinline { %v = extractelement <1 x float> %0, i32 0 @@ -712,6 +725,13 @@ define @__rcp_varying_float() nounwind readnone %rv = insertelement <1 x float> undef, float %r, i32 0 ret %rv } +define @__rcp_varying_double() nounwind readnone alwaysinline +{ + %v = extractelement <1 x double> %0, i32 0 + %r = call double @__rcp_uniform_double(double %v) + %rv = insertelement <1 x double> undef, double %r, i32 0 + ret %rv +} define @__rsqrt_varying_float() nounwind readnone alwaysinline { %v = extractelement <1 x float> %0, i32 0 @@ -719,6 +739,14 @@ define @__rsqrt_varying_float() nounwind readnone %rv = insertelement <1 x float> undef, float %r, i32 0 ret %rv } +define @__rsqrt_varying_double() nounwind readnone alwaysinline +{ + %v = extractelement <1 x double> %0, i32 0 + %r = call double @__rsqrt_uniform_double(double %v) + %rv = insertelement <1 x double> undef, double %r, i32 0 + ret %rv +} + define @__sqrt_varying_float() nounwind readnone alwaysinline { %v = extractelement <1 x float> %0, i32 0 @@ -726,7 +754,6 @@ define @__sqrt_varying_float() nounwind readnone %rv = insertelement <1 x float> undef, float %r, i32 0 ret %rv } - ;; declare double @__sqrt_uniform_double(double) nounwind readnone define double @__sqrt_uniform_double(double) nounwind readonly alwaysinline { %ret = call double @llvm.sqrt.f64(double %0) diff --git a/examples/cuda_helpers.cuh b/examples/cuda_helpers.cuh new file mode 100644 index 00000000..2c6ec6dc --- /dev/null +++ b/examples/cuda_helpers.cuh @@ -0,0 +1,25 @@ +#pragma once + +#define programCount 32 +#define programIndex (threadIdx.x & 31) +#define taskIndex0 (blockIdx.x*4 + (threadIdx.x >> 5)) +#define taskCount0 (gridDim.x*4) +#define taskIndex1 (blockIdx.y) +#define taskCount1 (gridDim.y) +#define taskIndex2 (blockIdx.z) +#define taskCount2 (gridDim.z) +#define taskIndex (taskIndex0 + taskCount0*(taskIndex1 + taskCount1*taskIndex2)) +#define taskCount (taskCount0*taskCount1*taskCount2) +#define warpIdx (threadIdx.x >> 5) +#define launch(ntx,nty,ntz,func) if (programIndex==0) func<<>> +#define sync cudaDeviceSynchronize() +#define cif if +__device__ __forceinline__ static double __shfl(double x, int lane) +{ + return __hiloint2double( + __shfl_xor(__double2hiint(x), lane), + __shfl_xor(__double2loint(x), lane)); + +} +#define shuffle(x,y) __shfl(x,y) +#define broadcast(x,y) __shfl(x,y) diff --git a/examples/ispc_malloc.cpp b/examples/ispc_malloc.cpp new file mode 100644 index 00000000..dcbe5d48 --- /dev/null +++ b/examples/ispc_malloc.cpp @@ -0,0 +1,54 @@ +#include +#include +#include +#include +#include "ispc_malloc.h" + +#ifdef _CUDA_ + +void * operator new(size_t size) throw(std::bad_alloc) +{ + void *ptr; + ispc_malloc(&ptr, size); + return ptr; +} +void operator delete(void *ptr) throw() +{ + ispc_free(ptr); +} + +#else + +void ispc_malloc(void **ptr, const size_t size) +{ + *ptr = malloc(size); +} +void ispc_free(void *ptr) +{ + free(ptr); +} +void ispc_memset(void *ptr, int value, size_t size) +{ + memset(ptr, value, size); +} +void ispcSetMallocHeapLimit(size_t value) +{ +} +void ispcSetStackLimit(size_t value) +{ +} +unsigned long long ispcGetMallocHeapLimit() +{ + return -1; +} +unsigned long long ispcGetStackLimit() +{ + return -1; +} +void * ispcMemcpy(void *dest, void *src, size_t num) +{ + memcpy(dest, src, num); + return dest; +} + +#endif diff --git a/examples/ispc_malloc.h b/examples/ispc_malloc.h new file mode 100644 index 00000000..1d63f602 --- /dev/null +++ b/examples/ispc_malloc.h @@ -0,0 +1,10 @@ +#pragma once + +extern void ispc_malloc(void **ptr, const size_t size); +extern void ispc_free(void *ptr); +extern void ispc_memset(void *ptr, int value, size_t size); +extern void ispcSetMallocHeapLimit(size_t value); +extern void ispcSetStackLimit(size_t value); +extern unsigned long long ispcGetMallocHeapLimit(); +extern unsigned long long ispcGetStackLimit(); +extern void * ispcMemcpy(void *dest, void *src, size_t num); diff --git a/examples/nvcc_helpers.cu b/examples/nvcc_helpers.cu new file mode 100644 index 00000000..e6faea96 --- /dev/null +++ b/examples/nvcc_helpers.cu @@ -0,0 +1,43 @@ +#ifndef _CUDA_ +#error "Something went wrong..." +#endif + +void ispc_malloc(void **ptr, const size_t size) +{ + cudaMallocManaged(ptr, size); +} +void ispc_free(void *ptr) +{ + cudaFree(ptr); +} +void ispc_memset(void *ptr, int value, size_t size) +{ + cudaMemset(ptr, value, size); +} +void ispcSetMallocHeapLimit(size_t value) +{ + cudaDeviceSetLimit(cudaLimitMallocHeapSize,value); +} +void ispcSetStackLimit(size_t value) +{ + cudaDeviceSetLimit(cudaLimitStackSize,value); +} +unsigned long long ispcGetMallocHeapLimit() +{ + size_t value; + cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize); + return value; +} +unsigned long long ispcGetStackLimit() +{ + size_t value; + cudaDeviceGetLimit(&value, cudaLimitStackSize); + return value; +} +void * ispcMemcpy(void *dest, void *src, size_t num) +{ + cudaMemcpy(dest, src, num, cudaMemcpyDefault); + return dest; +} + + diff --git a/nvptxcc b/nvptxcc deleted file mode 100755 index 81d622e9..00000000 --- a/nvptxcc +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/sh - -PATH=$ISPC_HOME/examples_ptx/ptxcc:$ISPC_HOME/examples_ptx/ptxgen:$PATH -PTXCC=ptxcc -ARGS=${@:2} -if [ "$NVVM" == "1" ]; -then - LLVM32=$HOME/usr/local/llvm/bin-3.2 - LLVMDIS=$LLVM32/bin/llvm-dis - PTXGEN=$ISPC_HOME/examples_ptx/ptxgen/ptxgen - $($LLVMDIS $1 -o $1.ll) && $($PTXGEN $1.ll > $1.ptx) && \ - $($PTXCC $1.ptx -o $1.o -Xnvcc="-G") && \ - $(nvcc test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $1.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt $ARGS); -else - $($PTXCC $1 -o $1.o -Xnvcc="-G") && \ - $(nvcc test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $1.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt $ARGS); -fi - - - diff --git a/ptxtools/Makefile b/ptxtools/Makefile new file mode 100644 index 00000000..66a108c6 --- /dev/null +++ b/ptxtools/Makefile @@ -0,0 +1,49 @@ +all: ptxcc ptxgen + +CXX=clang++ +CXXFLAGS += -O3 +CXXFLAGS += -I/opt/local/include + +LD=clang++ +LDFLAGS += -L/opt/local/lib + +FLEX=flex +BISON=bison + +CUDATK=/usr/local/cuda +LIBDEVICE_MAJOR=1 +LIBDEVICE_MINOR=0 + +ptxgrammar.cc : ptxgrammar.yy + $(BISON) -d -v -t ptxgrammar.yy -o ptxgrammar.cc + +ptx.cc: ptx.ll ptxgrammar.cc + $(FLEX) -t ptx.ll > ptx.cc + +%.o: %.cc + $(CXX) $(CXXFLAGS) -c $< -o $@ + +%.o: %.cpp + $(CXX) $(CXXFLAGS) -c $< -o $@ + +OBJ= ptxcc.o \ + ptx.o \ + ptxgrammar.o + +ptxcc: $(OBJ) + $(LD) $(LDFLAGS) $^ -o $@ + +ptxgen: ptxgen.cpp + $(CXX) $(CXXFLAGS) -o $@ $< \ + -L$(CUDATK)/nvvm/lib64 -lnvvm \ + -I$(CUDATK)/nvvm/include \ + -I$(CUDATK)/include \ + -DLIBDEVICE_MAJOR_VERSION=$(LIBDEVICE_MAJOR) \ + -DLIBDEVICE_MINOR_VERSION=$(LIBDEVICE_MINOR) \ + -DLIBNVVM_HOME=$(CUDATK)/nvvm -Wl,-rpath,$(CUDATK)/nvvm/lib64 + +clean: + /bin/rm -f ptxgen ptxcc $(OBJ) ptxgrammar.hh ptxgrammar.cc ptx.cc ptxgrammar.output + +$(OBJ): ptxgrammar.cc ptx.cc PTXParser.h PTXLexer.h + diff --git a/ptxtools/PTXLexer.h b/ptxtools/PTXLexer.h new file mode 100644 index 00000000..1136fe4d --- /dev/null +++ b/ptxtools/PTXLexer.h @@ -0,0 +1,40 @@ +#pragma once + +#include +#include + +namespace parser +{ + class PTXLexer; + class PTXParser; +} + +#include "ptxgrammar.hh" + +namespace parser +{ + /*! \brief A wrapper around yyFlexLexer to allow for a local variable */ + class PTXLexer : public ptxFlexLexer + { + public: + YYSTYPE* yylval; + int column; + int nextColumn; + + public: + PTXLexer( std::istream* arg_yyin, + std::ostream* arg_yyout ) : + yyFlexLexer( arg_yyin, arg_yyout ), yylval( 0 ), column( 0 ), + nextColumn( 0 ) { } + + int yylex(); + int yylexPosition() + { + int token = yylex(); + column = nextColumn; + nextColumn = column + strlen( YYText() ); + return token; + } + + }; +} diff --git a/ptxtools/PTXParser.h b/ptxtools/PTXParser.h new file mode 100644 index 00000000..d221d739 --- /dev/null +++ b/ptxtools/PTXParser.h @@ -0,0 +1,254 @@ +#pragma once + +#undef yyFlexLexer +#define yyFlexLexer ptxFlexLexer +#include + + +#include "PTXLexer.h" + +#include +#include +#include +namespace ptx +{ + extern int yyparse( parser::PTXLexer&, parser::PTXParser& ); +} + +namespace parser +{ + /*! \brief An implementation of the Parser interface for PTX */ + class PTXParser + { + private: + typedef int token_t; + std::ostream &out; + std::string _identifier; + token_t _dataTypeId; + int _alignment; + + bool isArgumentList, isReturnArgumentList; + struct argument_t + { + token_t type; + std::string name; + int dim; + + argument_t(const token_t _type, const std::string &_name, const int _dim = 1) : + type(_type), name(_name), dim(_dim) {} + }; + std::vector argumentList, returnArgumentList; + std::vector arrayDimensionsList; + + public: + PTXParser(std::ostream &_out) : out(_out) + { + isArgumentList = isReturnArgumentList = false; + _alignment = 1; + } + + void printHeader() + { + std::stringstream s; +#if 0 + s << "template struct __align__(N) b8_t { unsigned char _v[N]; __device__ b8_t() {}; __device__ b8_t (const int value) {}}; \n"; + s << "template struct __align__(2*N) b16_t { unsigned short _v[N]; __device__ b16_t() {}; __device__ b16_t(const int value) {}}; \n"; +#else + s << "template struct b8_t { unsigned char _v[N]; __device__ b8_t() {}; __device__ b8_t (const int value) {}}; \n"; + s << "template struct b16_t { unsigned short _v[N]; __device__ b16_t() {}; __device__ b16_t(const int value) {}}; \n"; +#endif + s << "struct b8d_t { unsigned char _v[1]; }; \n"; + s << "struct b16d_t { unsigned short _v[1]; }; \n"; + + s << "typedef unsigned int b32_t; \n"; + s << "typedef unsigned int u32_t; \n"; + s << "typedef int s32_t; \n"; + + s << "typedef unsigned long long b64_t; \n"; + s << "typedef unsigned long long u64_t; \n"; + s << "typedef long long s64_t; \n"; + + s << "typedef float f32_t; \n"; + s << "typedef double f64_t; \n"; + s << " \n"; + out << s.str(); + } + +#define LOC YYLTYPE& location + + void identifier(const std::string &s) { _identifier = s; } + void dataTypeId(const token_t token) { _dataTypeId = token; } + void argumentListBegin(LOC) { isArgumentList = true; } + void argumentListEnd (LOC) { isArgumentList = false; } + void returnArgumentListBegin(LOC) { isReturnArgumentList = true; } + void returnArgumentListEnd (LOC) { isReturnArgumentList = false; } + void argumentDeclaration(LOC) + { + assert(arrayDimensionsList.size() <= 1); + const int dim = arrayDimensionsList.empty() ? 1 : arrayDimensionsList[0]; + const argument_t arg(_dataTypeId, _identifier, dim); + if (isArgumentList) + argumentList.push_back(arg); + else if (isReturnArgumentList) + returnArgumentList.push_back(arg); + else + assert(0); + arrayDimensionsList.clear(); + } + void alignment(const int value) { _alignment = value; } + + void arrayDimensions(const int value) + { + arrayDimensionsList.push_back(value); + } + + std::string printArgument(const argument_t arg, const bool printDataType = true) + { + std::stringstream s; + if (printDataType) + s << tokenToDataType(arg.type, arg.dim) << " "; + s << arg.name << " "; + return s.str(); + } + + std::string printArgumentList(const bool printDataType = true) + { + std::stringstream s; + if (argumentList.empty()) return s.str(); + const int n = argumentList.size(); + s << " " << printArgument(argumentList[0], printDataType); + for (int i = 1; i < n; i++) + s << ",\n " << printArgument(argumentList[i], printDataType); + return s.str(); + } + + void visibleEntryDeclaration(const std::string &calleeName, LOC) + { + std::stringstream s; + assert(returnArgumentList.empty()); + s << "extern \"C\" \n"; + s << "__global__ void " << calleeName << " (\n"; + s << printArgumentList(); + s << "\n ) { asm(\" // entry \"); }\n"; + + + /* check if this is an "export" entry */ + const int entryNameLength = calleeName.length(); + const int hostNameLength = std::max(entryNameLength-9,0); + const std::string ___export(&calleeName.c_str()[hostNameLength]); + if (___export.compare("___export") == 0) + { + std::string hostCalleeName; + hostCalleeName.append(calleeName.c_str(), hostNameLength); + s << "/*** host interface ***/\n"; + s << "extern \"C\" \n"; + s << "__host__ void " << hostCalleeName << " (\n"; + s << printArgumentList(); + s << "\n )\n"; + s << "{\n "; +// s << " cudaFuncSetCacheConfig (" << calleeName << ", "; + s << " cudaDeviceSetCacheConfig ("; +#if 1 + s << " cudaFuncCachePreferEqual "; +#elif 1 + s << " cudaFuncCachePreferL1 "; +#else + s << " cudaFuncCachePreferShared "; +#endif + s << ");\n"; + s << calleeName; + s << "<<<1,32>>>(\n"; + s << printArgumentList(false); + s << ");\n"; + s << " cudaDeviceSynchronize(); \n"; + s << "}\n"; + } + s << "\n"; + argumentList.clear(); + + out << s.str(); + } + + void visibleFunctionDeclaration(const std::string &calleeName, LOC) + { + std::stringstream s; + assert(returnArgumentList.size() < 2); + s << "extern \"C\" \n"; + s << "__device__ "; + if (returnArgumentList.empty()) + s << " void "; + else + s << " " << tokenToDataType(returnArgumentList[0].type, returnArgumentList[0].dim); + s << calleeName << " (\n"; + s << printArgumentList(); + + if (returnArgumentList.empty()) + s << "\n ) { asm(\" // function \"); }\n\n"; + else + { + s << "\n ) { asm(\" // function \"); return 0;} /* return value to disable warnings */\n\n"; +// s << "\n ) { asm(\" // function \"); } /* this will generate warrning */\n\n"; + } + + argumentList.clear(); + returnArgumentList.clear(); + + out << s.str(); + } + + void visibleInitializableDeclaration(const std::string &name, LOC) + { + assert(arrayDimensionsList.size() == 1); + std::stringstream s; + s << "extern \"C\" __device__ "; + if (_alignment > 0) + s << "__attribute__((aligned(" << _alignment << "))) "; + s << tokenToDataType(_dataTypeId, 0); + if (arrayDimensionsList[0] == 0) + s << name << ";\n\n"; + else + s << name << "[" << arrayDimensionsList[0] << "] = {0};\n\n"; + out << s.str(); + arrayDimensionsList.clear(); + } + +#undef LOC + + std::string tokenToDataType( token_t token , int dim) + { + std::stringstream s; + switch( token ) + { + case TOKEN_B8: + if (dim > 0) s << "b8_t<"< "; + else s << "b8d_t "; + break; + case TOKEN_U8: assert(0); s << "u8_t "; break; + case TOKEN_S8: assert(0); s << "s8_t "; break; + + case TOKEN_B16: + if (dim > 0) s << "b16_t<"< "; + else s << "b16d_t "; + break; + case TOKEN_U16: assert(0); s << "u16_t "; break; + case TOKEN_S16: assert(0); s << "s16_t "; break; + + case TOKEN_B32: assert(dim <= 1); s << "b32_t "; break; + case TOKEN_U32: assert(dim <= 1); s << "u32_t "; break; + case TOKEN_S32: assert(dim <= 1); s << "s32_t "; break; + + case TOKEN_B64: assert(dim <= 1); s << "b64_t "; break; + case TOKEN_U64: assert(dim <= 1); s << "u64_t "; break; + case TOKEN_S64: assert(dim <= 1); s << "s64_t "; break; + + case TOKEN_F32: assert(dim <= 1); s << "f32_t "; break; + case TOKEN_F64: assert(dim <= 1); s << "f64_t "; break; + default: std::cerr << "token= " << token<< std::endl; assert(0); + } + + return s.str(); + } + }; +} + + diff --git a/ptxtools/alloy_ptxcc.sh b/ptxtools/alloy_ptxcc.sh new file mode 100755 index 00000000..d98f8658 --- /dev/null +++ b/ptxtools/alloy_ptxcc.sh @@ -0,0 +1,19 @@ +#!/bin/sh + +PTXCC=$ISPC_HOME/ptxtools/ptxcc +PTXGEN=$ISPC_HOME/ptxtools/ptxgen +ARGS=${@:2} +if [ "$NVVM" == "1" ]; +then + LLVM32=$HOME/usr/local/llvm/bin-3.2 + LLVMDIS=$LLVM32/bin/llvm-dis + $($LLVMDIS $1 -o $1.ll) && $($PTXGEN $1.ll -o $1.ptx) && \ + $($PTXCC $1.ptx -o $1.o -Xnvcc="-G") && \ + $(nvcc test_static_nvptx.cpp examples/nvcc_helpers.cu examples/ispc_malloc.cpp $1.o -arch=sm_35 -Iexamples/ -D_CUDA_ -lcudadevrt $ARGS); +else + $($PTXCC $1 -o $1.o -Xnvcc="-G") && \ + $(nvcc test_static_nvptx.cpp examples/nvcc_helpers.cu examples/ispc_malloc.cpp $1.o -arch=sm_35 -Iexamples/ -D_CUDA_ -lcudadevrt $ARGS); +fi + + + diff --git a/ptxtools/main.cpp b/ptxtools/main.cpp new file mode 100644 index 00000000..f5de981d --- /dev/null +++ b/ptxtools/main.cpp @@ -0,0 +1,1458 @@ +/* + +Bonsai V2: A parallel GPU N-body gravitational Tree-code + +(c) 2010-2012: +Jeroen Bedorf +Evghenii Gaburov +Simon Portegies Zwart + +Leiden Observatory, Leiden University + +http://castle.strw.leidenuniv.nl +http://github.com/treecode/Bonsai + +*/ + +#ifdef WIN32 +#define WIN32_LEAN_AND_MEAN +#define NOMINMAX +#include +#include +#define M_PI 3.14159265358979323846264338328 + +#include +#include +void srand48(const long seed) +{ + srand(seed); +} +//JB This is not a proper work around but just to get things compiled... +double drand48() +{ + return double(rand())/RAND_MAX; +} + + +#endif + + +#ifdef USE_MPI + #include + #include +#endif + +#include +#include +#include +#include +#include +#include "log.h" +#include "anyoption.h" +#include "renderloop.h" +#include "plummer.h" +#include "disk_shuffle.h" +#ifdef GALACTICS +#include "galactics.h" +#endif + + +#if ENABLE_LOG + bool ENABLE_RUNTIME_LOG; + bool PREPEND_RANK; + int PREPEND_RANK_PROCID; + int PREPEND_RANK_NPROCS; +#endif + +using namespace std; + +#include "../profiling/bonsai_timing.h" + +int devID; +int renderDevID; + +extern void initTimers() +{ +#ifndef CUXTIMER_DISABLE + // Set up the profiling timing info + build_tree_init(); + compute_propertiesD_init(); + dev_approximate_gravity_init(); + parallel_init(); + sortKernels_init(); + timestep_init(); +#endif +} + +extern void displayTimers() +{ +#ifndef CUXTIMER_DISABLE + // Display all timing info on the way out + build_tree_display(); + compute_propertiesD_display(); + //dev_approximate_gravity_display(); + //parallel_display(); + //sortKernels_display(); + //timestep_display(); +#endif +} + +#include "octree.h" + +#ifdef USE_OPENGL +#include "renderloop.h" +#include +#endif + +void read_dumbp_file_parallel(vector &bodyPositions, vector &bodyVelocities, vector &bodiesIDs, float eps2, + string fileName, int rank, int procs, int &NTotal2, int &NFirst, int &NSecond, int &NThird, octree *tree, int reduce_bodies_factor) +{ + //Process 0 does the file reading and sends the data + //to the other processes + + //Now we have different types of files, try to determine which one is used + /***** + If individual softening is on there is only one option: + Header is formatted as follows: + N # # # + so read the first number and compute how particles should be distributed + + If individual softening is NOT enabled, i can be anything, but for ease I assume standard dumbp files: + no Header + ID mass x y z vx vy vz + now the next step is risky, we assume mass adds up to 1, so number of particles will be : 1 / mass + use this as initial particle distribution + + */ + + + char fullFileName[256]; + sprintf(fullFileName, "%s", fileName.c_str()); + + LOG("Trying to read file: %s \n", fullFileName); + + ifstream inputFile(fullFileName, ios::in); + + if(!inputFile.is_open()) + { + LOG("Can't open input file \n"); + exit(0); + } + + int NTotal; + int idummy; + real4 positions; + real4 velocity; + + #ifndef INDSOFT + inputFile >> idummy >> positions.w; + inputFile.seekg(0, ios::beg); //Reset file pointer + NTotal = (int)(1 / positions.w); + #else + //Read the Ntotal from the file header + inputFile >> NTotal >> NFirst >> NSecond >> NThird; + #endif + + + + //Rough divide + uint perProc = NTotal / procs; + bodyPositions.reserve(perProc+10); + bodyVelocities.reserve(perProc+10); + bodiesIDs.reserve(perProc+10); + perProc -= 1; + + //Start reading + int particleCount = 0; + int procCntr = 1; + + int globalParticleCount = 0; + + while(!inputFile.eof()) { + + inputFile >> idummy + >> positions.w >> positions.x >> positions.y >> positions.z + >> velocity.x >> velocity.y >> velocity.z; + + globalParticleCount++; + + if( globalParticleCount % reduce_bodies_factor == 0 ) + positions.w *= reduce_bodies_factor; + + if( globalParticleCount % reduce_bodies_factor != 0 ) + continue; + + #ifndef INDSOFT + velocity.w = sqrt(eps2); + #else + inputFile >> velocity.w; //Read the softening from the input file + #endif + + bodyPositions.push_back(positions); + bodyVelocities.push_back(velocity); + + #ifndef INDSOFT + idummy = particleCount; + #endif + + bodiesIDs.push_back(idummy); + + particleCount++; + + + if(bodyPositions.size() > perProc && procCntr != procs) + { + tree->ICSend(procCntr, &bodyPositions[0], &bodyVelocities[0], &bodiesIDs[0], (int)bodyPositions.size()); + procCntr++; + + bodyPositions.clear(); + bodyVelocities.clear(); + bodiesIDs.clear(); + } + }//end while + + inputFile.close(); + + //Clear the last one since its double + bodyPositions.resize(bodyPositions.size()-1); + NTotal2 = particleCount-1; + + LOGF(stderr, "NTotal: %d\tper proc: %d\tFor ourself: %d \n", NTotal, perProc, (int)bodiesIDs.size()); +} + +void read_tipsy_file_parallel(vector &bodyPositions, vector &bodyVelocities, + vector &bodiesIDs, float eps2, string fileName, + int rank, int procs, int &NTotal2, int &NFirst, + int &NSecond, int &NThird, octree *tree, + vector &dustPositions, vector &dustVelocities, + vector &dustIDs, int reduce_bodies_factor, + int reduce_dust_factor) +{ + //Process 0 does the file reading and sends the data + //to the other processes + /* + + Read in our custom version of the tipsy file format. + Most important change is that we store particle id on the + location where previously the potential was stored. + */ + + + char fullFileName[256]; + sprintf(fullFileName, "%s", fileName.c_str()); + + LOG("Trying to read file: %s \n", fullFileName); + + + + ifstream inputFile(fullFileName, ios::in | ios::binary); + if(!inputFile.is_open()) + { + LOG("Can't open input file \n"); + exit(0); + } + + dump h; + inputFile.read((char*)&h, sizeof(h)); + + int NTotal; + int idummy; + real4 positions; + real4 velocity; + + + //Read tipsy header + NTotal = h.nbodies; + NFirst = h.ndark; + NSecond = h.nstar; + NThird = h.nsph; + + tree->set_t_current((float) h.time); + + //Rough divide + uint perProc = (NTotal / procs) /reduce_bodies_factor; + bodyPositions.reserve(perProc+10); + bodyVelocities.reserve(perProc+10); + bodiesIDs.reserve(perProc+10); + perProc -= 1; + + //Start reading + int particleCount = 0; + int procCntr = 1; + + dark_particle d; + star_particle s; + + int globalParticleCount = 0; + int bodyCount = 0; + int dustCount = 0; + + for(int i=0; i < NTotal; i++) + { + if(i < NFirst) + { + inputFile.read((char*)&d, sizeof(d)); + velocity.w = d.eps; + positions.w = d.mass; + positions.x = d.pos[0]; + positions.y = d.pos[1]; + positions.z = d.pos[2]; + velocity.x = d.vel[0]; + velocity.y = d.vel[1]; + velocity.z = d.vel[2]; + idummy = d.phi; + } + else + { + inputFile.read((char*)&s, sizeof(s)); + velocity.w = s.eps; + positions.w = s.mass; + positions.x = s.pos[0]; + positions.y = s.pos[1]; + positions.z = s.pos[2]; + velocity.x = s.vel[0]; + velocity.y = s.vel[1]; + velocity.z = s.vel[2]; + idummy = s.phi; + } + + + if(positions.z < -10e10) + { + fprintf(stderr," Removing particle %d because of Z is: %f \n", globalParticleCount, positions.z); + continue; + } + + + globalParticleCount++; + + #ifdef USE_DUST + if(idummy >= 50000000 && idummy < 100000000) + { + dustCount++; + if( dustCount % reduce_dust_factor == 0 ) + positions.w *= reduce_dust_factor; + + if( dustCount % reduce_dust_factor != 0 ) + continue; + dustPositions.push_back(positions); + dustVelocities.push_back(velocity); + dustIDs.push_back(idummy); + } + else + { + bodyCount++; + if( bodyCount % reduce_bodies_factor == 0 ) + positions.w *= reduce_bodies_factor; + + if( bodyCount % reduce_bodies_factor != 0 ) + continue; + bodyPositions.push_back(positions); + bodyVelocities.push_back(velocity); + bodiesIDs.push_back(idummy); + } + + + #else + if( globalParticleCount % reduce_bodies_factor == 0 ) + positions.w *= reduce_bodies_factor; + + if( globalParticleCount % reduce_bodies_factor != 0 ) + continue; + bodyPositions.push_back(positions); + bodyVelocities.push_back(velocity); + bodiesIDs.push_back(idummy); + #endif + + particleCount++; + + + if(bodyPositions.size() > perProc && procCntr != procs) + { + tree->ICSend(procCntr, &bodyPositions[0], &bodyVelocities[0], &bodiesIDs[0], (int)bodyPositions.size()); + procCntr++; + + bodyPositions.clear(); + bodyVelocities.clear(); + bodiesIDs.clear(); + } + }//end while + + inputFile.close(); + + //Clear the last one since its double +// bodyPositions.resize(bodyPositions.size()-1); +// NTotal2 = particleCount-1; + NTotal2 = particleCount; + LOGF(stderr,"NTotal: %d\tper proc: %d\tFor ourself: %d \tNDust: %d \n", + NTotal, perProc, (int)bodiesIDs.size(), (int)dustPositions.size()); +} + + +void read_generate_cube(vector &bodyPositions, vector &bodyVelocities, + vector &bodiesIDs, float eps2, string fileName, + int rank, int procs, int &NTotal2, int &NFirst, + int &NSecond, int &NThird, octree *tree, + vector &dustPositions, vector &dustVelocities, + vector &dustIDs, int reduce_bodies_factor, + int reduce_dust_factor) +{ + //Process 0 does the file reading and sends the data + //to the other processes + /* + + Read in our custom version of the tipsy file format. + Most important change is that we store particle id on the + location where previously the potential was stored. + */ + + + int NTotal; + int idummy; + real4 positions; + real4 velocity; + + + //Read tipsy header + NTotal = (int)std::pow(2.0, 22); + NFirst = NTotal; + NSecond = 0; + NThird = 0; + + fprintf(stderr,"Going to generate a random cube , number of particles: %d \n", NTotal); + + tree->set_t_current((float) 0); + + //Rough divide + uint perProc = NTotal / procs; + bodyPositions.reserve(perProc+10); + bodyVelocities.reserve(perProc+10); + bodiesIDs.reserve(perProc+10); + perProc -= 1; + + //Start reading + int particleCount = 0; + int procCntr = 1; + + + int globalParticleCount = 0; + + float mass = 1.0 / NTotal; + + for(int i=0; i < NTotal; i++) + { + velocity.w = 0; + positions.w = mass; + positions.x = drand48(); + positions.y = drand48(); + positions.z = drand48(); + velocity.x = 0.001*drand48(); + velocity.y = 0.001*drand48(); + velocity.z = 0.001*drand48(); + + globalParticleCount++; + bodyPositions.push_back(positions); + bodyVelocities.push_back(velocity); + bodiesIDs.push_back(globalParticleCount); + + if(bodyPositions.size() > perProc && procCntr != procs) + { + tree->ICSend(procCntr, &bodyPositions[0], &bodyVelocities[0], &bodiesIDs[0], (int)bodyPositions.size()); + procCntr++; + + bodyPositions.clear(); + bodyVelocities.clear(); + bodiesIDs.clear(); + } + }//end while + + //Clear the last one since its double +// bodyPositions.resize(bodyPositions.size()-1); +// NTotal2 = particleCount-1; + NTotal2 = NTotal; + LOGF(stderr,"NTotal: %d\tper proc: %d\tFor ourself: %d \tNDust: %d \n", + NTotal, perProc, (int)bodiesIDs.size(), (int)dustPositions.size()); +} + + +double rot[3][3]; + +void rotmat(double i,double w) +{ + rot[0][0] = cos(w); + rot[0][1] = -cos(i)*sin(w); + rot[0][2] = -sin(i)*sin(w); + rot[1][0] = sin(w); + rot[1][1] = cos(i)*cos(w); + rot[1][2] = sin(i)*cos(w); + rot[2][0] = 0.0; + rot[2][1] = -sin(i); + rot[2][2] = cos(i); + fprintf(stderr,"%g %g %g\n",rot[0][0], rot[0][1], rot[0][2]); + fprintf(stderr,"%g %g %g\n",rot[1][0], rot[1][1], rot[1][2]); + fprintf(stderr,"%g %g %g\n",rot[2][0], rot[2][1], rot[2][2]); +} + +void rotate(double rot[3][3],float *vin) +{ + static double vout[3]; + + for(int i=0; i<3; i++) { + vout[i] = 0; + for(int j=0; j<3; j++) + vout[i] += rot[i][j] * vin[j]; + /* Remember the rotation matrix is the transpose of rot */ + } + for(int i=0; i<3; i++) + vin[i] = (float) vout[i]; +} + +void euler(vector &bodyPositions, + vector &bodyVelocities, + double inc, double omega) +{ + rotmat(inc,omega); + size_t nobj = bodyPositions.size(); + for(uint i=0; i < nobj; i++) + { + float r[3], v[3]; + r[0] = bodyPositions[i].x; + r[1] = bodyPositions[i].y; + r[2] = bodyPositions[i].z; + v[0] = bodyVelocities[i].x; + v[1] = bodyVelocities[i].y; + v[2] = bodyVelocities[i].z; + + rotate(rot,r); + rotate(rot,v); + + bodyPositions[i].x = r[0]; + bodyPositions[i].y = r[1]; + bodyPositions[i].z = r[2]; + bodyVelocities[i].x = v[0]; + bodyVelocities[i].y = v[1]; + bodyVelocities[i].z = v[2]; + } +} + + + +double centerGalaxy(vector &bodyPositions, + vector &bodyVelocities) +{ + size_t nobj = bodyPositions.size(); + float xc, yc, zc, vxc, vyc, vzc, mtot; + + + mtot = 0; + xc = yc = zc = vxc = vyc = vzc = 0; + for(uint i=0; i< nobj; i++) { + xc += bodyPositions[i].w*bodyPositions[i].x; + yc += bodyPositions[i].w*bodyPositions[i].y; + zc += bodyPositions[i].w*bodyPositions[i].z; + vxc += bodyPositions[i].w*bodyVelocities[i].x; + vyc += bodyPositions[i].w*bodyVelocities[i].y; + vzc += bodyPositions[i].w*bodyVelocities[i].z; + mtot += bodyPositions[i].w; + } + xc /= mtot; + yc /= mtot; + zc /= mtot; + vxc /= mtot; + vyc /= mtot; + vzc /= mtot; + for(uint i=0; i< nobj; i++) + { + bodyPositions[i].x -= xc; + bodyPositions[i].y -= yc; + bodyPositions[i].z -= zc; + bodyVelocities[i].x -= vxc; + bodyVelocities[i].y -= vyc; + bodyVelocities[i].z -= vzc; + } + + return mtot; +} + + + + +int setupMergerModel(vector &bodyPositions1, + vector &bodyVelocities1, + vector &bodyIDs1, + vector &bodyPositions2, + vector &bodyVelocities2, + vector &bodyIDs2){ + uint i; + double ds=1.0, vs, ms=1.0; + double mu1, mu2, vp; + double b=1.0, rsep=10.0; + double x, y, vx, vy, x1, y1, vx1, vy1 , x2, y2, vx2, vy2; + double theta, tcoll; + double inc1=0, omega1=0; + double inc2=0, omega2=0; + + + ds = 1.52; + ms = 1.0; + b = 10; + rsep = 168; + inc1 = 0; + omega1 = 0; + inc2 = 180; + omega2 = 0; + + + if(ds < 0) + { + cout << "Enter size ratio (for gal2): "; + cin >> ds; + cout << "Enter mass ratio (for gal2): "; + cin >> ms; + cout << "Enter relative impact parameter: "; + cin >> b; + + cout << "Enter initial separation: "; + cin >> rsep; + cout << "Enter Euler angles for first galaxy:\n"; + cout << "Enter inclination: "; + cin >> inc1; + cout << "Enter omega: "; + cin >> omega1; + cout << "Enter Euler angles for second galaxy:\n"; + cout << "Enter inclination: "; + cin >> inc2; + cout << "Enter omega: "; + cin >> omega2; + } + + + double inc1_inp, inc2_inp, om2_inp, om1_inp; + + inc1_inp = inc1; + inc2_inp = inc2; + om1_inp = omega1; + om2_inp = omega1; + + + inc1 *= M_PI/180.; + inc2 *= M_PI/180.; + omega1 *= M_PI/180.; + omega2 *= M_PI/180.; + omega1 += M_PI; + + fprintf(stderr,"Size ratio: %f Mass ratio: %f \n", ds, ms); + fprintf(stderr,"Relative impact par: %f Initial sep: %f \n", b, rsep); + fprintf(stderr,"Euler angles first: %f %f Second: %f %f \n", + inc1, omega1,inc2,omega2); + + vs = sqrt(ms/ds); /* adjustment for internal velocities */ + + + //Center everything in galaxy 1 and galaxy 2 + double galaxyMass1 = centerGalaxy(bodyPositions1, bodyVelocities1); + double galaxyMass2 = centerGalaxy(bodyPositions2, bodyVelocities2); + + + galaxyMass2 = ms*galaxyMass2; //Adjust total mass + + mu1 = galaxyMass2/(galaxyMass1 + galaxyMass2); + mu2 = -galaxyMass1/(galaxyMass1 + galaxyMass2); + + double m1 = galaxyMass1; + double m2 = galaxyMass2; + + + /* Relative Parabolic orbit - anti-clockwise */ + if( b > 0 ) { + vp = sqrt(2.0*(m1 + m2)/b); + x = 2*b - rsep; y = -2*sqrt(b*(rsep-b)); + vx = sqrt(b*(rsep-b))*vp/rsep; vy = b*vp/rsep; + } + else { + b = 0; + x = - rsep; y = 0.0; + vx = sqrt(2.0*(m1 + m2)/rsep); vy = 0.0; + } + + /* Calculate collison time */ + if( b > 0 ) { + theta = atan2(y,x); + tcoll = (0.5*tan(0.5*theta) + pow(tan(0.5*theta),3.0)/6.)*4*b/vp; + fprintf(stderr,"Collision time is t=%g\n",tcoll); + } + else { + tcoll = -pow(rsep,1.5)/(1.5*sqrt(2.0*(m1+m2))); + fprintf(stderr,"Collision time is t=%g\n",tcoll); + } + + /* These are the orbital adjustments for a parabolic encounter */ + /* Change to centre of mass frame */ + x1 = mu1*x; x2 = mu2*x; + y1 = mu1*y; y2 = mu2*y; + vx1 = mu1*vx; vx2 = mu2*vx; + vy1 = mu1*vy; vy2 = mu2*vy; + + + /* Rotate the galaxies */ + euler(bodyPositions1, bodyVelocities1, inc1,omega1); + euler(bodyPositions2, bodyVelocities2, inc2,omega2); + + for(i=0; i< bodyPositions1.size(); i++) { + bodyPositions1[i].x = (float) (bodyPositions1[i].x + x1); + bodyPositions1[i].y = (float) (bodyPositions1[i].y + y1); + bodyVelocities1[i].x = (float) (bodyVelocities1[i].x + vx1); + bodyVelocities1[i].y = (float) (bodyVelocities1[i].y + vy1); + } + /* Rescale and reset the second galaxy */ + for(i=0; i< bodyPositions2.size(); i++) { + bodyPositions2[i].w = (float) ms*bodyPositions2[i].w; + bodyPositions2[i].x = (float) (ds*bodyPositions2[i].x + x2); + bodyPositions2[i].y = (float) (ds*bodyPositions2[i].y + y2); + bodyPositions2[i].z = (float) ds*bodyPositions2[i].z; + bodyVelocities2[i].x = (float) (vs*bodyVelocities2[i].x + vx2); + bodyVelocities2[i].y = (float) (vs*bodyVelocities2[i].y + vy2); + bodyVelocities2[i].z = (float) vs*bodyVelocities2[i].z; + } + + + //Put them into one + bodyPositions1.insert(bodyPositions1.end(), bodyPositions2.begin(), bodyPositions2.end()); + bodyVelocities1.insert(bodyVelocities1.end(), bodyVelocities2.begin(), bodyVelocities2.end()); + bodyIDs1.insert(bodyIDs1.end(), bodyIDs2.begin(), bodyIDs2.end()); + + + return 0; +} + + +long long my_dev::base_mem::currentMemUsage; +long long my_dev::base_mem::maxMemUsage; + +int main(int argc, char** argv) +{ + my_dev::base_mem::currentMemUsage = 0; + my_dev::base_mem::maxMemUsage = 0; + + vector bodyPositions; + vector bodyVelocities; + vector bodyIDs; + + vector dustPositions; + vector dustVelocities; + vector dustIDs; + + + float eps = 0.05f; + float theta = 0.75f; + float timeStep = 1.0f / 16.0f; + float tEnd = 1; + int iterEnd = (1 << 30); + devID = 0; + renderDevID = 0; + + string fileName = ""; + string logFileName = "gpuLog.log"; + string snapshotFile = "snapshot_"; + float snapshotIter = -1; + float remoDistance = -1.0; + int snapShotAdd = 0; + int rebuild_tree_rate = 2; + int reduce_bodies_factor = 1; + int reduce_dust_factor = 1; + string fullScreenMode = ""; + bool direct = false; + bool fullscreen = false; + bool displayFPS = false; + bool diskmode = false; + bool stereo = false; + +#if ENABLE_LOG + ENABLE_RUNTIME_LOG = false; + PREPEND_RANK = false; +#endif + +#ifdef USE_OPENGL + TstartGlow = 0.0; + dTstartGlow = 1.0; +#endif + + int nPlummer = -1; + int nSphere = -1; + int nMilkyWay = -1; + int nMWfork = 4; + /************** beg - command line arguments ********/ +#if 1 + { + AnyOption opt; + +#define ADDUSAGE(line) {{std::stringstream oss; oss << line; opt.addUsage(oss.str());}} + + ADDUSAGE(" "); + ADDUSAGE("Usage"); + ADDUSAGE(" "); + ADDUSAGE(" -h --help Prints this help "); + ADDUSAGE(" -i --infile # Input snapshot filename "); + ADDUSAGE(" --logfile # Log filename [" << logFileName << "]"); + ADDUSAGE(" --dev # Device ID [" << devID << "]"); + ADDUSAGE(" --renderdev # Rendering Device ID [" << renderDevID << "]"); + ADDUSAGE(" -t --dt # time step [" << timeStep << "]"); + ADDUSAGE(" -T --tend # N-body end time [" << tEnd << "]"); + ADDUSAGE(" -I --iend # N-body end iteration [" << iterEnd << "]"); + ADDUSAGE(" -e --eps # softening (will be squared) [" << eps << "]"); + ADDUSAGE(" -o --theta # opening angle (theta) [" <get_time(); + + //Get parallel processing information + int procId = tree->mpiGetRank(); + int nProcs = tree->mpiGetNProcs(); + + if (procId == 0) + { + //NOte cant use LOGF here since MPI isnt initialized yet + cerr << "[INIT]\tUsed settings: \n"; + cerr << "[INIT]\tInput filename " << fileName << endl; + cerr << "[INIT]\tLog filename " << logFileName << endl; + cerr << "[INIT]\tTheta: \t\t" << theta << "\t\teps: \t\t" << eps << endl; + cerr << "[INIT]\tTimestep: \t" << timeStep << "\t\ttEnd: \t\t" << tEnd << endl; + cerr << "[INIT]\titerEnd: \t" << iterEnd << endl; + cerr << "[INIT]\tsnapshotFile: \t" << snapshotFile << "\tsnapshotIter: \t" << snapshotIter << endl; + cerr << "[INIT]\tInput file: \t" << fileName << "\t\tdevID: \t\t" << devID << endl; + cerr << "[INIT]\tRemove dist: \t" << remoDistance << endl; + cerr << "[INIT]\tSnapshot Addition: \t" << snapShotAdd << endl; + cerr << "[INIT]\tRebuild tree every " << rebuild_tree_rate << " timestep\n"; + + + if( reduce_bodies_factor > 1 ) + cout << "[INIT]\tReduce number of non-dust bodies by " << reduce_bodies_factor << " \n"; + if( reduce_dust_factor > 1 ) + cout << "[INIT]\tReduce number of dust bodies by " << reduce_dust_factor << " \n"; + +#if ENABLE_LOG + if (ENABLE_RUNTIME_LOG) + cerr << "[INIT]\tRuntime logging is ENABLED \n"; + else + cerr << "[INIT]\tRuntime logging is DISABLED \n"; +#endif + cerr << "[INIT]\tDirect gravitation is " << (direct ? "ENABLED" : "DISABLED") << endl; +#if USE_OPENGL + cerr << "[INIT]\tTglow = " << TstartGlow << endl; + cerr << "[INIT]\tdTglow = " << dTstartGlow << endl; + cerr << "[INIT]\tstereo = " << stereo << endl; +#endif +#ifdef USE_MPI + cerr << "[INIT]\tCode is built WITH MPI Support \n"; +#else + cerr << "[INIT]\tCode is built WITHOUT MPI Support \n"; +#endif + } + +#ifdef USE_MPI +#if 1 + omp_set_num_threads(16); +#pragma omp parallel + { + int tid = omp_get_thread_num(); + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + pthread_getaffinity_np(pthread_self() , sizeof( cpu_set_t ), &cpuset ); + + + int num_cores = sysconf(_SC_NPROCESSORS_ONLN); + + int i, set=-1; + for (i = 0; i < CPU_SETSIZE; i++) + if (CPU_ISSET(i, &cpuset)) + set = i; + // fprintf(stderr,"[Proc: %d ] Thread %d bound to: %d Total cores: %d\n", + // procId, tid, set, num_cores); + } +#endif + + + + +#if 0 + omp_set_num_threads(4); + //default + // int cpulist[] = {0,1,2,3,8,9,10,11}; + int cpulist[] = {0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15}; //HA-PACS + //int cpulist[] = {0,1,2,3,4,5,6,7}; + //int cpulist[] = {0,2,4,6, 8,10,12,14}; + //int cpulist[] = {1,3,5,7, 9,11,13,15}; + //int cpulist[] = {1,9,5,11, 3,7,13,15}; + //int cpulist[] = {1,15,3,13, 2,4,6,8}; + //int cpulist[] = {1,1,1,1, 1,1,1,1}; + + +#pragma omp parallel + { + int tid = omp_get_thread_num(); + //int core_id = procId*4+tid; + int core_id = (procId%4)*4+tid; + core_id = cpulist[core_id]; + + int num_cores = sysconf(_SC_NPROCESSORS_ONLN); + // if (core_id >= num_cores) + + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(core_id, &cpuset); + pthread_t current_thread = pthread_self(); + int return_val = pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset); + + CPU_ZERO(&cpuset); + pthread_getaffinity_np(pthread_self() , sizeof( cpu_set_t ), &cpuset ); + + int i, set=-1; + for (i = 0; i < CPU_SETSIZE; i++) + if (CPU_ISSET(i, &cpuset)) + set = i; + //printf("CPU2: CPU %d\n", i); + + + fprintf(stderr,"Binding thread: %d of rank: %d to cpu: %d CHECK: %d Total cores: %d\n", + tid, procId, core_id, set, num_cores); + + } +#endif +#endif + + + + + +#if ENABLE_LOG +#ifdef USE_MPI + PREPEND_RANK_PROCID = procId; + PREPEND_RANK_NPROCS = nProcs; +#endif +#endif + + + if(nProcs > 1) + { + logFileName.append("-"); + + char buff[16]; + sprintf(buff,"%d-%d", nProcs, procId); + logFileName.append(buff); + } + + ofstream logFile(logFileName.c_str()); + + tree->set_context(logFile, false); //Do logging to file and enable timing (false = enabled) + + if (nPlummer == -1 && nSphere == -1 && !diskmode && nMilkyWay == -1) + { + if(procId == 0) + { +#ifdef TIPSYOUTPUT + read_tipsy_file_parallel(bodyPositions, bodyVelocities, bodyIDs, eps, fileName, + procId, nProcs, NTotal, NFirst, NSecond, NThird, tree, + dustPositions, dustVelocities, dustIDs, reduce_bodies_factor, reduce_dust_factor); + + // read_generate_cube(bodyPositions, bodyVelocities, bodyIDs, eps, fileName, + // procId, nProcs, NTotal, NFirst, NSecond, NThird, tree, + // dustPositions, dustVelocities, dustIDs, reduce_bodies_factor, reduce_dust_factor); + +#else + read_dumbp_file_parallel(bodyPositions, bodyVelocities, bodyIDs, eps, fileName, procId, nProcs, NTotal, NFirst, NSecond, NThird, tree, reduce_bodies_factor); +#endif + } + else + { + tree->ICRecv(0, bodyPositions, bodyVelocities, bodyIDs); + } + } + else if(nMilkyWay >= 0) + { +#ifdef GALACTICS + if (procId == 0) printf("Using MilkyWay model with n= %d per proc, forked %d times \n", nMilkyWay, nMWfork); + assert(nMilkyWay > 0); + assert(nMWfork > 0); + + +#if 0 /* in this setup all particles will be of equal mass (exact number are galactic-depednant) */ + const float fdisk = 15.1; + const float fbulge = 5.1; + const float fhalo = 242.31; +#else /* here, bulge & mw particles have the same mass, but halo particles is 32x heavier */ + const float fdisk = 15.1; + const float fbulge = 5.1; + const float fhalo = 7.5; +#endif + + const float fsum = fdisk + fhalo + fbulge; + + const int ndisk = (int)(nMilkyWay * fdisk/fsum); + const int nbulge = (int)(nMilkyWay * fbulge/fsum); + const int nhalo = (int)(nMilkyWay * fhalo/fsum); + + assert(ndisk > 0); + assert(nbulge > 0); + assert(nhalo > 0); + + const double t0 = tree->get_time(); + const Galactics g(procId, nProcs, ndisk, nbulge, nhalo, nMWfork); + const double dt = tree->get_time() - t0; + if (procId == 0) + printf(" ndisk= %d nbulge= %d nhalo= %d :: ntotal= %d in %g sec\n", + g.get_ndisk(), g.get_nbulge(), g.get_nhalo(), g.get_ntot(), dt); + + const int ntot = g.get_ntot(); + bodyPositions.resize(ntot); + bodyVelocities.resize(ntot); + bodyIDs.resize(ntot); + for (int i= 0; i < ntot; i++) + { + assert(!std::isnan(g[i].x)); + assert(!std::isnan(g[i].y)); + assert(!std::isnan(g[i].z)); + assert(g[i].mass > 0.0); + bodyIDs[i] = g[i].id; + + bodyPositions[i].x = g[i].x; + bodyPositions[i].y = g[i].y; + bodyPositions[i].z = g[i].z; + bodyPositions[i].w = g[i].mass * 1.0/(double)nProcs; + + assert(!std::isnan(g[i].vx)); + assert(!std::isnan(g[i].vy)); + assert(!std::isnan(g[i].vz)); + + bodyVelocities[i].x = g[i].vx; + bodyVelocities[i].y = g[i].vy; + bodyVelocities[i].z = g[i].vz; + bodyVelocities[i].w = 0.0; + } +#else + assert(0); +#endif + } + else if(nPlummer >= 0) + { + if (procId == 0) printf("Using plummer model with n= %d per proc \n", nPlummer); + assert(nPlummer > 0); + const int seed = 19810614 + procId; + const Plummer m(nPlummer, procId, seed); + bodyPositions.resize(nPlummer); + bodyVelocities.resize(nPlummer); + bodyIDs.resize(nPlummer); + for (int i= 0; i < nPlummer; i++) + { + + assert(!std::isnan(m.pos[i].x)); + assert(!std::isnan(m.pos[i].y)); + assert(!std::isnan(m.pos[i].z)); + assert(m.mass[i] > 0.0); + bodyIDs[i] = nPlummer*procId + i; + + bodyPositions[i].x = m.pos[i].x; + bodyPositions[i].y = m.pos[i].y; + bodyPositions[i].z = m.pos[i].z; + bodyPositions[i].w = m.mass[i] * 1.0/nProcs; + + bodyVelocities[i].x = m.vel[i].x; + bodyVelocities[i].y = m.vel[i].y; + bodyVelocities[i].z = m.vel[i].z; + bodyVelocities[i].w = 0; + } + } + else if (nSphere >= 0) + { + //Sphere + if (procId == 0) printf("Using Spherical model with n= %d per proc \n", nSphere); + assert(nSphere >= 0); + bodyPositions.resize(nSphere); + bodyVelocities.resize(nSphere); + bodyIDs.resize(nSphere); + + srand48(procId+19840501); + + /* generate uniform sphere */ + int np = 0; + while (np < nSphere) + { + const double x = 2.0*drand48()-1.0; + const double y = 2.0*drand48()-1.0; + const double z = 2.0*drand48()-1.0; + const double r2 = x*x+y*y+z*z; + if (r2 < 1) + { + bodyIDs[np] = nSphere*procId + np; + + bodyPositions[np].x = x; + bodyPositions[np].y = y; + bodyPositions[np].z = z; + bodyPositions[np].w = (1.0/nSphere) * 1.0/nProcs; + + bodyVelocities[np].x = 0; + bodyVelocities[np].y = 0; + bodyVelocities[np].z = 0; + bodyVelocities[np].w = 0; + np++; + }//if + }//while + }//else + else if (diskmode) + { + if (procId == 0) printf("Using diskmode with filename %s\n", fileName.c_str()); + const int seed = procId+19840501; + srand48(seed); + const DiskShuffle disk(fileName); + const int np = disk.get_ntot(); + bodyPositions.resize(np); + bodyVelocities.resize(np); + bodyIDs.resize(np); + for (int i= 0; i < np; i++) + { + bodyIDs[i] = np*procId + i; + + bodyPositions[i].x = disk.pos(i).x; + bodyPositions[i].y = disk.pos (i).y; + bodyPositions[i].z = disk.pos (i).z; + bodyPositions[i].w = disk.mass(i) * 1.0/nProcs; + + bodyVelocities[i].x = disk.vel(i).x; + bodyVelocities[i].y = disk.vel(i).y; + bodyVelocities[i].z = disk.vel(i).z; + bodyVelocities[i].w = 0; + } + } + else + assert(0); + + tree->mpiSync(); + + +#ifdef TIPSYOUTPUT + LOGF(stderr, " t_current = %g\n", tree->get_t_current()); +#endif + + + //#define SETUP_MERGER +#ifdef SETUP_MERGER + vector bodyPositions2; + vector bodyVelocities2; + vector bodyIDs2; + + bodyPositions2.insert(bodyPositions2.begin(), bodyPositions.begin(), bodyPositions.end()); + bodyVelocities2.insert(bodyVelocities2.begin(), bodyVelocities.begin(), bodyVelocities.end()); + bodyIDs2.insert(bodyIDs2.begin(), bodyIDs.begin(), bodyIDs.end()); + + + setupMergerModel(bodyPositions, bodyVelocities, bodyIDs, + bodyPositions2, bodyVelocities2, bodyIDs2); + + NTotal *= 2; + NFirst *= 2; + NSecond *= 2; + NThird *= 2; +#endif + + + //Set the properties of the data set, it only is really used by process 0, which does the + //actual file I/O + tree->setDataSetProperties(NTotal, NFirst, NSecond, NThird); + + if(procId == 0) + LOG("Dataset particle information: Ntotal: %d\tNFirst: %d\tNSecond: %d\tNThird: %d \n", + NTotal, NFirst, NSecond, NThird); + + + //Sanity check for standard plummer spheres + double mass = 0, totalMass; + for(unsigned int i=0; i < bodyPositions.size(); i++) + { + mass += bodyPositions[i].w; + } + + tree->load_kernels(); + +#ifdef USE_MPI + MPI_Reduce(&mass,&totalMass,1, MPI_DOUBLE, MPI_SUM,0, MPI_COMM_WORLD); +#else + totalMass = mass; +#endif + + if(procId == 0) LOGF(stderr, "Combined Mass: %f \tNTotal: %d \n", totalMass, NTotal); + + LOG("Starting! Bootup time: %lg \n", tree->get_time()-tStartup); + + + double t0 = tree->get_time(); + + tree->localTree.setN((int)bodyPositions.size()); + tree->allocateParticleMemory(tree->localTree); + + //Load data onto the device + for(uint i=0; i < bodyPositions.size(); i++) + { + tree->localTree.bodies_pos[i] = bodyPositions[i]; + tree->localTree.bodies_vel[i] = bodyVelocities[i]; + tree->localTree.bodies_ids[i] = bodyIDs[i]; + + tree->localTree.bodies_Ppos[i] = bodyPositions[i]; + tree->localTree.bodies_Pvel[i] = bodyVelocities[i]; + tree->localTree.bodies_time[i] = make_float2(tree->get_t_current(), tree->get_t_current()); + } + + tree->localTree.bodies_time.h2d(); + tree->localTree.bodies_pos.h2d(); + tree->localTree.bodies_vel.h2d(); + tree->localTree.bodies_Ppos.h2d(); + tree->localTree.bodies_Pvel.h2d(); + tree->localTree.bodies_ids.h2d(); + + //fprintf(stderr,"Send data to device proc: %d \n", procId); + // tree->devContext.writeLogEvent("Send data to device\n"); + + + +#ifdef USE_MPI + //Use sampling particles, determine frequency + tree->mpiSumParticleCount(tree->localTree.n); //Determine initial frequency +#endif + + + //If required set the dust particles +#ifdef USE_DUST + if( (int)dustPositions.size() > 0) + { + LOGF(stderr, "Allocating dust properties for %d dust particles \n", + (int)dustPositions.size()); + tree->localTree.setNDust((int)dustPositions.size()); + tree->allocateDustMemory(tree->localTree); + + //Load dust data onto the device + for(uint i=0; i < dustPositions.size(); i++) + { + tree->localTree.dust_pos[i] = dustPositions[i]; + tree->localTree.dust_vel[i] = dustVelocities[i]; + tree->localTree.dust_ids[i] = dustIDs[i]; + } + + tree->localTree.dust_pos.h2d(); + tree->localTree.dust_vel.h2d(); + tree->localTree.dust_ids.h2d(); + } +#endif //ifdef USE_DUST + + +#ifdef USE_MPI + //Startup the OMP threads + omp_set_num_threads(4); +#endif + + + //Start the integration +#ifdef USE_OPENGL + octree::IterationData idata; + initAppRenderer(argc, argv, tree, idata, displayFPS, stereo); + LOG("Finished!!! Took in total: %lg sec\n", tree->get_time()-t0); +#else + tree->mpiSync(); + if (procId==0) + fprintf(stderr, " Starting iterating\n"); + tree->mpiSync(); + tree->iterate(); + + LOG("Finished!!! Took in total: %lg sec\n", tree->get_time()-t0); + + logFile.close(); + +#ifdef USE_MPI + MPI_Finalize(); +#endif + + if(tree->procId == 0) + { + LOGF(stderr, "TOTAL: Time spent between the start of 'iterate' and the final time-step (very first step is not accounted)\n",0); + LOGF(stderr, "Grav: Time spent to compute gravity, including communication (wall-clock time)\n",0); + LOGF(stderr, "GPUgrav: Time spent ON the GPU to compute local and LET gravity\n",0); + LOGF(stderr, "LET Com: Time spent in exchanging and building LET data\n",0); + LOGF(stderr, "Build: Time spent in constructing the tree (incl sorting, making groups, etc.)\n",0); + LOGF(stderr, "Domain: Time spent in computing new domain decomposition and exchanging particles between nodes.\n",0); + LOGF(stderr, "Wait: Time spent in waiting on other processes after the gravity part.\n",0); + } + + + delete tree; + tree = NULL; +#endif + + displayTimers(); + return 0; +} diff --git a/ptxtools/ptx.ll b/ptxtools/ptx.ll new file mode 100644 index 00000000..0c3a0fd4 --- /dev/null +++ b/ptxtools/ptx.ll @@ -0,0 +1,79 @@ +%option yylineno +%option noyywrap +%option yyclass="parser::PTXLexer" +%option prefix="ptx" +%option c++ + +%{ +#include "PTXLexer.h" +#include +#include +#include +#ifdef LLSETTOKEN +#error "TOKEN is defined" +#endif +#define LLSETTOKEN(tok) yylval->ivalue = tok; return tok; +%} + +COMMENT ("//"[^\n]*) +TAB [\t]* + +%% +{COMMENT} {nextColumn += strlen(yytext); /* lCppComment(&yylloc); */ } +".version" { return TOKEN_VERSION; } +".target" { return TOKEN_TARGET; } +".address_size" { return TOKEN_ADDRESS_SIZE; } +".func" { return TOKEN_FUNC; } +".entry" { return TOKEN_ENTRY; } +".align" { return TOKEN_ALIGN; } +".visible" { return TOKEN_VISIBLE; } +".global" { return TOKEN_GLOBAL; } +".param" { return TOKEN_PARAM; } +".b0" { LLSETTOKEN( TOKEN_B32);} /* fix for buggy llvm-ptx generator */ +".b8" { LLSETTOKEN( TOKEN_B8);} +".b16" { LLSETTOKEN( TOKEN_B16);} +".b32" { LLSETTOKEN( TOKEN_B32);} +".b64" { LLSETTOKEN( TOKEN_B64);} +".u8" { LLSETTOKEN( TOKEN_U8);} +".u16" { LLSETTOKEN( TOKEN_U16);} +".u32" { LLSETTOKEN( TOKEN_U32);} +".u64" { LLSETTOKEN( TOKEN_U64);} +".s8" { LLSETTOKEN( TOKEN_S8);} +".s16" { LLSETTOKEN( TOKEN_S16);} +".s32" { LLSETTOKEN( TOKEN_S32);} +".s64" { LLSETTOKEN( TOKEN_S64);} +".f32" { LLSETTOKEN( TOKEN_F32);} +".f64" { LLSETTOKEN( TOKEN_F64);} +"[" { return '[';} +"]" { return ']';} +"(" { return '(';} +")" { return ')';} +"," { return ',';} +";" { return ';';} +"=" { return '=';} +[0-9]+\.[0-9]+ { yylval->fvalue = atof(yytext); return TOKEN_FLOAT; } +[0-9]+ { yylval->ivalue = atoi(yytext); return TOKEN_INT; } +[a-zA-Z0-9_]+ { strcpy(yylval->svalue, yytext); return TOKEN_STRING;} +\n { + // yylloc.last_line++; +// yylloc.last_column = 1; + nextColumn = 1; +} +. ; +%% + +/** Handle a C++-style comment--eat everything up until the end of the line. + */ +#if 0 +static void +lCppComment(SourcePos *pos) { + char c; + do { + c = yyinput(); + } while (c != 0 && c != '\n'); + if (c == '\n') { + pos->last_line++; + pos->last_column = 1; + } +} +#endif diff --git a/ptxtools/ptxcc.cpp b/ptxtools/ptxcc.cpp new file mode 100644 index 00000000..dad7452d --- /dev/null +++ b/ptxtools/ptxcc.cpp @@ -0,0 +1,275 @@ +#include +#include +#include +#include +#include +#include +#include +#include "PTXParser.h" + + +/* + * The C++ code below is based on the following bash-script: + #!/bin/sh + + PTXSRC=$1__tmp_ptx.ptx + PTXCU=$1___tmp_ptx.cu + PTXSH=$1___tmp_ptx.sh + + NVCCPARM=${@:2} + + DEPTX=dePTX + NVCC=nvcc + + $(cat $1 | sed 's/\.b0/\.b32/g' > $PTXSRC) && + $DEPTX < $PTXSRC > $PTXCU && + $NVCC -arch=sm_35 -dc $NVCCPARM -dryrun $PTXCU 2>&1 | \ + sed 's/\#\$//g'| \ + awk '{ if ($1 == "LIBRARIES=") print $1$2; else if ($1 == "cicc") print "cp '$PTXSRC'", $NF; else print $0 }' > $PTXSH && + sh $PTXSH + + # rm $PTXCU $PTXSH + * + */ + +static char lRandomAlNum() +{ + const char charset[] = + "0123456789" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz"; + const size_t max_index = (sizeof(charset) - 1); + return charset[ rand() % max_index ]; +} + +static std::string lRandomString(const size_t length) +{ + timeval t1; + gettimeofday(&t1, NULL); + srand(t1.tv_usec * t1.tv_sec); + std::string str(length,0); + std::generate_n( str.begin(), length, lRandomAlNum); + return str; +} + +static void lGetAllArgs(int Argc, char *Argv[], int &argc, char *argv[128]) +{ + // Copy over the command line arguments (passed in) + for (int i = 0; i < Argc; ++i) + argv[i] = Argv[i]; + argc = Argc; +} +const char *lGetExt (const char *fspec) +{ + char *e = strrchr (fspec, '.'); + return e; +} + +static std::vector lSplitString(const std::string &s, char delim) +{ + std::vector elems; + std::stringstream ss(s); + std::string item; + while (std::getline(ss, item, delim)) { + if (!item.empty()) + elems.push_back(item); + } + return elems; +} + +static void lUsage(const int ret) +{ + fprintf(stdout, "\nusage: ptxcc [options] file.ptx \n"); + fprintf(stdout, " [--help]\t\t\t\t This help\n"); + fprintf(stdout, " [--verbose]\t\t\t\t Be verbose\n"); + fprintf(stdout, " [--arch={%s}]\t\t\t GPU target architecture\n", "sm_35"); + fprintf(stdout, " [-o ]\t\t\t\t Output file name\n"); + fprintf(stdout, " [-Xnvcc=]\t\t Arguments to pass through to \"nvcc\"\n"); + fprintf(stdout, " \n"); + exit(ret); +} + +int main(int _argc, char * _argv[]) +{ + int argc; + char *argv[128]; + lGetAllArgs(_argc, _argv, argc, argv); + + std::string arch="sm_35"; + std::string filePTX; + std::string fileOBJ; + std::string extString = ".ptx"; + bool keepTemporaries = false; + bool verbose = false; + std::string nvccArguments; + + for (int i = 1; i < argc; ++i) + { + if (!strcmp(argv[i], "--help")) + lUsage(0); + else if (!strncmp(argv[i], "--arch=", 7)) + arch = std::string(argv[i]+7); + else if (!strncmp(argv[i], "--keep-temporaries", 11)) + keepTemporaries = true; + else if (!strncmp(argv[i], "--verbose", 9)) + verbose = true; + else if (!strncmp(argv[i], "-Xnvcc=", 7)) + nvccArguments = std::string(argv[i]+7); + else if (!strcmp(argv[i], "-o")) + { + if (++i == argc) + { + fprintf(stderr, "No output file specified after -o option.\n"); + lUsage(1); + } + fileOBJ = std::string(argv[i]); + } + else + { + const char * ext = strrchr(argv[i], '.'); + if (ext == NULL) + { + fprintf(stderr, " Unknown argument: %s \n", argv[i]); + lUsage(1); + } + else if (strncmp(ext, extString.c_str(), 4)) + { + fprintf(stderr, " Unkown extension of the input file: %s \n", ext); + lUsage(1); + } + else if (filePTX.empty()) + { + filePTX = std::string(argv[i]); + if (fileOBJ.empty()) + { + char * baseName = argv[i]; + while (baseName != ext) + fileOBJ += std::string(baseName++,1); + } + fileOBJ += ".o"; + } + } + } +#if 0 + fprintf(stderr, " fileOBJ= %s\n", fileOBJ.c_str()); + fprintf(stderr, " arch= %s\n", arch.c_str()); + fprintf(stderr, " file= %s\n", filePTX.empty() ? "$stdin" : filePTX.c_str()); + fprintf(stderr, " num_args= %d\n", (int)nvccArgumentList.size()); + for (int i= 0; i < (int)nvccArgumentList.size(); i++) + fprintf(stderr, " arg= %d : %s \n", i, nvccArgumentList[i].c_str()); +#endif + assert(arch == std::string("sm_35")); + if (filePTX.empty()) + { + fprintf(stderr, "ptxcc fatal : No input file specified; use option --help for more information\n"); + exit(1); + } + + // open a file handle to a particular file: + std::ifstream inputPTX(filePTX.c_str()); + if (!inputPTX) + { + fprintf(stderr, "ptxcc: error: %s: No such file\n", filePTX.c_str()); + exit(1); + } + + std::string randomBaseName = std::string("/tmp/") + lRandomString(8) + "_" + lSplitString(lSplitString(filePTX,'/').back(),'.')[0]; + if (verbose) + fprintf(stderr, "baseFileName= %s\n", randomBaseName.c_str()); + + std::string fileCU= randomBaseName + ".cu"; + std::ofstream outputCU(fileCU.c_str()); + assert(outputCU); + + std::istream & input = inputPTX; + std::ostream & output = outputCU; + std::ostream & error = std::cerr; + parser::PTXLexer lexer(&input, &error); + parser::PTXParser state(output); + + // parse through the input until there is no more: + // + + do { + ptx::yyparse(lexer, state); + } + while (!input.eof()); + + inputPTX.close(); + outputCU.close(); + + // process output from nvcc + // + /* nvcc -dc -arch=$arch -dryrun -argumentlist fileCU */ + + std::string fileSH= randomBaseName + ".sh"; + + std::string nvccExe("nvcc"); + std::string nvccCmd; + nvccCmd += nvccExe + std::string(" "); + nvccCmd += "-dc "; + nvccCmd += std::string("-arch=") + arch + std::string(" "); + nvccCmd += "-dryrun "; + nvccCmd += nvccArguments + std::string(" "); + nvccCmd += std::string("-o ") + fileOBJ + std::string(" "); + nvccCmd += fileCU + std::string(" "); + nvccCmd += std::string("2> ") + fileSH; + if (verbose) + fprintf(stderr , "%s\n", nvccCmd.c_str()); + const int nvccRet = std::system(nvccCmd.c_str()); + if (nvccRet) + fprintf(stderr, "FAIL: %s\n", nvccCmd.c_str()); + + + std::ifstream inputSH(fileSH.c_str()); + assert(inputSH); + std::vector nvccSteps; + while (!inputSH.eof()) + { + nvccSteps.push_back(std::string()); + std::getline(inputSH, nvccSteps.back()); + if (nvccRet) + fprintf(stderr, " %s\n", nvccSteps.back().c_str()); + } + inputSH.close(); + if (nvccRet) + exit(-1); + + + for (int i = 0; i < (int)nvccSteps.size(); i++) + { + std::string cmd = nvccSteps[i]; + for (int j = 0; j < (int)cmd.size()-1; j++) + if (cmd[j] == '#' && cmd[j+1] == '$') + cmd[j] = cmd[j+1] = ' '; + std::vector splitCmd = lSplitString(cmd, ' '); + + if (!splitCmd.empty()) + { + if (splitCmd[0] == std::string("cicc")) + cmd = std::string(" cp ") + filePTX + std::string(" ") + splitCmd.back(); + if (splitCmd[0] == std::string("LIBRARIES=")) + cmd = ""; + } + nvccSteps[i] = cmd; + if (verbose) + fprintf(stderr, "%3d: %s\n", i, cmd.c_str()); + const int ret = std::system(cmd.c_str()); + if (ret) + { + fprintf(stderr, " Something went wrong .. \n"); + for (int j = 0; j < i; j++) + fprintf(stderr, "PASS: %s\n", nvccSteps[j].c_str()); + fprintf(stderr, "FAIL: %s\n", nvccSteps[i].c_str()); + exit(-1); + } + } + + if (!keepTemporaries) + { + /* remove temporaries */ + } + + + +} diff --git a/ptxtools/ptxgen.cpp b/ptxtools/ptxgen.cpp new file mode 100644 index 00000000..a05124b6 --- /dev/null +++ b/ptxtools/ptxgen.cpp @@ -0,0 +1,408 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +template +static std::string lValueToString(const T& value) +{ + std::ostringstream oss; + oss << value; + return oss.str(); +} + +typedef struct stat Stat; + + +#define PTXGENStatus int +enum { + PTXGEN_SUCCESS = 0x0000, + PTXGEN_FILE_IO_ERROR = 0x0001, + PTXGEN_BAD_ALLOC_ERROR = 0x0002, + PTXGEN_LIBNVVM_COMPILATION_ERROR = 0x0004, + PTXGEN_LIBNVVM_ERROR = 0x0008, + PTXGEN_INVALID_USAGE = 0x0010, + PTXGEN_LIBNVVM_HOME_UNDEFINED = 0x0020, + PTXGEN_LIBNVVM_VERIFICATION_ERROR = 0x0040 +}; + +static PTXGENStatus getLibDeviceName(const int computeArch, std::string &libDeviceName) +{ + const char *env = getenv("LIBNVVM_HOME"); +#ifdef LIBNVVM_HOME +#define STRINGIFY(x) #x +#define TOSTRING(x) STRINGIFY(x) + const std::string libnvvmPath(env ? env : TOSTRING(LIBNVVM_HOME)); +#undef TOSTRING +#undef STRINGIFY +#else + const std::string libnvvmPath(env); +#endif + + if (libnvvmPath.empty()) + { + fprintf(stderr, "The environment variable LIBNVVM_HOME is undefined\n"); + return PTXGEN_LIBNVVM_HOME_UNDEFINED; + } + + /* Use libdevice for compute_20, if the target is not compute_20, compute_30, + * or compute_35. */ + const std::string libdevice = + std::string("/libdevice/libdevice.compute_") + + lValueToString(computeArch)+ "." + + lValueToString(LIBDEVICE_MAJOR_VERSION) + + lValueToString(LIBDEVICE_MINOR_VERSION) + + ".bc"; + + libDeviceName = libnvvmPath + libdevice; + + return PTXGEN_SUCCESS; +} + +static PTXGENStatus addFileToProgram(const std::string &filename, nvvmProgram prog) +{ + char *buffer; + size_t size; + Stat fileStat; + + /* Open the input file. */ + FILE *f = fopen(filename.c_str(), "rb"); + if (f == NULL) { + fprintf(stderr, "Failed to open %s\n", filename.c_str()); + return PTXGEN_FILE_IO_ERROR; + } + + /* Allocate buffer for the input. */ + fstat(fileno(f), &fileStat); + buffer = (char *) malloc(fileStat.st_size); + if (buffer == NULL) { + fprintf(stderr, "Failed to allocate memory\n"); + return PTXGEN_BAD_ALLOC_ERROR; + } + size = fread(buffer, 1, fileStat.st_size, f); + if (ferror(f)) { + fprintf(stderr, "Failed to read %s\n", filename.c_str()); + fclose(f); + free(buffer); + return PTXGEN_FILE_IO_ERROR; + } + fclose(f); + + if (nvvmAddModuleToProgram(prog, buffer, size, filename.c_str()) != NVVM_SUCCESS) { + fprintf(stderr, + "Failed to add the module %s to the compilation unit\n", + filename.c_str()); + free(buffer); + return PTXGEN_LIBNVVM_ERROR; + } + + free(buffer); + return PTXGEN_SUCCESS; +} + +static PTXGENStatus generatePTX( + std::vector nvvmOptions, + std::vector nvvmFiles, + std::ostream &out, + const int computeArch) +{ + nvvmProgram prog; + PTXGENStatus status; + + /* Create the compiliation unit. */ + if (nvvmCreateProgram(&prog) != NVVM_SUCCESS) + { + fprintf(stderr, "Failed to create the compilation unit\n"); + return PTXGEN_LIBNVVM_ERROR; + } + + + /* Add libdevice. */ + std::string libDeviceName; + status = getLibDeviceName(computeArch, libDeviceName); + if (status != PTXGEN_SUCCESS) + { + nvvmDestroyProgram(&prog); + return status; + } + status = addFileToProgram(libDeviceName, prog); + if (status != PTXGEN_SUCCESS) + { + fprintf(stderr, "Please double-check LIBNVVM_HOME environmental variable.\n"); + nvvmDestroyProgram(&prog); + return status; + } + + /* Add the module to the compilation unit. */ + for (int i = 0; i < (int)nvvmFiles.size(); ++i) + { + status = addFileToProgram(nvvmFiles[i], prog); + if (status != PTXGEN_SUCCESS) + { + nvvmDestroyProgram(&prog); + return status; + } + } + + const int numOptions = nvvmOptions.size(); + std::vector options(numOptions); + for (int i = 0; i < numOptions; i++) + options[i] = nvvmOptions[i].c_str(); + + /* Verify the compilation unit. */ + if (nvvmVerifyProgram(prog, numOptions, &options[0]) != NVVM_SUCCESS) + { + fprintf(stderr, "Failed to verify the compilation unit\n"); + status |= PTXGEN_LIBNVVM_VERIFICATION_ERROR; + } + + /* Print warnings and errors. */ + { + size_t logSize; + if (nvvmGetProgramLogSize(prog, &logSize) != NVVM_SUCCESS) + { + fprintf(stderr, "Failed to get the compilation log size\n"); + status |= PTXGEN_LIBNVVM_ERROR; + } + else + { + std::string log(logSize,0); + if (nvvmGetProgramLog(prog, &log[0]) != NVVM_SUCCESS) + { + fprintf(stderr, "Failed to get the compilation log\n"); + status |= PTXGEN_LIBNVVM_ERROR; + } + else + { + fprintf(stderr, "%s\n", log.c_str()); + } + } + } + + if (status & PTXGEN_LIBNVVM_VERIFICATION_ERROR) + { + nvvmDestroyProgram(&prog); + return status; + } + + /* Compile the compilation unit. */ + if (nvvmCompileProgram(prog, numOptions, &options[0]) != NVVM_SUCCESS) + { + fprintf(stderr, "Failed to generate PTX from the compilation unit\n"); + status |= PTXGEN_LIBNVVM_COMPILATION_ERROR; + } + else + { + size_t ptxSize; + if (nvvmGetCompiledResultSize(prog, &ptxSize) != NVVM_SUCCESS) + { + fprintf(stderr, "Failed to get the PTX output size\n"); + status |= PTXGEN_LIBNVVM_ERROR; + } + else + { + std::string ptx(ptxSize,0); + if (nvvmGetCompiledResult(prog, &ptx[0]) != NVVM_SUCCESS) + { + fprintf(stderr, "Failed to get the PTX output\n"); + status |= PTXGEN_LIBNVVM_ERROR; + } + else + { + out << ptx; + } + } + } + + /* Print warnings and errors. */ + { + size_t logSize; + if (nvvmGetProgramLogSize(prog, &logSize) != NVVM_SUCCESS) + { + fprintf(stderr, "Failed to get the compilation log size\n"); + status |= PTXGEN_LIBNVVM_ERROR; + } + else + { + std::string log(logSize,0); + if (nvvmGetProgramLog(prog, &log[0]) != NVVM_SUCCESS) + { + fprintf(stderr, "Failed to get the compilation log\n"); + status |= PTXGEN_LIBNVVM_ERROR; + } + else + { + fprintf(stderr, "%s\n", log.c_str()); + } + } + } + + /* Release the resources. */ + nvvmDestroyProgram(&prog); + + return PTXGEN_SUCCESS; +} + +static void showUsage() +{ + fprintf(stderr,"Usage: ptxgen [OPTION]... [FILE]...\n" + " [FILE] could be a .bc file or a .ll file\n"); +} + +static void lUsage(const int ret) +{ + fprintf(stdout, "\nusage: ptxgen [options] file.[ll,bc] \n"); + fprintf(stdout, " [--help]\t\t This help\n"); + fprintf(stdout, " [--verbose]\t\t Be verbose\n"); + fprintf(stdout, " [--arch={%s}]\t GPU target architecture\n", "sm_35"); + fprintf(stdout, " [-o ]\t\t Output file name\n"); + fprintf(stdout, " [-g]\t\t Enable generation of debuggin information \n"); + fprintf(stdout, " [--opt=]\t\t Optimization parameters \n"); + fprintf(stdout, " \t\t\t 0 - disable optimizations \n"); + fprintf(stdout, " \t\t\t 3 - defalt, enable optimizations \n"); + fprintf(stdout, " [--ftz=]\t\t Flush-to-zero mode when performsing single-precision floating-point operations\n"); + fprintf(stdout, " \t\t\t 0 - default, preserve denormal values\n"); + fprintf(stdout, " \t\t\t 1 - flush denormal values to zero\n"); + fprintf(stdout, " [--prec-sqrt=]\t Precision mode for single-precision floating-point square root\n"); + fprintf(stdout, " \t\t\t 0 - use a faster approximation\n"); + fprintf(stdout, " \t\t\t 1 - default, use IEEE round-to-nearest mode\n"); + fprintf(stdout, " [--prec-div=]\t Precision mode for single-precision floating-point division and reciprocals\n"); + fprintf(stdout, " \t\t\t 0 - use a faster approximation\n"); + fprintf(stdout, " \t\t\t 1 - default, use IEEE round-to-nearest mode\n"); + fprintf(stdout, " [--fma=]\t\t FMA contraction mode \n"); + fprintf(stdout, " \t\t\t 0 - disable\n"); + fprintf(stdout, " \t\t\t 1 - default, enable\n"); + fprintf(stdout, " [--use_fast_math]\t Make use of fast maih. Implies --ftz=1 --prec-div=0 --prec-sqrt=0\n"); + fprintf(stdout, " \n"); + exit(ret); +} + +int main(int argc, char *argv[]) +{ + int _opt = 3; + int _ftz = 0; + int _precSqrt = 1; + int _precDiv = 1; + int _fma = 1; + bool _useFastMath = false; + bool _debug = false; + bool _verbose = false; + std::string _arch = "sm_35"; + std::string fileIR, filePTX; + + for (int i = 1; i < argc; ++i) + { + if (!strcmp(argv[i], "--help")) + lUsage(0); + else if (!strncmp(argv[i], "--arch=", 7)) + _arch = std::string(argv[i]+7); + else if (!strncmp(argv[i], "-g", 2)) + _debug = true; + else if (!strncmp(argv[i], "--verbose", 9)) + _verbose = true; + else if (!strncmp(argv[i], "--opt=", 6)) + _opt = atoi(argv[i]+6); + else if (!strncmp(argv[i], "--ftz=", 6)) + _ftz = atoi(argv[i]+6); + else if (!strncmp(argv[i], "--prec-sqrt=", 12)) + _precSqrt = atoi(argv[i]+12); + else if (!strncmp(argv[i], "--prec-div=", 11)) + _precDiv = atoi(argv[i]+11); + else if (!strncmp(argv[i], "--fma=", 6)) + _fma = atoi(argv[i]+6); + else if (!strncmp(argv[i], "--use_fast_math", 15)) + _useFastMath = true; + else if (!strcmp(argv[i], "-o")) + { + if (++i == argc) + { + fprintf(stderr, "No output file specified after -o option.\n"); + lUsage(1); + } + filePTX = std::string(argv[i]); + } + else + { + const char * ext = strrchr(argv[i], '.'); + if (ext == NULL) + { + fprintf(stderr, " Unknown argument: %s \n", argv[i]); + lUsage(1); + } + else if (strncmp(ext, ".ll", 3) && strncmp(ext, ".bc", 3)) + { + fprintf(stderr, " Unkown extension of the input file: %s \n", ext); + lUsage(1); + } + else if (filePTX.empty()) + { + fileIR = std::string(argv[i]); + if (filePTX.empty()) + { + char * baseName = argv[i]; + while (baseName != ext) + filePTX += std::string(baseName++,1); + } + filePTX += ".ptx"; + } + } + } + + if (fileIR.empty()) + { + fprintf(stderr, "ptxgen fatal : No input file specified; use option --help for more information\n"); + exit(1); + } + +#if 0 + fprintf(stderr, "fileIR= %s\n", fileIR.c_str()); + fprintf(stderr, "filePTX= %s\n", filePTX.c_str()); + fprintf(stderr, "arch= %s\n", _arch.c_str()); + fprintf(stderr, "debug= %s\n", _debug ? "true" : "false"); + fprintf(stderr, "verbose= %s\n", _verbose ? "true" : "false"); + fprintf(stderr, "opt= %d\n", _opt); + fprintf(stderr, "ftz= %d\n", _ftz); + fprintf(stderr, "prec-sqrt= %d\n", _precSqrt); + fprintf(stderr, "prec-div= %d\n", _precDiv); + fprintf(stderr, "fma= %d\n", _fma); + fprintf(stderr, "use_fast_math= %s\n", _useFastMath ? "true" : "false"); +#endif + + int computeArch = 35; + assert(_arch == std::string("sm_35")); + + if (_useFastMath) + { + _ftz = 1; + _precSqrt = _precDiv = 0; + } + + std::vector nvvmOptions; + nvvmOptions.push_back("-arch=compute_35"); + nvvmOptions.push_back("-ftz=" + lValueToString(_ftz)); + nvvmOptions.push_back("-prec-sqrt=" + lValueToString(_precSqrt)); + nvvmOptions.push_back("-prec-div=" + lValueToString(_precDiv)); + nvvmOptions.push_back("-fma=" + lValueToString(_fma)); + if (_debug) + nvvmOptions.push_back("-g"); + + std::vector nvvmFiles; + nvvmFiles.push_back(fileIR); + + std::ofstream outputPTX(filePTX.c_str()); + assert(outputPTX); + + const int ret = generatePTX(nvvmOptions, nvvmFiles, outputPTX, computeArch); + outputPTX.open(filePTX.c_str()); + return ret; +} + diff --git a/ptxtools/ptxgrammar.cpp b/ptxtools/ptxgrammar.cpp new file mode 100644 index 00000000..2419e654 --- /dev/null +++ b/ptxtools/ptxgrammar.cpp @@ -0,0 +1,1929 @@ + +/* A Bison parser, made by GNU Bison 2.4.1. */ + +/* Skeleton implementation for Bison's Yacc-like parsers in C + + Copyright (C) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004, 2005, 2006 + Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +/* As a special exception, you may create a larger work that contains + part or all of the Bison parser skeleton and distribute that work + under terms of your choice, so long as that work isn't itself a + parser generator using the skeleton or a modified version thereof + as a parser skeleton. Alternatively, if you modify or redistribute + the parser skeleton itself, you may (at your option) remove this + special exception, which will cause the skeleton and the resulting + Bison output files to be licensed under the GNU General Public + License without this special exception. + + This special exception was added by the Free Software Foundation in + version 2.2 of Bison. */ + +/* C LALR(1) parser skeleton written by Richard Stallman, by + simplifying the original so-called "semantic" parser. */ + +/* All symbols defined below should begin with yy or YY, to avoid + infringing on user name space. This should be done even for local + variables, as they might otherwise be expanded by user macros. + There are some unavoidable exceptions within include files to + define necessary library symbols; they are noted "INFRINGES ON + USER NAME SPACE" below. */ + +/* Identify Bison output. */ +#define YYBISON 1 + +/* Bison version. */ +#define YYBISON_VERSION "2.4.1" + +/* Skeleton name. */ +#define YYSKELETON_NAME "yacc.c" + +/* Pure parsers. */ +#define YYPURE 1 + +/* Push parsers. */ +#define YYPUSH 0 + +/* Pull parsers. */ +#define YYPULL 1 + +/* Using locations. */ +#define YYLSP_NEEDED 1 + + + +/* Copy the first part of user declarations. */ + +/* Line 189 of yacc.c */ +#line 3 "ptxgrammar.yy" + + #include + #include "PTXParser.h" + #include "PTXLexer.h" + #include + #include + #include + + #define YYERROR_VERBOSE 1 + + #ifdef REPORT_BASE + #undef REPORT_BASE + #endif + + #define REPORT_BASE 0 + + namespace ptx + { + + int yylex( YYSTYPE* token, YYLTYPE* location, parser::PTXLexer& lexer, + parser::PTXParser& state ); + void yyerror( YYLTYPE* location, parser::PTXLexer& lexer, + parser::PTXParser& state, char const* message ); + + std::string yyTypeToString( int ); + + + +/* Line 189 of yacc.c */ +#line 102 "ptxgrammar.cc" + +/* Enabling traces. */ +#ifndef YYDEBUG +# define YYDEBUG 1 +#endif + +/* Enabling verbose error messages. */ +#ifdef YYERROR_VERBOSE +# undef YYERROR_VERBOSE +# define YYERROR_VERBOSE 1 +#else +# define YYERROR_VERBOSE 0 +#endif + +/* Enabling the token table. */ +#ifndef YYTOKEN_TABLE +# define YYTOKEN_TABLE 0 +#endif + + +/* Tokens. */ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + /* Put the tokens into the symbol table, so that GDB and other debuggers + know about them. */ + enum yytokentype { + TOKEN_VERSION = 258, + TOKEN_TARGET = 259, + TOKEN_ADDRESS_SIZE = 260, + TOKEN_VISIBLE = 261, + TOKEN_FUNC = 262, + TOKEN_ENTRY = 263, + TOKEN_PARAM = 264, + TOKEN_ALIGN = 265, + TOKEN_GLOBAL = 266, + TOKEN_B8 = 267, + TOKEN_B16 = 268, + TOKEN_B32 = 269, + TOKEN_B64 = 270, + TOKEN_U8 = 271, + TOKEN_U16 = 272, + TOKEN_U32 = 273, + TOKEN_U64 = 274, + TOKEN_S8 = 275, + TOKEN_S16 = 276, + TOKEN_S32 = 277, + TOKEN_S64 = 278, + TOKEN_F32 = 279, + TOKEN_F64 = 280, + TOKEN_INT = 281, + TOKEN_FLOAT = 282, + TOKEN_STRING = 283 + }; +#endif + + + +#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED +typedef union YYSTYPE +{ + +/* Line 214 of yacc.c */ +#line 32 "ptxgrammar.yy" + + char svalue[1024]; + double fvalue; + int ivalue; + unsigned int uvalue; + + + +/* Line 214 of yacc.c */ +#line 175 "ptxgrammar.cc" +} YYSTYPE; +# define YYSTYPE_IS_TRIVIAL 1 +# define yystype YYSTYPE /* obsolescent; will be withdrawn */ +# define YYSTYPE_IS_DECLARED 1 +#endif + +#if ! defined YYLTYPE && ! defined YYLTYPE_IS_DECLARED +typedef struct YYLTYPE +{ + int first_line; + int first_column; + int last_line; + int last_column; +} YYLTYPE; +# define yyltype YYLTYPE /* obsolescent; will be withdrawn */ +# define YYLTYPE_IS_DECLARED 1 +# define YYLTYPE_IS_TRIVIAL 1 +#endif + + +/* Copy the second part of user declarations. */ + + +/* Line 264 of yacc.c */ +#line 200 "ptxgrammar.cc" + +#ifdef short +# undef short +#endif + +#ifdef YYTYPE_UINT8 +typedef YYTYPE_UINT8 yytype_uint8; +#else +typedef unsigned char yytype_uint8; +#endif + +#ifdef YYTYPE_INT8 +typedef YYTYPE_INT8 yytype_int8; +#elif (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +typedef signed char yytype_int8; +#else +typedef short int yytype_int8; +#endif + +#ifdef YYTYPE_UINT16 +typedef YYTYPE_UINT16 yytype_uint16; +#else +typedef unsigned short int yytype_uint16; +#endif + +#ifdef YYTYPE_INT16 +typedef YYTYPE_INT16 yytype_int16; +#else +typedef short int yytype_int16; +#endif + +#ifndef YYSIZE_T +# ifdef __SIZE_TYPE__ +# define YYSIZE_T __SIZE_TYPE__ +# elif defined size_t +# define YYSIZE_T size_t +# elif ! defined YYSIZE_T && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +# include /* INFRINGES ON USER NAME SPACE */ +# define YYSIZE_T size_t +# else +# define YYSIZE_T unsigned int +# endif +#endif + +#define YYSIZE_MAXIMUM ((YYSIZE_T) -1) + +#ifndef YY_ +# if YYENABLE_NLS +# if ENABLE_NLS +# include /* INFRINGES ON USER NAME SPACE */ +# define YY_(msgid) dgettext ("bison-runtime", msgid) +# endif +# endif +# ifndef YY_ +# define YY_(msgid) msgid +# endif +#endif + +/* Suppress unused-variable warnings by "using" E. */ +#if ! defined lint || defined __GNUC__ +# define YYUSE(e) ((void) (e)) +#else +# define YYUSE(e) /* empty */ +#endif + +/* Identity function, used to suppress warnings about constant conditions. */ +#ifndef lint +# define YYID(n) (n) +#else +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static int +YYID (int yyi) +#else +static int +YYID (yyi) + int yyi; +#endif +{ + return yyi; +} +#endif + +#if ! defined yyoverflow || YYERROR_VERBOSE + +/* The parser invokes alloca or malloc; define the necessary symbols. */ + +# ifdef YYSTACK_USE_ALLOCA +# if YYSTACK_USE_ALLOCA +# ifdef __GNUC__ +# define YYSTACK_ALLOC __builtin_alloca +# elif defined __BUILTIN_VA_ARG_INCR +# include /* INFRINGES ON USER NAME SPACE */ +# elif defined _AIX +# define YYSTACK_ALLOC __alloca +# elif defined _MSC_VER +# include /* INFRINGES ON USER NAME SPACE */ +# define alloca _alloca +# else +# define YYSTACK_ALLOC alloca +# if ! defined _ALLOCA_H && ! defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +# include /* INFRINGES ON USER NAME SPACE */ +# ifndef _STDLIB_H +# define _STDLIB_H 1 +# endif +# endif +# endif +# endif +# endif + +# ifdef YYSTACK_ALLOC + /* Pacify GCC's `empty if-body' warning. */ +# define YYSTACK_FREE(Ptr) do { /* empty */; } while (YYID (0)) +# ifndef YYSTACK_ALLOC_MAXIMUM + /* The OS might guarantee only one guard page at the bottom of the stack, + and a page size can be as small as 4096 bytes. So we cannot safely + invoke alloca (N) if N exceeds 4096. Use a slightly smaller number + to allow for a few compiler-allocated temporary stack slots. */ +# define YYSTACK_ALLOC_MAXIMUM 4032 /* reasonable circa 2006 */ +# endif +# else +# define YYSTACK_ALLOC YYMALLOC +# define YYSTACK_FREE YYFREE +# ifndef YYSTACK_ALLOC_MAXIMUM +# define YYSTACK_ALLOC_MAXIMUM YYSIZE_MAXIMUM +# endif +# if (defined __cplusplus && ! defined _STDLIB_H \ + && ! ((defined YYMALLOC || defined malloc) \ + && (defined YYFREE || defined free))) +# include /* INFRINGES ON USER NAME SPACE */ +# ifndef _STDLIB_H +# define _STDLIB_H 1 +# endif +# endif +# ifndef YYMALLOC +# define YYMALLOC malloc +# if ! defined malloc && ! defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +void *malloc (YYSIZE_T); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# ifndef YYFREE +# define YYFREE free +# if ! defined free && ! defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +void free (void *); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# endif +#endif /* ! defined yyoverflow || YYERROR_VERBOSE */ + + +#if (! defined yyoverflow \ + && (! defined __cplusplus \ + || (defined YYLTYPE_IS_TRIVIAL && YYLTYPE_IS_TRIVIAL \ + && defined YYSTYPE_IS_TRIVIAL && YYSTYPE_IS_TRIVIAL))) + +/* A type that is properly aligned for any stack member. */ +union yyalloc +{ + yytype_int16 yyss_alloc; + YYSTYPE yyvs_alloc; + YYLTYPE yyls_alloc; +}; + +/* The size of the maximum gap between one aligned stack and the next. */ +# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1) + +/* The size of an array large to enough to hold all stacks, each with + N elements. */ +# define YYSTACK_BYTES(N) \ + ((N) * (sizeof (yytype_int16) + sizeof (YYSTYPE) + sizeof (YYLTYPE)) \ + + 2 * YYSTACK_GAP_MAXIMUM) + +/* Copy COUNT objects from FROM to TO. The source and destination do + not overlap. */ +# ifndef YYCOPY +# if defined __GNUC__ && 1 < __GNUC__ +# define YYCOPY(To, From, Count) \ + __builtin_memcpy (To, From, (Count) * sizeof (*(From))) +# else +# define YYCOPY(To, From, Count) \ + do \ + { \ + YYSIZE_T yyi; \ + for (yyi = 0; yyi < (Count); yyi++) \ + (To)[yyi] = (From)[yyi]; \ + } \ + while (YYID (0)) +# endif +# endif + +/* Relocate STACK from its old location to the new one. The + local variables YYSIZE and YYSTACKSIZE give the old and new number of + elements in the stack, and YYPTR gives the new location of the + stack. Advance YYPTR to a properly aligned location for the next + stack. */ +# define YYSTACK_RELOCATE(Stack_alloc, Stack) \ + do \ + { \ + YYSIZE_T yynewbytes; \ + YYCOPY (&yyptr->Stack_alloc, Stack, yysize); \ + Stack = &yyptr->Stack_alloc; \ + yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \ + yyptr += yynewbytes / sizeof (*yyptr); \ + } \ + while (YYID (0)) + +#endif + +/* YYFINAL -- State number of the termination state. */ +#define YYFINAL 6 +/* YYLAST -- Last index in YYTABLE. */ +#define YYLAST 97 + +/* YYNTOKENS -- Number of terminals. */ +#define YYNTOKENS 34 +/* YYNNTS -- Number of nonterminals. */ +#define YYNNTS 28 +/* YYNRULES -- Number of rules. */ +#define YYNRULES 66 +/* YYNRULES -- Number of states. */ +#define YYNSTATES 92 + +/* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX. */ +#define YYUNDEFTOK 2 +#define YYMAXUTOK 283 + +#define YYTRANSLATE(YYX) \ + ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK) + +/* YYTRANSLATE[YYLEX] -- Bison symbol number corresponding to YYLEX. */ +static const yytype_uint8 yytranslate[] = +{ + 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 31, 32, 2, 2, 33, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 29, 2, 30, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28 +}; + +#if YYDEBUG +/* YYPRHS[YYN] -- Index of the first RHS symbol of rule number YYN in + YYRHS. */ +static const yytype_uint8 yyprhs[] = +{ + 0, 0, 3, 6, 10, 13, 16, 19, 21, 23, + 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, + 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, + 65, 67, 69, 71, 73, 75, 77, 80, 82, 85, + 87, 90, 92, 95, 97, 101, 102, 104, 106, 108, + 111, 113, 116, 121, 123, 125, 127, 128, 132, 136, + 141, 143, 145, 149, 151, 152, 158 +}; + +/* YYRHS -- A `-1'-separated list of the rules' RHS. */ +static const yytype_int8 yyrhs[] = +{ + 35, 0, -1, 36, 43, -1, 37, 38, 39, -1, + 3, 27, -1, 4, 28, -1, 5, 26, -1, 16, + -1, 17, -1, 18, -1, 19, -1, 20, -1, 21, + -1, 22, -1, 23, -1, 12, -1, 13, -1, 14, + -1, 15, -1, 24, -1, 25, -1, 40, -1, 10, + -1, 9, -1, 40, -1, 28, -1, 27, -1, 26, + -1, 7, -1, 8, -1, 11, -1, 29, -1, 30, + -1, 31, -1, 32, -1, 33, -1, 43, 60, -1, + 60, -1, 43, 55, -1, 55, -1, 43, 61, -1, + 61, -1, 43, 42, -1, 42, -1, 29, 26, 30, + -1, -1, 44, -1, 28, -1, 9, -1, 10, 26, + -1, 41, -1, 48, 41, -1, 47, 49, 46, 45, + -1, 31, -1, 32, -1, 50, -1, -1, 53, 33, + 50, -1, 51, 53, 52, -1, 6, 8, 46, 54, + -1, 31, -1, 32, -1, 56, 53, 57, -1, 58, + -1, -1, 6, 7, 59, 46, 54, -1, 6, 11, + 49, 46, 44, -1 +}; + +/* YYRLINE[YYN] -- source line where rule number YYN was defined. */ +static const yytype_uint8 yyrline[] = +{ + 0, 71, 71, 74, 81, 83, 85, 89, 89, 89, + 89, 90, 90, 90, 90, 91, 91, 91, 91, 92, + 92, 94, 97, 98, 99, 100, 100, 100, 101, 101, + 102, 103, 104, 105, 106, 107, 110, 110, 111, 111, + 112, 112, 113, 113, 117, 120, 121, 123, 124, 126, + 127, 128, 130, 136, 137, 138, 139, 140, 141, 143, + 148, 149, 150, 151, 151, 152, 158 +}; +#endif + +#if YYDEBUG || YYERROR_VERBOSE || YYTOKEN_TABLE +/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM. + First, the terminals, then, starting at YYNTOKENS, nonterminals. */ +static const char *const yytname[] = +{ + "$end", "error", "$undefined", "TOKEN_VERSION", "TOKEN_TARGET", + "TOKEN_ADDRESS_SIZE", "TOKEN_VISIBLE", "TOKEN_FUNC", "TOKEN_ENTRY", + "TOKEN_PARAM", "TOKEN_ALIGN", "TOKEN_GLOBAL", "TOKEN_B8", "TOKEN_B16", + "TOKEN_B32", "TOKEN_B64", "TOKEN_U8", "TOKEN_U16", "TOKEN_U32", + "TOKEN_U64", "TOKEN_S8", "TOKEN_S16", "TOKEN_S32", "TOKEN_S64", + "TOKEN_F32", "TOKEN_F64", "TOKEN_INT", "TOKEN_FLOAT", "TOKEN_STRING", + "'['", "']'", "'('", "')'", "','", "$accept", "ptxsource", "header", + "version", "target", "address_size", "dataTypeId", "dataType", + "anytoken", "ptxbody", "arrayDimensionSet", "arrayDimensions", + "identifier", "parameter", "alignment", "addressableVariablePrefix", + "argumentDeclaration", "argumentListBegin", "argumentListEnd", + "argumentListBody", "argumentList", "visibleEntryDeclaration", + "returnArgumentListBegin", "returnArgumentListEnd", "returnArgumentList", + "optionalReturnArgumentList", "visibleFunctionDeclaration", + "visibleInitializableDeclaration", 0 +}; +#endif + +# ifdef YYPRINT +/* YYTOKNUM[YYLEX-NUM] -- Internal token number corresponding to + token YYLEX-NUM. */ +static const yytype_uint16 yytoknum[] = +{ + 0, 256, 257, 258, 259, 260, 261, 262, 263, 264, + 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 282, 283, 91, + 93, 40, 41, 44 +}; +# endif + +/* YYR1[YYN] -- Symbol number of symbol that rule YYN derives. */ +static const yytype_uint8 yyr1[] = +{ + 0, 34, 35, 36, 37, 38, 39, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + 40, 41, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 43, 43, 43, 43, + 43, 43, 43, 43, 44, 45, 45, 46, 47, 48, + 49, 49, 50, 51, 52, 53, 53, 53, 54, 55, + 56, 57, 58, 59, 59, 60, 61 +}; + +/* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN. */ +static const yytype_uint8 yyr2[] = +{ + 0, 2, 2, 3, 2, 2, 2, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, + 2, 1, 2, 1, 3, 0, 1, 1, 1, 2, + 1, 2, 4, 1, 1, 1, 0, 3, 3, 4, + 1, 1, 3, 1, 0, 5, 5 +}; + +/* YYDEFACT[STATE-NAME] -- Default rule to reduce with in state + STATE-NUM when YYTABLE doesn't specify something else to do. Zero + means the default is an error. */ +static const yytype_uint8 yydefact[] = +{ + 0, 0, 0, 0, 0, 4, 1, 0, 28, 29, + 23, 22, 30, 15, 16, 17, 18, 7, 8, 9, + 10, 11, 12, 13, 14, 19, 20, 27, 26, 25, + 31, 32, 33, 34, 35, 24, 43, 2, 39, 37, + 41, 0, 0, 64, 0, 0, 42, 38, 36, 40, + 5, 0, 3, 60, 56, 63, 0, 47, 0, 0, + 21, 50, 0, 0, 6, 48, 0, 55, 0, 0, + 53, 56, 59, 49, 51, 0, 0, 61, 0, 62, + 65, 0, 0, 66, 45, 57, 54, 58, 0, 46, + 52, 44 +}; + +/* YYDEFGOTO[NTERM-NUM]. */ +static const yytype_int8 yydefgoto[] = +{ + -1, 2, 3, 4, 42, 52, 60, 61, 36, 37, + 83, 90, 58, 66, 62, 63, 67, 71, 87, 68, + 72, 38, 54, 79, 55, 56, 39, 40 +}; + +/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing + STATE-NUM. */ +#define YYPACT_NINF -28 +static const yytype_int8 yypact[] = +{ + 27, 6, 41, -5, 39, -28, -28, 24, -28, -28, + -28, -28, -28, -28, -28, -28, -28, -28, -28, -28, + -28, -28, -28, -28, -28, -28, -28, -28, -28, -28, + -28, -28, -28, -28, -28, -28, -28, -5, -28, -28, + -28, 14, 40, 13, 18, 38, -28, -28, -28, -28, + -28, 21, -28, -28, 69, -28, 18, -28, 48, 54, + -28, -28, 52, 18, -28, -28, 38, -28, 5, 48, + -28, 69, -28, -28, -28, 53, 18, -28, 69, -28, + -28, 7, 55, -28, 53, -28, -28, -28, 56, -28, + -28, -28 +}; + +/* YYPGOTO[NTERM-NUM]. */ +static const yytype_int8 yypgoto[] = +{ + -28, -28, -28, -28, -28, -28, -3, 22, 46, -28, + 1, -28, -27, -28, -28, 23, 9, -28, -28, 17, + 25, 58, -28, -28, -28, -28, 59, 60 +}; + +/* YYTABLE[YYPACT[STATE-NUM]]. What to do in state STATE-NUM. If + positive, shift that token. If negative, reduce the rule which + number is the opposite. If zero, do what YYDEFACT says. + If YYTABLE_NINF, syntax error. */ +#define YYTABLE_NINF -1 +static const yytype_uint8 yytable[] = +{ + 35, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, 34, 69, + 1, 43, 44, 5, 35, 45, 75, 77, 78, 86, + 78, 6, 50, 41, 53, 51, 57, 64, 59, 84, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 65, 70, + 73, 88, 82, 46, 74, 89, 91, 85, 81, 76, + 0, 0, 0, 0, 80, 47, 48, 49 +}; + +static const yytype_int8 yycheck[] = +{ + 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 56, + 3, 7, 8, 27, 37, 11, 63, 32, 33, 32, + 33, 0, 28, 4, 31, 5, 28, 26, 10, 76, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 9, 31, + 26, 26, 29, 37, 62, 84, 30, 78, 71, 66, + -1, -1, -1, -1, 69, 37, 37, 37 +}; + +/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing + symbol of state STATE-NUM. */ +static const yytype_uint8 yystos[] = +{ + 0, 3, 35, 36, 37, 27, 0, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32, 33, 40, 42, 43, 55, 60, + 61, 4, 38, 7, 8, 11, 42, 55, 60, 61, + 28, 5, 39, 31, 56, 58, 59, 28, 46, 10, + 40, 41, 48, 49, 26, 9, 47, 50, 53, 46, + 31, 51, 54, 26, 41, 46, 49, 32, 33, 57, + 54, 53, 29, 44, 46, 50, 32, 52, 26, 44, + 45, 30 +}; + +#define yyerrok (yyerrstatus = 0) +#define yyclearin (yychar = YYEMPTY) +#define YYEMPTY (-2) +#define YYEOF 0 + +#define YYACCEPT goto yyacceptlab +#define YYABORT goto yyabortlab +#define YYERROR goto yyerrorlab + + +/* Like YYERROR except do call yyerror. This remains here temporarily + to ease the transition to the new meaning of YYERROR, for GCC. + Once GCC version 2 has supplanted version 1, this can go. */ + +#define YYFAIL goto yyerrlab + +#define YYRECOVERING() (!!yyerrstatus) + +#define YYBACKUP(Token, Value) \ +do \ + if (yychar == YYEMPTY && yylen == 1) \ + { \ + yychar = (Token); \ + yylval = (Value); \ + yytoken = YYTRANSLATE (yychar); \ + YYPOPSTACK (1); \ + goto yybackup; \ + } \ + else \ + { \ + yyerror (&yylloc, lexer, state, YY_("syntax error: cannot back up")); \ + YYERROR; \ + } \ +while (YYID (0)) + + +#define YYTERROR 1 +#define YYERRCODE 256 + + +/* YYLLOC_DEFAULT -- Set CURRENT to span from RHS[1] to RHS[N]. + If N is 0, then set CURRENT to the empty location which ends + the previous symbol: RHS[0] (always defined). */ + +#define YYRHSLOC(Rhs, K) ((Rhs)[K]) +#ifndef YYLLOC_DEFAULT +# define YYLLOC_DEFAULT(Current, Rhs, N) \ + do \ + if (YYID (N)) \ + { \ + (Current).first_line = YYRHSLOC (Rhs, 1).first_line; \ + (Current).first_column = YYRHSLOC (Rhs, 1).first_column; \ + (Current).last_line = YYRHSLOC (Rhs, N).last_line; \ + (Current).last_column = YYRHSLOC (Rhs, N).last_column; \ + } \ + else \ + { \ + (Current).first_line = (Current).last_line = \ + YYRHSLOC (Rhs, 0).last_line; \ + (Current).first_column = (Current).last_column = \ + YYRHSLOC (Rhs, 0).last_column; \ + } \ + while (YYID (0)) +#endif + + +/* YY_LOCATION_PRINT -- Print the location on the stream. + This macro was not mandated originally: define only if we know + we won't break user code: when these are the locations we know. */ + +#ifndef YY_LOCATION_PRINT +# if YYLTYPE_IS_TRIVIAL +# define YY_LOCATION_PRINT(File, Loc) \ + fprintf (File, "%d.%d-%d.%d", \ + (Loc).first_line, (Loc).first_column, \ + (Loc).last_line, (Loc).last_column) +# else +# define YY_LOCATION_PRINT(File, Loc) ((void) 0) +# endif +#endif + + +/* YYLEX -- calling `yylex' with the right arguments. */ + +#ifdef YYLEX_PARAM +# define YYLEX yylex (&yylval, &yylloc, YYLEX_PARAM) +#else +# define YYLEX yylex (&yylval, &yylloc, lexer, state) +#endif + +/* Enable debugging if requested. */ +#if YYDEBUG + +# ifndef YYFPRINTF +# include /* INFRINGES ON USER NAME SPACE */ +# define YYFPRINTF fprintf +# endif + +# define YYDPRINTF(Args) \ +do { \ + if (yydebug) \ + YYFPRINTF Args; \ +} while (YYID (0)) + +# define YY_SYMBOL_PRINT(Title, Type, Value, Location) \ +do { \ + if (yydebug) \ + { \ + YYFPRINTF (stderr, "%s ", Title); \ + yy_symbol_print (stderr, \ + Type, Value, Location, lexer, state); \ + YYFPRINTF (stderr, "\n"); \ + } \ +} while (YYID (0)) + + +/*--------------------------------. +| Print this symbol on YYOUTPUT. | +`--------------------------------*/ + +/*ARGSUSED*/ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_symbol_value_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep, YYLTYPE const * const yylocationp, parser::PTXLexer& lexer, parser::PTXParser& state) +#else +static void +yy_symbol_value_print (yyoutput, yytype, yyvaluep, yylocationp, lexer, state) + FILE *yyoutput; + int yytype; + YYSTYPE const * const yyvaluep; + YYLTYPE const * const yylocationp; + parser::PTXLexer& lexer; + parser::PTXParser& state; +#endif +{ + if (!yyvaluep) + return; + YYUSE (yylocationp); + YYUSE (lexer); + YYUSE (state); +# ifdef YYPRINT + if (yytype < YYNTOKENS) + YYPRINT (yyoutput, yytoknum[yytype], *yyvaluep); +# else + YYUSE (yyoutput); +# endif + switch (yytype) + { + default: + break; + } +} + + +/*--------------------------------. +| Print this symbol on YYOUTPUT. | +`--------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_symbol_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep, YYLTYPE const * const yylocationp, parser::PTXLexer& lexer, parser::PTXParser& state) +#else +static void +yy_symbol_print (yyoutput, yytype, yyvaluep, yylocationp, lexer, state) + FILE *yyoutput; + int yytype; + YYSTYPE const * const yyvaluep; + YYLTYPE const * const yylocationp; + parser::PTXLexer& lexer; + parser::PTXParser& state; +#endif +{ + if (yytype < YYNTOKENS) + YYFPRINTF (yyoutput, "token %s (", yytname[yytype]); + else + YYFPRINTF (yyoutput, "nterm %s (", yytname[yytype]); + + YY_LOCATION_PRINT (yyoutput, *yylocationp); + YYFPRINTF (yyoutput, ": "); + yy_symbol_value_print (yyoutput, yytype, yyvaluep, yylocationp, lexer, state); + YYFPRINTF (yyoutput, ")"); +} + +/*------------------------------------------------------------------. +| yy_stack_print -- Print the state stack from its BOTTOM up to its | +| TOP (included). | +`------------------------------------------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_stack_print (yytype_int16 *yybottom, yytype_int16 *yytop) +#else +static void +yy_stack_print (yybottom, yytop) + yytype_int16 *yybottom; + yytype_int16 *yytop; +#endif +{ + YYFPRINTF (stderr, "Stack now"); + for (; yybottom <= yytop; yybottom++) + { + int yybot = *yybottom; + YYFPRINTF (stderr, " %d", yybot); + } + YYFPRINTF (stderr, "\n"); +} + +# define YY_STACK_PRINT(Bottom, Top) \ +do { \ + if (yydebug) \ + yy_stack_print ((Bottom), (Top)); \ +} while (YYID (0)) + + +/*------------------------------------------------. +| Report that the YYRULE is going to be reduced. | +`------------------------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_reduce_print (YYSTYPE *yyvsp, YYLTYPE *yylsp, int yyrule, parser::PTXLexer& lexer, parser::PTXParser& state) +#else +static void +yy_reduce_print (yyvsp, yylsp, yyrule, lexer, state) + YYSTYPE *yyvsp; + YYLTYPE *yylsp; + int yyrule; + parser::PTXLexer& lexer; + parser::PTXParser& state; +#endif +{ + int yynrhs = yyr2[yyrule]; + int yyi; + unsigned long int yylno = yyrline[yyrule]; + YYFPRINTF (stderr, "Reducing stack by rule %d (line %lu):\n", + yyrule - 1, yylno); + /* The symbols being reduced. */ + for (yyi = 0; yyi < yynrhs; yyi++) + { + YYFPRINTF (stderr, " $%d = ", yyi + 1); + yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi], + &(yyvsp[(yyi + 1) - (yynrhs)]) + , &(yylsp[(yyi + 1) - (yynrhs)]) , lexer, state); + YYFPRINTF (stderr, "\n"); + } +} + +# define YY_REDUCE_PRINT(Rule) \ +do { \ + if (yydebug) \ + yy_reduce_print (yyvsp, yylsp, Rule, lexer, state); \ +} while (YYID (0)) + +/* Nonzero means print parse trace. It is left uninitialized so that + multiple parsers can coexist. */ +int yydebug; +#else /* !YYDEBUG */ +# define YYDPRINTF(Args) +# define YY_SYMBOL_PRINT(Title, Type, Value, Location) +# define YY_STACK_PRINT(Bottom, Top) +# define YY_REDUCE_PRINT(Rule) +#endif /* !YYDEBUG */ + + +/* YYINITDEPTH -- initial size of the parser's stacks. */ +#ifndef YYINITDEPTH +# define YYINITDEPTH 200 +#endif + +/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only + if the built-in stack extension method is used). + + Do not make this value too large; the results are undefined if + YYSTACK_ALLOC_MAXIMUM < YYSTACK_BYTES (YYMAXDEPTH) + evaluated with infinite-precision integer arithmetic. */ + +#ifndef YYMAXDEPTH +# define YYMAXDEPTH 10000 +#endif + + + +#if YYERROR_VERBOSE + +# ifndef yystrlen +# if defined __GLIBC__ && defined _STRING_H +# define yystrlen strlen +# else +/* Return the length of YYSTR. */ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static YYSIZE_T +yystrlen (const char *yystr) +#else +static YYSIZE_T +yystrlen (yystr) + const char *yystr; +#endif +{ + YYSIZE_T yylen; + for (yylen = 0; yystr[yylen]; yylen++) + continue; + return yylen; +} +# endif +# endif + +# ifndef yystpcpy +# if defined __GLIBC__ && defined _STRING_H && defined _GNU_SOURCE +# define yystpcpy stpcpy +# else +/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in + YYDEST. */ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static char * +yystpcpy (char *yydest, const char *yysrc) +#else +static char * +yystpcpy (yydest, yysrc) + char *yydest; + const char *yysrc; +#endif +{ + char *yyd = yydest; + const char *yys = yysrc; + + while ((*yyd++ = *yys++) != '\0') + continue; + + return yyd - 1; +} +# endif +# endif + +# ifndef yytnamerr +/* Copy to YYRES the contents of YYSTR after stripping away unnecessary + quotes and backslashes, so that it's suitable for yyerror. The + heuristic is that double-quoting is unnecessary unless the string + contains an apostrophe, a comma, or backslash (other than + backslash-backslash). YYSTR is taken from yytname. If YYRES is + null, do not copy; instead, return the length of what the result + would have been. */ +static YYSIZE_T +yytnamerr (char *yyres, const char *yystr) +{ + if (*yystr == '"') + { + YYSIZE_T yyn = 0; + char const *yyp = yystr; + + for (;;) + switch (*++yyp) + { + case '\'': + case ',': + goto do_not_strip_quotes; + + case '\\': + if (*++yyp != '\\') + goto do_not_strip_quotes; + /* Fall through. */ + default: + if (yyres) + yyres[yyn] = *yyp; + yyn++; + break; + + case '"': + if (yyres) + yyres[yyn] = '\0'; + return yyn; + } + do_not_strip_quotes: ; + } + + if (! yyres) + return yystrlen (yystr); + + return yystpcpy (yyres, yystr) - yyres; +} +# endif + +/* Copy into YYRESULT an error message about the unexpected token + YYCHAR while in state YYSTATE. Return the number of bytes copied, + including the terminating null byte. If YYRESULT is null, do not + copy anything; just return the number of bytes that would be + copied. As a special case, return 0 if an ordinary "syntax error" + message will do. Return YYSIZE_MAXIMUM if overflow occurs during + size calculation. */ +static YYSIZE_T +yysyntax_error (char *yyresult, int yystate, int yychar) +{ + int yyn = yypact[yystate]; + + if (! (YYPACT_NINF < yyn && yyn <= YYLAST)) + return 0; + else + { + int yytype = YYTRANSLATE (yychar); + YYSIZE_T yysize0 = yytnamerr (0, yytname[yytype]); + YYSIZE_T yysize = yysize0; + YYSIZE_T yysize1; + int yysize_overflow = 0; + enum { YYERROR_VERBOSE_ARGS_MAXIMUM = 5 }; + char const *yyarg[YYERROR_VERBOSE_ARGS_MAXIMUM]; + int yyx; + +# if 0 + /* This is so xgettext sees the translatable formats that are + constructed on the fly. */ + YY_("syntax error, unexpected %s"); + YY_("syntax error, unexpected %s, expecting %s"); + YY_("syntax error, unexpected %s, expecting %s or %s"); + YY_("syntax error, unexpected %s, expecting %s or %s or %s"); + YY_("syntax error, unexpected %s, expecting %s or %s or %s or %s"); +# endif + char *yyfmt; + char const *yyf; + static char const yyunexpected[] = "syntax error, unexpected %s"; + static char const yyexpecting[] = ", expecting %s"; + static char const yyor[] = " or %s"; + char yyformat[sizeof yyunexpected + + sizeof yyexpecting - 1 + + ((YYERROR_VERBOSE_ARGS_MAXIMUM - 2) + * (sizeof yyor - 1))]; + char const *yyprefix = yyexpecting; + + /* Start YYX at -YYN if negative to avoid negative indexes in + YYCHECK. */ + int yyxbegin = yyn < 0 ? -yyn : 0; + + /* Stay within bounds of both yycheck and yytname. */ + int yychecklim = YYLAST - yyn + 1; + int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS; + int yycount = 1; + + yyarg[0] = yytname[yytype]; + yyfmt = yystpcpy (yyformat, yyunexpected); + + for (yyx = yyxbegin; yyx < yyxend; ++yyx) + if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR) + { + if (yycount == YYERROR_VERBOSE_ARGS_MAXIMUM) + { + yycount = 1; + yysize = yysize0; + yyformat[sizeof yyunexpected - 1] = '\0'; + break; + } + yyarg[yycount++] = yytname[yyx]; + yysize1 = yysize + yytnamerr (0, yytname[yyx]); + yysize_overflow |= (yysize1 < yysize); + yysize = yysize1; + yyfmt = yystpcpy (yyfmt, yyprefix); + yyprefix = yyor; + } + + yyf = YY_(yyformat); + yysize1 = yysize + yystrlen (yyf); + yysize_overflow |= (yysize1 < yysize); + yysize = yysize1; + + if (yysize_overflow) + return YYSIZE_MAXIMUM; + + if (yyresult) + { + /* Avoid sprintf, as that infringes on the user's name space. + Don't have undefined behavior even if the translation + produced a string with the wrong number of "%s"s. */ + char *yyp = yyresult; + int yyi = 0; + while ((*yyp = *yyf) != '\0') + { + if (*yyp == '%' && yyf[1] == 's' && yyi < yycount) + { + yyp += yytnamerr (yyp, yyarg[yyi++]); + yyf += 2; + } + else + { + yyp++; + yyf++; + } + } + } + return yysize; + } +} +#endif /* YYERROR_VERBOSE */ + + +/*-----------------------------------------------. +| Release the memory associated to this symbol. | +`-----------------------------------------------*/ + +/*ARGSUSED*/ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep, YYLTYPE *yylocationp, parser::PTXLexer& lexer, parser::PTXParser& state) +#else +static void +yydestruct (yymsg, yytype, yyvaluep, yylocationp, lexer, state) + const char *yymsg; + int yytype; + YYSTYPE *yyvaluep; + YYLTYPE *yylocationp; + parser::PTXLexer& lexer; + parser::PTXParser& state; +#endif +{ + YYUSE (yyvaluep); + YYUSE (yylocationp); + YYUSE (lexer); + YYUSE (state); + + if (!yymsg) + yymsg = "Deleting"; + YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp); + + switch (yytype) + { + + default: + break; + } +} + +/* Prevent warnings from -Wmissing-prototypes. */ +#ifdef YYPARSE_PARAM +#if defined __STDC__ || defined __cplusplus +int yyparse (void *YYPARSE_PARAM); +#else +int yyparse (); +#endif +#else /* ! YYPARSE_PARAM */ +#if defined __STDC__ || defined __cplusplus +int yyparse (parser::PTXLexer& lexer, parser::PTXParser& state); +#else +int yyparse (); +#endif +#endif /* ! YYPARSE_PARAM */ + + + + + +/*-------------------------. +| yyparse or yypush_parse. | +`-------------------------*/ + +#ifdef YYPARSE_PARAM +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +int +yyparse (void *YYPARSE_PARAM) +#else +int +yyparse (YYPARSE_PARAM) + void *YYPARSE_PARAM; +#endif +#else /* ! YYPARSE_PARAM */ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +int +yyparse (parser::PTXLexer& lexer, parser::PTXParser& state) +#else +int +yyparse (lexer, state) + parser::PTXLexer& lexer; + parser::PTXParser& state; +#endif +#endif +{ +/* The lookahead symbol. */ +int yychar; + +/* The semantic value of the lookahead symbol. */ +YYSTYPE yylval; + +/* Location data for the lookahead symbol. */ +YYLTYPE yylloc; + + /* Number of syntax errors so far. */ + int yynerrs; + + int yystate; + /* Number of tokens to shift before error messages enabled. */ + int yyerrstatus; + + /* The stacks and their tools: + `yyss': related to states. + `yyvs': related to semantic values. + `yyls': related to locations. + + Refer to the stacks thru separate pointers, to allow yyoverflow + to reallocate them elsewhere. */ + + /* The state stack. */ + yytype_int16 yyssa[YYINITDEPTH]; + yytype_int16 *yyss; + yytype_int16 *yyssp; + + /* The semantic value stack. */ + YYSTYPE yyvsa[YYINITDEPTH]; + YYSTYPE *yyvs; + YYSTYPE *yyvsp; + + /* The location stack. */ + YYLTYPE yylsa[YYINITDEPTH]; + YYLTYPE *yyls; + YYLTYPE *yylsp; + + /* The locations where the error started and ended. */ + YYLTYPE yyerror_range[2]; + + YYSIZE_T yystacksize; + + int yyn; + int yyresult; + /* Lookahead token as an internal (translated) token number. */ + int yytoken; + /* The variables used to return semantic value and location from the + action routines. */ + YYSTYPE yyval; + YYLTYPE yyloc; + +#if YYERROR_VERBOSE + /* Buffer for error messages, and its allocated size. */ + char yymsgbuf[128]; + char *yymsg = yymsgbuf; + YYSIZE_T yymsg_alloc = sizeof yymsgbuf; +#endif + +#define YYPOPSTACK(N) (yyvsp -= (N), yyssp -= (N), yylsp -= (N)) + + /* The number of symbols on the RHS of the reduced rule. + Keep to zero when no symbol should be popped. */ + int yylen = 0; + + yytoken = 0; + yyss = yyssa; + yyvs = yyvsa; + yyls = yylsa; + yystacksize = YYINITDEPTH; + + YYDPRINTF ((stderr, "Starting parse\n")); + + yystate = 0; + yyerrstatus = 0; + yynerrs = 0; + yychar = YYEMPTY; /* Cause a token to be read. */ + + /* Initialize stack pointers. + Waste one element of value and location stack + so that they stay on the same level as the state stack. + The wasted elements are never initialized. */ + yyssp = yyss; + yyvsp = yyvs; + yylsp = yyls; + +#if YYLTYPE_IS_TRIVIAL + /* Initialize the default location before parsing starts. */ + yylloc.first_line = yylloc.last_line = 1; + yylloc.first_column = yylloc.last_column = 1; +#endif + + goto yysetstate; + +/*------------------------------------------------------------. +| yynewstate -- Push a new state, which is found in yystate. | +`------------------------------------------------------------*/ + yynewstate: + /* In all cases, when you get here, the value and location stacks + have just been pushed. So pushing a state here evens the stacks. */ + yyssp++; + + yysetstate: + *yyssp = yystate; + + if (yyss + yystacksize - 1 <= yyssp) + { + /* Get the current used size of the three stacks, in elements. */ + YYSIZE_T yysize = yyssp - yyss + 1; + +#ifdef yyoverflow + { + /* Give user a chance to reallocate the stack. Use copies of + these so that the &'s don't force the real ones into + memory. */ + YYSTYPE *yyvs1 = yyvs; + yytype_int16 *yyss1 = yyss; + YYLTYPE *yyls1 = yyls; + + /* Each stack pointer address is followed by the size of the + data in use in that stack, in bytes. This used to be a + conditional around just the two extra args, but that might + be undefined if yyoverflow is a macro. */ + yyoverflow (YY_("memory exhausted"), + &yyss1, yysize * sizeof (*yyssp), + &yyvs1, yysize * sizeof (*yyvsp), + &yyls1, yysize * sizeof (*yylsp), + &yystacksize); + + yyls = yyls1; + yyss = yyss1; + yyvs = yyvs1; + } +#else /* no yyoverflow */ +# ifndef YYSTACK_RELOCATE + goto yyexhaustedlab; +# else + /* Extend the stack our own way. */ + if (YYMAXDEPTH <= yystacksize) + goto yyexhaustedlab; + yystacksize *= 2; + if (YYMAXDEPTH < yystacksize) + yystacksize = YYMAXDEPTH; + + { + yytype_int16 *yyss1 = yyss; + union yyalloc *yyptr = + (union yyalloc *) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize)); + if (! yyptr) + goto yyexhaustedlab; + YYSTACK_RELOCATE (yyss_alloc, yyss); + YYSTACK_RELOCATE (yyvs_alloc, yyvs); + YYSTACK_RELOCATE (yyls_alloc, yyls); +# undef YYSTACK_RELOCATE + if (yyss1 != yyssa) + YYSTACK_FREE (yyss1); + } +# endif +#endif /* no yyoverflow */ + + yyssp = yyss + yysize - 1; + yyvsp = yyvs + yysize - 1; + yylsp = yyls + yysize - 1; + + YYDPRINTF ((stderr, "Stack size increased to %lu\n", + (unsigned long int) yystacksize)); + + if (yyss + yystacksize - 1 <= yyssp) + YYABORT; + } + + YYDPRINTF ((stderr, "Entering state %d\n", yystate)); + + if (yystate == YYFINAL) + YYACCEPT; + + goto yybackup; + +/*-----------. +| yybackup. | +`-----------*/ +yybackup: + + /* Do appropriate processing given the current state. Read a + lookahead token if we need one and don't already have one. */ + + /* First try to decide what to do without reference to lookahead token. */ + yyn = yypact[yystate]; + if (yyn == YYPACT_NINF) + goto yydefault; + + /* Not known => get a lookahead token if don't already have one. */ + + /* YYCHAR is either YYEMPTY or YYEOF or a valid lookahead symbol. */ + if (yychar == YYEMPTY) + { + YYDPRINTF ((stderr, "Reading a token: ")); + yychar = YYLEX; + } + + if (yychar <= YYEOF) + { + yychar = yytoken = YYEOF; + YYDPRINTF ((stderr, "Now at end of input.\n")); + } + else + { + yytoken = YYTRANSLATE (yychar); + YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc); + } + + /* If the proper action on seeing token YYTOKEN is to reduce or to + detect an error, take that action. */ + yyn += yytoken; + if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken) + goto yydefault; + yyn = yytable[yyn]; + if (yyn <= 0) + { + if (yyn == 0 || yyn == YYTABLE_NINF) + goto yyerrlab; + yyn = -yyn; + goto yyreduce; + } + + /* Count tokens shifted since error; after three, turn off error + status. */ + if (yyerrstatus) + yyerrstatus--; + + /* Shift the lookahead token. */ + YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc); + + /* Discard the shifted token. */ + yychar = YYEMPTY; + + yystate = yyn; + *++yyvsp = yylval; + *++yylsp = yylloc; + goto yynewstate; + + +/*-----------------------------------------------------------. +| yydefault -- do the default action for the current state. | +`-----------------------------------------------------------*/ +yydefault: + yyn = yydefact[yystate]; + if (yyn == 0) + goto yyerrlab; + goto yyreduce; + + +/*-----------------------------. +| yyreduce -- Do a reduction. | +`-----------------------------*/ +yyreduce: + /* yyn is the number of a rule to reduce with. */ + yylen = yyr2[yyn]; + + /* If YYLEN is nonzero, implement the default value of the action: + `$$ = $1'. + + Otherwise, the following line sets YYVAL to garbage. + This behavior is undocumented and Bison + users should not rely upon it. Assigning to YYVAL + unconditionally makes the parser a bit smaller, and it avoids a + GCC warning that YYVAL may be used uninitialized. */ + yyval = yyvsp[1-yylen]; + + /* Default location. */ + YYLLOC_DEFAULT (yyloc, (yylsp - yylen), yylen); + YY_REDUCE_PRINT (yyn); + switch (yyn) + { + case 3: + +/* Line 1455 of yacc.c */ +#line 75 "ptxgrammar.yy" + { + std::cerr << "Done reading PTX \n" << std::endl; + state.printHeader(); +;} + break; + + case 4: + +/* Line 1455 of yacc.c */ +#line 81 "ptxgrammar.yy" + { std::cerr << "Reading PTX version " << (yyvsp[(2) - (2)].fvalue) << std::endl; ;} + break; + + case 5: + +/* Line 1455 of yacc.c */ +#line 83 "ptxgrammar.yy" + { std::cerr << "Target " << (yyvsp[(2) - (2)].svalue) << std::endl; ;} + break; + + case 6: + +/* Line 1455 of yacc.c */ +#line 85 "ptxgrammar.yy" + { std::cerr << "Address_Size " << (yyvsp[(2) - (2)].ivalue) << std::endl; ;} + break; + + case 21: + +/* Line 1455 of yacc.c */ +#line 94 "ptxgrammar.yy" + { state.dataTypeId((yyvsp[(1) - (1)].ivalue)); ;} + break; + + case 44: + +/* Line 1455 of yacc.c */ +#line 117 "ptxgrammar.yy" + { (yyval.ivalue) = (yyvsp[(2) - (3)].ivalue); state.arrayDimensions((yyvsp[(2) - (3)].ivalue)); ;} + break; + + case 47: + +/* Line 1455 of yacc.c */ +#line 123 "ptxgrammar.yy" + { strcpy((yyval.svalue), (yyvsp[(1) - (1)].svalue)); state.identifier((yyvsp[(1) - (1)].svalue)); ;} + break; + + case 49: + +/* Line 1455 of yacc.c */ +#line 126 "ptxgrammar.yy" + {(yyval.ivalue) = (yyvsp[(2) - (2)].ivalue); state.alignment((yyvsp[(2) - (2)].ivalue));;} + break; + + case 50: + +/* Line 1455 of yacc.c */ +#line 127 "ptxgrammar.yy" + { state.alignment(0); ;} + break; + + case 52: + +/* Line 1455 of yacc.c */ +#line 131 "ptxgrammar.yy" + { + state.argumentDeclaration((yylsp[(1) - (4)])); +;} + break; + + case 53: + +/* Line 1455 of yacc.c */ +#line 136 "ptxgrammar.yy" + { state.argumentListBegin((yylsp[(1) - (1)])); ;} + break; + + case 54: + +/* Line 1455 of yacc.c */ +#line 137 "ptxgrammar.yy" + {state.argumentListEnd((yylsp[(1) - (1)])); ;} + break; + + case 59: + +/* Line 1455 of yacc.c */ +#line 144 "ptxgrammar.yy" + { + state.visibleEntryDeclaration((yyvsp[(3) - (4)].svalue), (yylsp[(1) - (4)])); +;} + break; + + case 60: + +/* Line 1455 of yacc.c */ +#line 148 "ptxgrammar.yy" + { state.returnArgumentListBegin((yylsp[(1) - (1)])); ;} + break; + + case 61: + +/* Line 1455 of yacc.c */ +#line 149 "ptxgrammar.yy" + {state.returnArgumentListEnd((yylsp[(1) - (1)])); ;} + break; + + case 65: + +/* Line 1455 of yacc.c */ +#line 153 "ptxgrammar.yy" + { + state.visibleFunctionDeclaration((yyvsp[(4) - (5)].svalue), (yylsp[(1) - (5)])); +;} + break; + + case 66: + +/* Line 1455 of yacc.c */ +#line 159 "ptxgrammar.yy" + { + state.visibleInitializableDeclaration((yyvsp[(4) - (5)].svalue),(yylsp[(1) - (5)])); +;} + break; + + + +/* Line 1455 of yacc.c */ +#line 1664 "ptxgrammar.cc" + default: break; + } + YY_SYMBOL_PRINT ("-> $$ =", yyr1[yyn], &yyval, &yyloc); + + YYPOPSTACK (yylen); + yylen = 0; + YY_STACK_PRINT (yyss, yyssp); + + *++yyvsp = yyval; + *++yylsp = yyloc; + + /* Now `shift' the result of the reduction. Determine what state + that goes to, based on the state we popped back to and the rule + number reduced by. */ + + yyn = yyr1[yyn]; + + yystate = yypgoto[yyn - YYNTOKENS] + *yyssp; + if (0 <= yystate && yystate <= YYLAST && yycheck[yystate] == *yyssp) + yystate = yytable[yystate]; + else + yystate = yydefgoto[yyn - YYNTOKENS]; + + goto yynewstate; + + +/*------------------------------------. +| yyerrlab -- here on detecting error | +`------------------------------------*/ +yyerrlab: + /* If not already recovering from an error, report this error. */ + if (!yyerrstatus) + { + ++yynerrs; +#if ! YYERROR_VERBOSE + yyerror (&yylloc, lexer, state, YY_("syntax error")); +#else + { + YYSIZE_T yysize = yysyntax_error (0, yystate, yychar); + if (yymsg_alloc < yysize && yymsg_alloc < YYSTACK_ALLOC_MAXIMUM) + { + YYSIZE_T yyalloc = 2 * yysize; + if (! (yysize <= yyalloc && yyalloc <= YYSTACK_ALLOC_MAXIMUM)) + yyalloc = YYSTACK_ALLOC_MAXIMUM; + if (yymsg != yymsgbuf) + YYSTACK_FREE (yymsg); + yymsg = (char *) YYSTACK_ALLOC (yyalloc); + if (yymsg) + yymsg_alloc = yyalloc; + else + { + yymsg = yymsgbuf; + yymsg_alloc = sizeof yymsgbuf; + } + } + + if (0 < yysize && yysize <= yymsg_alloc) + { + (void) yysyntax_error (yymsg, yystate, yychar); + yyerror (&yylloc, lexer, state, yymsg); + } + else + { + yyerror (&yylloc, lexer, state, YY_("syntax error")); + if (yysize != 0) + goto yyexhaustedlab; + } + } +#endif + } + + yyerror_range[0] = yylloc; + + if (yyerrstatus == 3) + { + /* If just tried and failed to reuse lookahead token after an + error, discard it. */ + + if (yychar <= YYEOF) + { + /* Return failure if at end of input. */ + if (yychar == YYEOF) + YYABORT; + } + else + { + yydestruct ("Error: discarding", + yytoken, &yylval, &yylloc, lexer, state); + yychar = YYEMPTY; + } + } + + /* Else will try to reuse lookahead token after shifting the error + token. */ + goto yyerrlab1; + + +/*---------------------------------------------------. +| yyerrorlab -- error raised explicitly by YYERROR. | +`---------------------------------------------------*/ +yyerrorlab: + + /* Pacify compilers like GCC when the user code never invokes + YYERROR and the label yyerrorlab therefore never appears in user + code. */ + if (/*CONSTCOND*/ 0) + goto yyerrorlab; + + yyerror_range[0] = yylsp[1-yylen]; + /* Do not reclaim the symbols of the rule which action triggered + this YYERROR. */ + YYPOPSTACK (yylen); + yylen = 0; + YY_STACK_PRINT (yyss, yyssp); + yystate = *yyssp; + goto yyerrlab1; + + +/*-------------------------------------------------------------. +| yyerrlab1 -- common code for both syntax error and YYERROR. | +`-------------------------------------------------------------*/ +yyerrlab1: + yyerrstatus = 3; /* Each real token shifted decrements this. */ + + for (;;) + { + yyn = yypact[yystate]; + if (yyn != YYPACT_NINF) + { + yyn += YYTERROR; + if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR) + { + yyn = yytable[yyn]; + if (0 < yyn) + break; + } + } + + /* Pop the current state because it cannot handle the error token. */ + if (yyssp == yyss) + YYABORT; + + yyerror_range[0] = *yylsp; + yydestruct ("Error: popping", + yystos[yystate], yyvsp, yylsp, lexer, state); + YYPOPSTACK (1); + yystate = *yyssp; + YY_STACK_PRINT (yyss, yyssp); + } + + *++yyvsp = yylval; + + yyerror_range[1] = yylloc; + /* Using YYLLOC is tempting, but would change the location of + the lookahead. YYLOC is available though. */ + YYLLOC_DEFAULT (yyloc, (yyerror_range - 1), 2); + *++yylsp = yyloc; + + /* Shift the error token. */ + YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp); + + yystate = yyn; + goto yynewstate; + + +/*-------------------------------------. +| yyacceptlab -- YYACCEPT comes here. | +`-------------------------------------*/ +yyacceptlab: + yyresult = 0; + goto yyreturn; + +/*-----------------------------------. +| yyabortlab -- YYABORT comes here. | +`-----------------------------------*/ +yyabortlab: + yyresult = 1; + goto yyreturn; + +#if !defined(yyoverflow) || YYERROR_VERBOSE +/*-------------------------------------------------. +| yyexhaustedlab -- memory exhaustion comes here. | +`-------------------------------------------------*/ +yyexhaustedlab: + yyerror (&yylloc, lexer, state, YY_("memory exhausted")); + yyresult = 2; + /* Fall through. */ +#endif + +yyreturn: + if (yychar != YYEMPTY) + yydestruct ("Cleanup: discarding lookahead", + yytoken, &yylval, &yylloc, lexer, state); + /* Do not reclaim the symbols of the rule which action triggered + this YYABORT or YYACCEPT. */ + YYPOPSTACK (yylen); + YY_STACK_PRINT (yyss, yyssp); + while (yyssp != yyss) + { + yydestruct ("Cleanup: popping", + yystos[*yyssp], yyvsp, yylsp, lexer, state); + YYPOPSTACK (1); + } +#ifndef yyoverflow + if (yyss != yyssa) + YYSTACK_FREE (yyss); +#endif +#if YYERROR_VERBOSE + if (yymsg != yymsgbuf) + YYSTACK_FREE (yymsg); +#endif + /* Make sure YYID is used. */ + return YYID (yyresult); +} + + + +/* Line 1675 of yacc.c */ +#line 164 "ptxgrammar.yy" + + +int yylex( YYSTYPE* token, YYLTYPE* location, parser::PTXLexer& lexer, + parser::PTXParser& state ) +{ + lexer.yylval = token; + + int tokenValue = lexer.yylexPosition(); + location->first_line = lexer.lineno(); + location->first_column = lexer.column; + +#if 0 + report( " Lexer (" << location->first_line << "," + << location->first_column + << "): " << parser::PTXLexer::toString( tokenValue ) << " \"" + << lexer.YYText() << "\""); +#endif + + return tokenValue; +} + +static std::string toString( YYLTYPE& location, parser::PTXParser& state ) +{ + std::stringstream stream; + stream +#if 0 + << state.fileName +#else + << "ptx " +#endif + << " (" << location.first_line << ", " + << location.first_column << "): "; + return stream.str(); +} + +void yyerror( YYLTYPE* location, parser::PTXLexer& lexer, + parser::PTXParser& state, char const* message ) +{ + std::stringstream stream; + stream << toString( *location, state ) + << " " << message; + fprintf(stderr, "--ERROR-- %s %s \n", toString(*location, state).c_str(), message); + assert(0); +} + +} + diff --git a/ptxtools/ptxgrammar.yy b/ptxtools/ptxgrammar.yy new file mode 100644 index 00000000..482340af --- /dev/null +++ b/ptxtools/ptxgrammar.yy @@ -0,0 +1,215 @@ +%locations + +%{ + #include + #include "PTXParser.h" + #include "PTXLexer.h" + #include + #include + #include + #include + + #define YYERROR_VERBOSE 1 + + #ifdef REPORT_BASE + #undef REPORT_BASE + #endif + + #define REPORT_BASE 0 + + namespace ptx + { + + int yylex( YYSTYPE* token, YYLTYPE* location, parser::PTXLexer& lexer, + parser::PTXParser& state ); + void yyerror( YYLTYPE* location, parser::PTXLexer& lexer, + parser::PTXParser& state, char const* message ); + + std::string yyTypeToString( int ); + +%} + +%union +{ + char svalue[1024]; + double fvalue; + int ivalue; + unsigned int uvalue; +} + +%parse-param {parser::PTXLexer& lexer} +%parse-param {parser::PTXParser& state} +%lex-param {parser::PTXLexer& lexer} +%lex-param {parser::PTXParser& state} +%pure-parser + +// define the constant-string tokens: +%token TOKEN_VERSION TOKEN_TARGET TOKEN_ADDRESS_SIZE +%token TOKEN_VISIBLE TOKEN_FUNC TOKEN_ENTRY +%token TOKEN_PARAM TOKEN_ALIGN +%token TOKEN_GLOBAL +%token TOKEN_B8 TOKEN_B16 TOKEN_B32 TOKEN_B64 +%token TOKEN_U8 TOKEN_U16 TOKEN_U32 TOKEN_U64 +%token TOKEN_S8 TOKEN_S16 TOKEN_S32 TOKEN_S64 +%token TOKEN_F32 TOKEN_F64 + +// define the "terminal symbol" token types I'm going to use (in CAPS +// by convention), and associate each with a field of the union: +%token TOKEN_INT +%token TOKEN_FLOAT +%token TOKEN_STRING + +%type identifier +%type arrayDimensionSet +%type alignment + +%start ptxsource + +%% +// the first rule defined is the highest-level rule, which in our +// case is just the concept of a whole "snazzle file": +ptxsource: + header ptxbody; + +header: + version target address_size +{ +// std::cerr << "Done reading PTX \n" << std::endl; + state.printHeader(); +}; + +version: + TOKEN_VERSION TOKEN_FLOAT { assert($2 >= 3.0); } ;//std::cerr << "Reading PTX version " << $2 << std::endl; }; +target: + TOKEN_TARGET TOKEN_STRING { assert(std::string($2) == std::string("sm_35")); } //std::cerr << "Target " << $2 << std::endl; }; +address_size: + TOKEN_ADDRESS_SIZE TOKEN_INT { assert($2 == 64); } //std::cerr << "Address_Size " << $2 << std::endl; }; + + +dataTypeId : + TOKEN_U8 | TOKEN_U16 | TOKEN_U32 | TOKEN_U64 + | TOKEN_S8 | TOKEN_S16 | TOKEN_S32 | TOKEN_S64 + | TOKEN_B8 | TOKEN_B16 | TOKEN_B32 | TOKEN_B64 + | TOKEN_F32 | TOKEN_F64; + +dataType: dataTypeId { state.dataTypeId($1); } + +anytoken: + TOKEN_ALIGN +| TOKEN_PARAM +| dataTypeId +| TOKEN_STRING | TOKEN_FLOAT | TOKEN_INT +| TOKEN_FUNC | TOKEN_ENTRY +| TOKEN_GLOBAL +| '[' +| ']' +| '(' +| ')' +| ',' +| ';' +| '=' +; + +ptxbody: + ptxbody visibleFunctionDeclaration | visibleFunctionDeclaration + | ptxbody visibleEntryDeclaration| visibleEntryDeclaration + | ptxbody visibleInitializableDeclaration| visibleInitializableDeclaration + | ptxbody anytoken | anytoken; + + + +arrayDimensionSet : '[' TOKEN_INT ']' { $$ = $2; state.arrayDimensions($2); } +// arrayDimensionSet : arrayDimensionSet '[' TOKEN_INT ']' { $$ = $2; } +// arrayDimensionSet : '[' ']' { $$ = 0; } +arrayDimensions : /* empty string */; +arrayDimensions : arrayDimensionSet; + +identifier: TOKEN_STRING { strcpy($$, $1); state.identifier($1); } +parameter : TOKEN_PARAM; + +alignment : TOKEN_ALIGN TOKEN_INT {$$ = $2; state.alignment($2);} +addressableVariablePrefix : dataType { state.alignment(0); } +addressableVariablePrefix : alignment dataType; + +argumentDeclaration : parameter addressableVariablePrefix identifier arrayDimensions +{ + state.argumentDeclaration(@1); +} + + +argumentListBegin : '(' { state.argumentListBegin(@1); }; +argumentListEnd : ')' {state.argumentListEnd(@1); }; +argumentListBody : argumentDeclaration; +argumentListBody : /* empty string */; +argumentListBody : argumentListBody ',' argumentDeclaration; +argumentList: argumentListBegin argumentListBody argumentListEnd; + +visibleEntryDeclaration: TOKEN_VISIBLE TOKEN_ENTRY identifier argumentList +{ + state.visibleEntryDeclaration($3, @1); +}; + +returnArgumentListBegin : '(' { state.returnArgumentListBegin(@1); } +returnArgumentListEnd : ')' {state.returnArgumentListEnd(@1); } +returnArgumentList : returnArgumentListBegin argumentListBody returnArgumentListEnd; +optionalReturnArgumentList : returnArgumentList | /* empty string */; +visibleFunctionDeclaration: TOKEN_VISIBLE TOKEN_FUNC optionalReturnArgumentList identifier argumentList +{ + state.visibleFunctionDeclaration($4, @1); +}; + +visibleInitializableDeclaration : + TOKEN_VISIBLE TOKEN_GLOBAL addressableVariablePrefix identifier arrayDimensionSet + { state.visibleInitializableDeclaration($4,@1); } +| TOKEN_VISIBLE TOKEN_GLOBAL addressableVariablePrefix identifier ';' + {state.arrayDimensions(0); state.visibleInitializableDeclaration($4,@1); } +| TOKEN_VISIBLE TOKEN_GLOBAL addressableVariablePrefix identifier '=' + {state.arrayDimensions(0); state.visibleInitializableDeclaration($4,@1); } + + +%% + +int yylex( YYSTYPE* token, YYLTYPE* location, parser::PTXLexer& lexer, + parser::PTXParser& state ) +{ + lexer.yylval = token; + + int tokenValue = lexer.yylexPosition(); + location->first_line = lexer.lineno(); + location->first_column = lexer.column; + +#if 0 + report( " Lexer (" << location->first_line << "," + << location->first_column + << "): " << parser::PTXLexer::toString( tokenValue ) << " \"" + << lexer.YYText() << "\""); +#endif + + return tokenValue; +} + +static std::string toString( YYLTYPE& location, parser::PTXParser& state ) +{ + std::stringstream stream; + stream +#if 0 + << state.fileName +#else + << "ptx " +#endif + << " (" << location.first_line << ", " + << location.first_column << "): "; + return stream.str(); +} + +void yyerror( YYLTYPE* location, parser::PTXLexer& lexer, + parser::PTXParser& state, char const* message ) +{ + std::stringstream stream; + stream << toString( *location, state ) + << " " << message; + fprintf(stderr, "--Parser ERROR-- %s %s \n", toString(*location, state).c_str(), message); + exit(-1); +} + +} diff --git a/run_tests.py b/run_tests.py index 671ad416..081c85b5 100755 --- a/run_tests.py +++ b/run_tests.py @@ -258,7 +258,7 @@ def run_test(testname): if should_fail: cc_cmd += " -DEXPECT_FAILURE" if is_nvptx_target: - nvptxcc_exe = "nvptxcc" + nvptxcc_exe = "ptxtools/alloy_ptxcc.sh" nvptxcc_exe_rel = add_prefix(nvptxcc_exe) cc_cmd = "%s %s -DTEST_SIG=%d -o %s" % \ (nvptxcc_exe_rel, obj_name, match, exe_name) @@ -270,7 +270,7 @@ def run_test(testname): if is_generic_target: ispc_cmd += " --emit-c++ --c++-include-file=%s" % add_prefix(options.include_file) if is_nvptx_target: - filename4ptx = filename+".ptx.parsed_ispc" + filename4ptx = "/tmp/"+os.path.basename(filename)+".parsed.ispc" grep_cmd = "grep -v 'export uniform int width' %s > %s " % \ (filename, filename4ptx) if options.verbose: