diff --git a/builtins/target-nvptx.ll b/builtins/target-nvptx.ll
index 43a16987..1901ba5e 100644
--- a/builtins/target-nvptx.ll
+++ b/builtins/target-nvptx.ll
@@ -705,6 +705,19 @@ define  float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline
   ret float %ret
 }
 
+define  double @__rsqrt_uniform_double(double) nounwind readonly alwaysinline 
+{
+  %ret1 = call double @llvm.sqrt.f64(double %0)
+  %ret  = fdiv double 1., %ret1
+  ret double %ret
+}
+define  double @__rcp_uniform_double(double) nounwind readonly alwaysinline 
+{
+  %ret  = fdiv double 1., %0
+  ret double %ret
+}
+
+
 define <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readnone  alwaysinline
 {
   %v = extractelement <1 x float> %0, i32 0
@@ -712,6 +725,13 @@ define <WIDTH x float> @__rcp_varying_float(<WIDTH x float>) nounwind readnone
   %rv = insertelement <1 x float> undef, float %r, i32 0 
   ret <WIDTH x float> %rv
 }
+define <WIDTH x double> @__rcp_varying_double(<WIDTH x double>) nounwind readnone  alwaysinline
+{
+  %v = extractelement <1 x double> %0, i32 0
+  %r = call double @__rcp_uniform_double(double %v)
+  %rv = insertelement <1 x double> undef, double %r, i32 0 
+  ret <WIDTH x double> %rv
+}
 define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float>) nounwind readnone alwaysinline
 {
   %v = extractelement <1 x float> %0, i32 0
@@ -719,6 +739,14 @@ define <WIDTH x float> @__rsqrt_varying_float(<WIDTH x float>) nounwind readnone
   %rv = insertelement <1 x float> undef, float %r, i32 0 
   ret <WIDTH x float> %rv
 }
+define <WIDTH x double> @__rsqrt_varying_double(<WIDTH x double>) nounwind readnone alwaysinline
+{
+  %v = extractelement <1 x double> %0, i32 0
+  %r = call double @__rsqrt_uniform_double(double %v)
+  %rv = insertelement <1 x double> undef, double %r, i32 0 
+  ret <WIDTH x double> %rv
+}
+
 define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone alwaysinline
 {
   %v = extractelement <1 x float> %0, i32 0
@@ -726,7 +754,6 @@ define <WIDTH x float> @__sqrt_varying_float(<WIDTH x float>) nounwind readnone
   %rv = insertelement <1 x float> undef, float %r, i32 0 
   ret <WIDTH x float> %rv
 }
-
 ;; declare double @__sqrt_uniform_double(double) nounwind readnone
 define  double @__sqrt_uniform_double(double) nounwind readonly alwaysinline {
   %ret = call double @llvm.sqrt.f64(double %0)
diff --git a/examples/cuda_helpers.cuh b/examples/cuda_helpers.cuh
new file mode 100644
index 00000000..2c6ec6dc
--- /dev/null
+++ b/examples/cuda_helpers.cuh
@@ -0,0 +1,25 @@
+#pragma once
+
+#define programCount 32
+#define programIndex (threadIdx.x & 31)
+#define taskIndex0 (blockIdx.x*4 + (threadIdx.x >> 5))
+#define taskCount0 (gridDim.x*4)
+#define taskIndex1 (blockIdx.y)
+#define taskCount1 (gridDim.y)
+#define taskIndex2 (blockIdx.z)
+#define taskCount2 (gridDim.z)
+#define taskIndex (taskIndex0 + taskCount0*(taskIndex1 + taskCount1*taskIndex2))
+#define taskCount (taskCount0*taskCount1*taskCount2)
+#define warpIdx (threadIdx.x >> 5)
+#define launch(ntx,nty,ntz,func) if (programIndex==0) func<<<dim3(((ntx)+4-1)/4,nty,ntz),128>>>
+#define sync cudaDeviceSynchronize()
+#define cif if
+__device__ __forceinline__ static double __shfl(double x, int lane)
+{
+  return __hiloint2double(
+      __shfl_xor(__double2hiint(x), lane),
+      __shfl_xor(__double2loint(x), lane));
+
+}
+#define shuffle(x,y) __shfl(x,y)
+#define broadcast(x,y) __shfl(x,y)
diff --git a/examples/ispc_malloc.cpp b/examples/ispc_malloc.cpp
new file mode 100644
index 00000000..dcbe5d48
--- /dev/null
+++ b/examples/ispc_malloc.cpp
@@ -0,0 +1,54 @@
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <cstring>
+#include "ispc_malloc.h"
+
+#ifdef _CUDA_
+
+void * operator new(size_t size) throw(std::bad_alloc)
+{
+  void *ptr;
+  ispc_malloc(&ptr, size);
+  return ptr;
+}
+void operator delete(void *ptr) throw()
+{
+  ispc_free(ptr);
+}
+
+#else
+
+void ispc_malloc(void **ptr, const size_t size)
+{
+  *ptr = malloc(size);
+}
+void ispc_free(void *ptr)
+{
+  free(ptr);
+}
+void ispc_memset(void *ptr, int value, size_t size)
+{
+  memset(ptr, value, size);
+}
+void ispcSetMallocHeapLimit(size_t value)
+{
+}
+void ispcSetStackLimit(size_t value)
+{
+}
+unsigned long long ispcGetMallocHeapLimit()
+{
+  return -1;
+}
+unsigned long long ispcGetStackLimit()
+{
+  return -1;
+}
+void * ispcMemcpy(void *dest,  void *src,  size_t num)
+{
+  memcpy(dest, src, num);
+  return dest;
+}
+
+#endif
diff --git a/examples/ispc_malloc.h b/examples/ispc_malloc.h
new file mode 100644
index 00000000..1d63f602
--- /dev/null
+++ b/examples/ispc_malloc.h
@@ -0,0 +1,10 @@
+#pragma once
+
+extern void ispc_malloc(void **ptr, const size_t size);
+extern void ispc_free(void *ptr);
+extern void ispc_memset(void *ptr, int value, size_t size);
+extern void ispcSetMallocHeapLimit(size_t value);
+extern void ispcSetStackLimit(size_t value);
+extern unsigned long long ispcGetMallocHeapLimit();
+extern unsigned long long ispcGetStackLimit();
+extern void * ispcMemcpy(void *dest,  void *src,  size_t num);
diff --git a/examples/nvcc_helpers.cu b/examples/nvcc_helpers.cu
new file mode 100644
index 00000000..e6faea96
--- /dev/null
+++ b/examples/nvcc_helpers.cu
@@ -0,0 +1,43 @@
+#ifndef _CUDA_
+#error "Something went wrong..."
+#endif
+
+void ispc_malloc(void **ptr, const size_t size)
+{
+  cudaMallocManaged(ptr, size);
+}
+void ispc_free(void *ptr)
+{
+  cudaFree(ptr);
+}
+void ispc_memset(void *ptr, int value, size_t size)
+{
+  cudaMemset(ptr, value, size);
+}
+void ispcSetMallocHeapLimit(size_t value)
+{
+  cudaDeviceSetLimit(cudaLimitMallocHeapSize,value);
+}
+void ispcSetStackLimit(size_t value)
+{
+  cudaDeviceSetLimit(cudaLimitStackSize,value);
+}
+unsigned long long ispcGetMallocHeapLimit()
+{
+  size_t value;
+  cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize);
+  return value;
+}
+unsigned long long ispcGetStackLimit()
+{
+  size_t value;
+  cudaDeviceGetLimit(&value, cudaLimitStackSize);
+  return value;
+}
+void * ispcMemcpy(void *dest,  void *src,  size_t num)
+{
+  cudaMemcpy(dest, src, num, cudaMemcpyDefault);
+  return dest;
+}
+
+
diff --git a/nvptxcc b/nvptxcc
deleted file mode 100755
index 81d622e9..00000000
--- a/nvptxcc
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/sh
-
-PATH=$ISPC_HOME/examples_ptx/ptxcc:$ISPC_HOME/examples_ptx/ptxgen:$PATH
-PTXCC=ptxcc
-ARGS=${@:2}
-if [ "$NVVM" == "1" ];
-then
-  LLVM32=$HOME/usr/local/llvm/bin-3.2
-  LLVMDIS=$LLVM32/bin/llvm-dis
-  PTXGEN=$ISPC_HOME/examples_ptx/ptxgen/ptxgen
-  $($LLVMDIS $1 -o $1.ll) && $($PTXGEN $1.ll > $1.ptx) && \
-  $($PTXCC $1.ptx -o $1.o -Xnvcc="-G") && \
-  $(nvcc test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $1.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt $ARGS);
-else
-  $($PTXCC $1 -o $1.o -Xnvcc="-G") && \
-  $(nvcc test_static_nvptx.cpp examples_ptx/nvcc_helpers.cu examples_ptx/ispc_malloc.cpp $1.o -arch=sm_35 -Iexamples_ptx/ -D_CUDA_ -lcudadevrt $ARGS);
-fi
-
-
-
diff --git a/ptxtools/Makefile b/ptxtools/Makefile
new file mode 100644
index 00000000..66a108c6
--- /dev/null
+++ b/ptxtools/Makefile
@@ -0,0 +1,49 @@
+all: ptxcc ptxgen
+
+CXX=clang++
+CXXFLAGS += -O3
+CXXFLAGS += -I/opt/local/include
+
+LD=clang++
+LDFLAGS += -L/opt/local/lib
+
+FLEX=flex
+BISON=bison
+
+CUDATK=/usr/local/cuda
+LIBDEVICE_MAJOR=1
+LIBDEVICE_MINOR=0
+
+ptxgrammar.cc : ptxgrammar.yy
+	$(BISON) -d -v -t  ptxgrammar.yy -o ptxgrammar.cc
+
+ptx.cc: ptx.ll ptxgrammar.cc
+	$(FLEX) -t ptx.ll > ptx.cc
+
+%.o: %.cc
+	$(CXX) $(CXXFLAGS) -c $< -o $@ 
+
+%.o: %.cpp
+	$(CXX) $(CXXFLAGS) -c $< -o $@ 
+
+OBJ= ptxcc.o \
+     ptx.o \
+		 ptxgrammar.o
+
+ptxcc: $(OBJ)
+	$(LD) $(LDFLAGS) $^ -o $@ 
+
+ptxgen: ptxgen.cpp
+	$(CXX) $(CXXFLAGS) -o $@ $<  \
+	  -L$(CUDATK)/nvvm/lib64 -lnvvm \
+		-I$(CUDATK)/nvvm/include \
+		-I$(CUDATK)/include   \
+		-DLIBDEVICE_MAJOR_VERSION=$(LIBDEVICE_MAJOR) \
+		-DLIBDEVICE_MINOR_VERSION=$(LIBDEVICE_MINOR) \
+		-DLIBNVVM_HOME=$(CUDATK)/nvvm -Wl,-rpath,$(CUDATK)/nvvm/lib64
+
+clean: 
+	/bin/rm -f ptxgen ptxcc $(OBJ) ptxgrammar.hh ptxgrammar.cc ptx.cc ptxgrammar.output
+
+$(OBJ):  ptxgrammar.cc ptx.cc PTXParser.h PTXLexer.h
+
diff --git a/ptxtools/PTXLexer.h b/ptxtools/PTXLexer.h
new file mode 100644
index 00000000..1136fe4d
--- /dev/null
+++ b/ptxtools/PTXLexer.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <cstring>
+#include <cassert>
+
+namespace parser
+{
+  class PTXLexer;
+  class PTXParser;
+}
+
+#include "ptxgrammar.hh"
+
+namespace parser
+{
+	/*!	\brief A wrapper around yyFlexLexer to allow for a local variable */
+	class PTXLexer : public ptxFlexLexer
+	{
+		public:
+			YYSTYPE*     yylval;
+			int          column;
+			int          nextColumn;
+
+		public:
+			PTXLexer( std::istream* arg_yyin, 
+				std::ostream* arg_yyout ) :
+        yyFlexLexer( arg_yyin, arg_yyout ), yylval( 0 ), column( 0 ), 
+        nextColumn( 0 ) { }
+	
+			int yylex();
+      int yylexPosition()
+      {
+        int token = yylex();
+        column = nextColumn;
+        nextColumn = column + strlen( YYText() );
+        return token;
+      }
+
+  };
+}
diff --git a/ptxtools/PTXParser.h b/ptxtools/PTXParser.h
new file mode 100644
index 00000000..d221d739
--- /dev/null
+++ b/ptxtools/PTXParser.h
@@ -0,0 +1,254 @@
+#pragma once
+
+#undef yyFlexLexer
+#define yyFlexLexer ptxFlexLexer
+#include <FlexLexer.h>
+
+
+#include "PTXLexer.h"
+
+#include <vector>
+#include <sstream>
+#include <string>
+namespace ptx
+{ 
+	extern int yyparse( parser::PTXLexer&, parser::PTXParser& );
+}
+
+namespace parser
+{
+  /*! \brief An implementation of the Parser interface for PTX */
+  class PTXParser 
+  {
+    private:
+    typedef int token_t;
+    std::ostream &out;
+    std::string _identifier;
+    token_t _dataTypeId;
+    int _alignment;
+
+    bool isArgumentList, isReturnArgumentList;
+    struct  argument_t
+    {
+      token_t type;
+      std::string name;
+      int dim;
+
+      argument_t(const token_t _type, const std::string &_name, const int _dim = 1) :
+        type(_type), name(_name), dim(_dim) {}
+    };
+    std::vector<argument_t> argumentList, returnArgumentList;
+    std::vector<int> arrayDimensionsList;
+
+    public:
+    PTXParser(std::ostream &_out) : out(_out)
+    {
+      isArgumentList = isReturnArgumentList = false;
+      _alignment = 1;
+    }
+
+    void printHeader()
+    {
+      std::stringstream s;
+#if 0
+      s << "template<int N> struct __align__(N)   b8_t  { unsigned char  _v[N]; __device__ b8_t()  {}; __device__ b8_t (const int value) {}}; \n";
+      s << "template<int N> struct __align__(2*N) b16_t { unsigned short _v[N]; __device__ b16_t() {}; __device__ b16_t(const int value) {}}; \n";
+#else
+      s << "template<int N> struct b8_t  { unsigned char  _v[N]; __device__ b8_t()  {}; __device__ b8_t (const int value) {}}; \n";
+      s << "template<int N> struct b16_t { unsigned short _v[N]; __device__ b16_t() {}; __device__ b16_t(const int value) {}}; \n";
+#endif
+      s << "struct b8d_t  { unsigned char  _v[1]; }; \n";
+      s << "struct b16d_t { unsigned short _v[1]; }; \n";
+
+      s << "typedef unsigned int       b32_t; \n";
+      s << "typedef unsigned int       u32_t; \n";
+      s << "typedef int                s32_t; \n";
+
+      s << "typedef unsigned long long b64_t; \n";
+      s << "typedef unsigned long long u64_t; \n";
+      s << "typedef long long          s64_t; \n";
+
+      s << "typedef float              f32_t; \n";
+      s << "typedef double             f64_t; \n";
+      s << " \n";
+      out << s.str();
+    }
+
+#define LOC YYLTYPE& location
+
+    void identifier(const std::string &s) { _identifier = s;     }
+    void dataTypeId(const token_t token)  { _dataTypeId = token; }
+    void argumentListBegin(LOC) { isArgumentList = true;  }
+    void argumentListEnd  (LOC) { isArgumentList = false; }
+    void returnArgumentListBegin(LOC) { isReturnArgumentList = true;  }
+    void returnArgumentListEnd  (LOC) { isReturnArgumentList = false; }
+    void argumentDeclaration(LOC) 
+    {
+      assert(arrayDimensionsList.size() <= 1);
+      const int dim = arrayDimensionsList.empty() ? 1 : arrayDimensionsList[0];
+      const argument_t arg(_dataTypeId, _identifier, dim);
+      if (isArgumentList)
+        argumentList.push_back(arg);
+      else if (isReturnArgumentList)
+        returnArgumentList.push_back(arg);
+      else
+        assert(0);
+      arrayDimensionsList.clear();
+    }
+    void alignment(const int value) { _alignment = value; }
+
+    void arrayDimensions(const int value)
+    {
+      arrayDimensionsList.push_back(value);
+    }
+
+    std::string printArgument(const argument_t arg, const bool printDataType = true)
+    {
+      std::stringstream s;
+      if (printDataType) 
+        s << tokenToDataType(arg.type, arg.dim) << " ";
+      s << arg.name << " ";
+      return s.str();
+    }
+
+    std::string printArgumentList(const bool printDataType = true)
+    {
+      std::stringstream s;
+      if (argumentList.empty()) return s.str();
+      const int n = argumentList.size();
+      s << " " << printArgument(argumentList[0], printDataType);
+      for (int i = 1; i < n; i++)
+        s << ",\n " <<  printArgument(argumentList[i], printDataType);
+      return s.str();
+    }
+
+    void visibleEntryDeclaration(const std::string &calleeName, LOC) 
+    {
+      std::stringstream s;
+      assert(returnArgumentList.empty());
+      s << "extern \"C\" \n";
+      s << "__global__ void " << calleeName << " (\n";
+      s << printArgumentList();
+      s << "\n ) { asm(\" // entry \"); }\n";
+
+     
+      /* check if this is an "export"  entry */
+      const int entryNameLength = calleeName.length();
+      const int hostNameLength = std::max(entryNameLength-9,0);
+      const std::string ___export(&calleeName.c_str()[hostNameLength]);
+      if (___export.compare("___export") == 0)
+      {
+        std::string hostCalleeName;
+        hostCalleeName.append(calleeName.c_str(), hostNameLength);
+        s << "/*** host interface ***/\n";
+        s << "extern \"C\" \n";
+        s << "__host__ void " << hostCalleeName << " (\n";
+        s << printArgumentList();
+        s << "\n )\n";
+        s << "{\n   ";
+//        s << " cudaFuncSetCacheConfig (" << calleeName << ", ";
+        s << " cudaDeviceSetCacheConfig (";
+#if 1
+        s << " cudaFuncCachePreferEqual ";
+#elif 1
+        s << " cudaFuncCachePreferL1 ";
+#else
+        s << " cudaFuncCachePreferShared ";
+#endif
+        s << ");\n";
+        s << calleeName;
+        s << "<<<1,32>>>(\n";
+        s << printArgumentList(false);
+        s << ");\n";
+        s << " cudaDeviceSynchronize(); \n";
+        s << "}\n";
+      }
+      s << "\n";
+      argumentList.clear();
+
+      out << s.str();
+    }
+    
+    void visibleFunctionDeclaration(const std::string &calleeName, LOC) 
+    {
+      std::stringstream s;
+      assert(returnArgumentList.size() < 2);
+      s << "extern \"C\" \n";
+      s << "__device__ ";
+      if (returnArgumentList.empty())
+        s << " void ";
+      else
+        s << " " <<  tokenToDataType(returnArgumentList[0].type, returnArgumentList[0].dim);
+      s << calleeName << " (\n";
+      s << printArgumentList();
+
+      if (returnArgumentList.empty())
+        s << "\n ) { asm(\" // function \"); }\n\n";
+      else
+      {
+        s << "\n ) { asm(\" // function \"); return 0;} /* return value to disable warnings */\n\n";
+//        s << "\n ) { asm(\" // function \"); } /* this will generate warrning */\n\n";
+      }
+
+      argumentList.clear();
+      returnArgumentList.clear();
+
+      out << s.str();
+    }
+
+    void visibleInitializableDeclaration(const std::string &name, LOC)
+    {
+      assert(arrayDimensionsList.size() == 1);
+      std::stringstream s;
+      s << "extern \"C\" __device__ ";
+      if (_alignment > 0)
+        s << "__attribute__((aligned(" << _alignment << "))) ";
+      s << tokenToDataType(_dataTypeId, 0);
+      if (arrayDimensionsList[0] == 0)
+        s << name << ";\n\n";
+      else
+        s << name << "[" << arrayDimensionsList[0] << "] = {0};\n\n";
+      out << s.str();
+      arrayDimensionsList.clear();
+    }
+
+#undef LOC
+
+    std::string tokenToDataType( token_t token , int dim)
+    {
+      std::stringstream s;
+      switch( token )
+      {
+        case TOKEN_B8:  
+          if (dim > 0)   s << "b8_t<"<<dim<<"> "; 
+          else           s << "b8d_t ";
+          break;
+        case TOKEN_U8:  assert(0); s << "u8_t "; break;
+        case TOKEN_S8:  assert(0); s << "s8_t "; break;
+                         
+        case TOKEN_B16: 
+          if (dim > 0)   s << "b16_t<"<<dim<<"> "; 
+          else           s << "b16d_t ";
+          break;
+        case TOKEN_U16: assert(0); s << "u16_t "; break;
+        case TOKEN_S16: assert(0); s << "s16_t "; break;
+
+        case TOKEN_B32:  assert(dim <= 1); s << "b32_t "; break;
+        case TOKEN_U32:  assert(dim <= 1); s << "u32_t "; break;
+        case TOKEN_S32:  assert(dim <= 1); s << "s32_t "; break;
+
+        case TOKEN_B64:  assert(dim <= 1); s << "b64_t "; break;
+        case TOKEN_U64:  assert(dim <= 1); s << "u64_t "; break;
+        case TOKEN_S64:  assert(dim <= 1); s << "s64_t "; break;
+
+        case TOKEN_F32:  assert(dim <= 1); s << "f32_t "; break;
+        case TOKEN_F64:  assert(dim <= 1); s << "f64_t "; break;
+        default: std::cerr << "token= " << token<< std::endl; assert(0);
+      }
+
+      return s.str();
+    }
+  };
+}
+
+
diff --git a/ptxtools/alloy_ptxcc.sh b/ptxtools/alloy_ptxcc.sh
new file mode 100755
index 00000000..d98f8658
--- /dev/null
+++ b/ptxtools/alloy_ptxcc.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+PTXCC=$ISPC_HOME/ptxtools/ptxcc
+PTXGEN=$ISPC_HOME/ptxtools/ptxgen
+ARGS=${@:2}
+if [ "$NVVM" == "1" ];
+then
+  LLVM32=$HOME/usr/local/llvm/bin-3.2
+  LLVMDIS=$LLVM32/bin/llvm-dis
+  $($LLVMDIS $1 -o $1.ll) && $($PTXGEN $1.ll -o $1.ptx) && \
+  $($PTXCC $1.ptx -o $1.o -Xnvcc="-G") && \
+  $(nvcc test_static_nvptx.cpp examples/nvcc_helpers.cu examples/ispc_malloc.cpp $1.o -arch=sm_35 -Iexamples/ -D_CUDA_ -lcudadevrt $ARGS);
+else
+  $($PTXCC $1 -o $1.o -Xnvcc="-G") && \
+  $(nvcc test_static_nvptx.cpp examples/nvcc_helpers.cu examples/ispc_malloc.cpp $1.o -arch=sm_35 -Iexamples/ -D_CUDA_ -lcudadevrt $ARGS);
+fi
+
+
+
diff --git a/ptxtools/main.cpp b/ptxtools/main.cpp
new file mode 100644
index 00000000..f5de981d
--- /dev/null
+++ b/ptxtools/main.cpp
@@ -0,0 +1,1458 @@
+/*
+
+Bonsai V2: A parallel GPU N-body gravitational Tree-code
+
+(c) 2010-2012:
+Jeroen Bedorf
+Evghenii Gaburov
+Simon Portegies Zwart
+
+Leiden Observatory, Leiden University
+
+http://castle.strw.leidenuniv.nl
+http://github.com/treecode/Bonsai
+
+*/
+
+#ifdef WIN32
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <Windows.h>
+#include <process.h>
+#define M_PI        3.14159265358979323846264338328
+
+#include <stdlib.h>
+#include <time.h>
+void srand48(const long seed)
+{
+  srand(seed);
+}
+//JB This is not a proper work around but just to get things compiled...
+double drand48()
+{
+  return double(rand())/RAND_MAX;
+}
+
+
+#endif
+
+
+#ifdef USE_MPI
+  #include <omp.h>
+  #include <mpi.h>
+#endif
+
+#include <iostream>
+#include <stdlib.h>
+#include <vector>
+#include <fstream>
+#include <sstream>
+#include "log.h"
+#include "anyoption.h"
+#include "renderloop.h"
+#include "plummer.h"
+#include "disk_shuffle.h"
+#ifdef GALACTICS
+#include "galactics.h"
+#endif
+
+
+#if ENABLE_LOG
+  bool ENABLE_RUNTIME_LOG;
+  bool PREPEND_RANK;
+  int  PREPEND_RANK_PROCID;
+  int  PREPEND_RANK_NPROCS;
+#endif
+
+using namespace std;
+
+#include "../profiling/bonsai_timing.h"
+
+int devID;
+int renderDevID;
+
+extern void initTimers()
+{
+#ifndef CUXTIMER_DISABLE
+  // Set up the profiling timing info
+  build_tree_init();
+  compute_propertiesD_init();
+  dev_approximate_gravity_init();
+  parallel_init();
+  sortKernels_init();
+  timestep_init();
+#endif
+}
+
+extern void displayTimers()
+{
+#ifndef CUXTIMER_DISABLE
+  // Display all timing info on the way out
+  build_tree_display();
+  compute_propertiesD_display();
+  //dev_approximate_gravity_display();
+  //parallel_display();
+  //sortKernels_display();
+  //timestep_display();
+#endif
+}
+
+#include "octree.h"
+
+#ifdef USE_OPENGL
+#include "renderloop.h"
+#include <cuda_gl_interop.h>
+#endif
+
+void read_dumbp_file_parallel(vector<real4> &bodyPositions, vector<real4> &bodyVelocities,  vector<int> &bodiesIDs,  float eps2,
+                     string fileName, int rank, int procs, int &NTotal2, int &NFirst, int &NSecond, int &NThird, octree *tree, int reduce_bodies_factor)  
+{
+  //Process 0 does the file reading and sends the data
+  //to the other processes
+  
+  //Now we have different types of files, try to determine which one is used
+  /*****
+  If individual softening is on there is only one option:
+  Header is formatted as follows: 
+  N     #       #       #
+  so read the first number and compute how particles should be distributed
+  
+  If individual softening is NOT enabled, i can be anything, but for ease I assume standard dumbp files:
+  no Header
+  ID mass x y z vx vy vz
+  now the next step is risky, we assume mass adds up to 1, so number of particles will be : 1 / mass
+  use this as initial particle distribution
+  
+  */
+  
+  
+  char fullFileName[256];
+  sprintf(fullFileName, "%s", fileName.c_str());
+
+  LOG("Trying to read file: %s \n", fullFileName);
+
+  ifstream inputFile(fullFileName, ios::in);
+
+  if(!inputFile.is_open())
+  {
+    LOG("Can't open input file \n");
+    exit(0);
+  }
+  
+  int NTotal;
+  int idummy;
+  real4 positions;
+  real4 velocity;
+
+  #ifndef INDSOFT
+     inputFile >> idummy >> positions.w;
+     inputFile.seekg(0, ios::beg); //Reset file pointer
+     NTotal = (int)(1 / positions.w);
+  #else
+     //Read the Ntotal from the file header
+     inputFile >> NTotal >> NFirst >> NSecond >> NThird;
+  #endif
+  
+  
+  
+  //Rough divide
+  uint perProc = NTotal / procs;
+  bodyPositions.reserve(perProc+10);
+  bodyVelocities.reserve(perProc+10);
+  bodiesIDs.reserve(perProc+10);
+  perProc -= 1;
+
+  //Start reading
+  int particleCount = 0;
+  int procCntr = 1;
+
+  int globalParticleCount = 0;
+
+  while(!inputFile.eof()) {
+    
+    inputFile >> idummy
+              >> positions.w >> positions.x >> positions.y >> positions.z
+              >> velocity.x >> velocity.y >> velocity.z;    
+
+	globalParticleCount++;
+
+	if( globalParticleCount % reduce_bodies_factor == 0 ) 
+		positions.w *= reduce_bodies_factor;
+
+	if( globalParticleCount % reduce_bodies_factor != 0 )
+		continue;
+
+    #ifndef INDSOFT
+      velocity.w = sqrt(eps2);
+    #else
+      inputFile >> velocity.w; //Read the softening from the input file
+    #endif
+    
+    bodyPositions.push_back(positions);
+    bodyVelocities.push_back(velocity);
+    
+    #ifndef INDSOFT    
+      idummy = particleCount;
+    #endif
+    
+    bodiesIDs.push_back(idummy);  
+    
+    particleCount++;
+  
+  
+    if(bodyPositions.size() > perProc && procCntr != procs)
+    {       
+      tree->ICSend(procCntr,  &bodyPositions[0], &bodyVelocities[0],  &bodiesIDs[0], (int)bodyPositions.size());
+      procCntr++;
+      
+      bodyPositions.clear();
+      bodyVelocities.clear();
+      bodiesIDs.clear();
+    }
+  }//end while
+  
+  inputFile.close();
+  
+  //Clear the last one since its double
+  bodyPositions.resize(bodyPositions.size()-1);  
+  NTotal2 = particleCount-1;
+  
+  LOGF(stderr, "NTotal:  %d\tper proc: %d\tFor ourself: %d \n", NTotal, perProc, (int)bodiesIDs.size());
+}
+
+void read_tipsy_file_parallel(vector<real4> &bodyPositions, vector<real4> &bodyVelocities,
+                              vector<int> &bodiesIDs,  float eps2, string fileName, 
+                              int rank, int procs, int &NTotal2, int &NFirst, 
+                              int &NSecond, int &NThird, octree *tree,
+                              vector<real4> &dustPositions, vector<real4> &dustVelocities,
+                              vector<int> &dustIDs, int reduce_bodies_factor,
+                              int reduce_dust_factor)  
+{
+  //Process 0 does the file reading and sends the data
+  //to the other processes
+  /* 
+
+     Read in our custom version of the tipsy file format.
+     Most important change is that we store particle id on the 
+     location where previously the potential was stored.
+  */
+  
+  
+  char fullFileName[256];
+  sprintf(fullFileName, "%s", fileName.c_str());
+
+  LOG("Trying to read file: %s \n", fullFileName);
+  
+  
+  
+  ifstream inputFile(fullFileName, ios::in | ios::binary);
+  if(!inputFile.is_open())
+  {
+    LOG("Can't open input file \n");
+    exit(0);
+  }
+  
+  dump  h;
+  inputFile.read((char*)&h, sizeof(h));  
+
+  int NTotal;
+  int idummy;
+  real4 positions;
+  real4 velocity;
+
+     
+  //Read tipsy header  
+  NTotal        = h.nbodies;
+  NFirst        = h.ndark;
+  NSecond       = h.nstar;
+  NThird        = h.nsph;
+
+  tree->set_t_current((float) h.time);
+  
+  //Rough divide
+  uint perProc = (NTotal / procs) /reduce_bodies_factor;
+  bodyPositions.reserve(perProc+10);
+  bodyVelocities.reserve(perProc+10);
+  bodiesIDs.reserve(perProc+10);
+  perProc -= 1;
+
+  //Start reading
+  int particleCount = 0;
+  int procCntr = 1;
+  
+  dark_particle d;
+  star_particle s;
+
+  int globalParticleCount = 0;
+  int bodyCount = 0;
+  int dustCount = 0;
+  
+  for(int i=0; i < NTotal; i++)
+  {
+    if(i < NFirst)
+    {
+      inputFile.read((char*)&d, sizeof(d));
+      velocity.w        = d.eps;
+      positions.w       = d.mass;
+      positions.x       = d.pos[0];
+      positions.y       = d.pos[1];
+      positions.z       = d.pos[2];
+      velocity.x        = d.vel[0];
+      velocity.y        = d.vel[1];
+      velocity.z        = d.vel[2];
+      idummy            = d.phi;
+    }
+    else
+    {
+      inputFile.read((char*)&s, sizeof(s));
+      velocity.w        = s.eps;
+      positions.w       = s.mass;
+      positions.x       = s.pos[0];
+      positions.y       = s.pos[1];
+      positions.z       = s.pos[2];
+      velocity.x        = s.vel[0];
+      velocity.y        = s.vel[1];
+      velocity.z        = s.vel[2];
+      idummy            = s.phi;
+    }
+
+
+    if(positions.z < -10e10)
+    {
+       fprintf(stderr," Removing particle %d because of Z is: %f \n", globalParticleCount, positions.z);
+       continue;
+    }
+
+
+	globalParticleCount++;
+   
+    #ifdef USE_DUST
+      if(idummy >= 50000000 && idummy < 100000000)
+      {
+        dustCount++;
+        if( dustCount % reduce_dust_factor == 0 ) 
+          positions.w *= reduce_dust_factor;
+
+        if( dustCount % reduce_dust_factor != 0 )
+          continue;
+        dustPositions.push_back(positions);
+        dustVelocities.push_back(velocity);
+        dustIDs.push_back(idummy);      
+      }
+      else
+      {
+        bodyCount++;
+        if( bodyCount % reduce_bodies_factor == 0 ) 
+		      positions.w *= reduce_bodies_factor;
+
+	      if( bodyCount % reduce_bodies_factor != 0 )
+		      continue;
+        bodyPositions.push_back(positions);
+        bodyVelocities.push_back(velocity);
+        bodiesIDs.push_back(idummy);  
+      }
+
+    
+    #else
+      if( globalParticleCount % reduce_bodies_factor == 0 ) 
+        positions.w *= reduce_bodies_factor;
+
+      if( globalParticleCount % reduce_bodies_factor != 0 )
+        continue;
+      bodyPositions.push_back(positions);
+      bodyVelocities.push_back(velocity);
+      bodiesIDs.push_back(idummy);  
+    #endif
+    
+    particleCount++;
+  
+  
+    if(bodyPositions.size() > perProc && procCntr != procs)
+    { 
+      tree->ICSend(procCntr,  &bodyPositions[0], &bodyVelocities[0],  &bodiesIDs[0], (int)bodyPositions.size());
+      procCntr++;
+      
+      bodyPositions.clear();
+      bodyVelocities.clear();
+      bodiesIDs.clear();
+    }
+  }//end while
+  
+  inputFile.close();
+  
+  //Clear the last one since its double
+//   bodyPositions.resize(bodyPositions.size()-1);  
+//   NTotal2 = particleCount-1;
+  NTotal2 = particleCount;
+  LOGF(stderr,"NTotal: %d\tper proc: %d\tFor ourself: %d \tNDust: %d \n",
+               NTotal, perProc, (int)bodiesIDs.size(), (int)dustPositions.size());
+}
+
+
+void read_generate_cube(vector<real4> &bodyPositions, vector<real4> &bodyVelocities,
+                              vector<int> &bodiesIDs,  float eps2, string fileName, 
+                              int rank, int procs, int &NTotal2, int &NFirst, 
+                              int &NSecond, int &NThird, octree *tree,
+                              vector<real4> &dustPositions, vector<real4> &dustVelocities,
+                              vector<int> &dustIDs, int reduce_bodies_factor,
+                              int reduce_dust_factor)  
+{
+  //Process 0 does the file reading and sends the data
+  //to the other processes
+  /* 
+
+     Read in our custom version of the tipsy file format.
+     Most important change is that we store particle id on the 
+     location where previously the potential was stored.
+  */
+  
+
+  int NTotal;
+  int idummy;
+  real4 positions;
+  real4 velocity;
+
+     
+  //Read tipsy header  
+  NTotal        = (int)std::pow(2.0, 22);
+  NFirst        = NTotal;
+  NSecond       = 0;
+  NThird        = 0;
+
+  fprintf(stderr,"Going to generate a random cube , number of particles: %d \n", NTotal);
+
+  tree->set_t_current((float) 0);
+  
+  //Rough divide
+  uint perProc = NTotal / procs;
+  bodyPositions.reserve(perProc+10);
+  bodyVelocities.reserve(perProc+10);
+  bodiesIDs.reserve(perProc+10);
+  perProc -= 1;
+
+  //Start reading
+  int particleCount = 0;
+  int procCntr = 1;
+  
+
+  int globalParticleCount = 0;
+
+  float mass = 1.0  / NTotal;
+  
+  for(int i=0; i < NTotal; i++)
+  {
+      velocity.w        = 0;
+      positions.w       = mass;
+      positions.x       = drand48();
+      positions.y       = drand48();
+      positions.z       = drand48();
+      velocity.x        = 0.001*drand48();
+      velocity.y        = 0.001*drand48();
+      velocity.z        = 0.001*drand48();
+
+      globalParticleCount++;
+      bodyPositions.push_back(positions);
+      bodyVelocities.push_back(velocity);
+      bodiesIDs.push_back(globalParticleCount);  
+  
+    if(bodyPositions.size() > perProc && procCntr != procs)
+    { 
+      tree->ICSend(procCntr,  &bodyPositions[0], &bodyVelocities[0],  &bodiesIDs[0], (int)bodyPositions.size());
+      procCntr++;
+      
+      bodyPositions.clear();
+      bodyVelocities.clear();
+      bodiesIDs.clear();
+    }
+  }//end while
+  
+  //Clear the last one since its double
+//   bodyPositions.resize(bodyPositions.size()-1);  
+//   NTotal2 = particleCount-1;
+  NTotal2 = NTotal;
+  LOGF(stderr,"NTotal: %d\tper proc: %d\tFor ourself: %d \tNDust: %d \n",
+               NTotal, perProc, (int)bodiesIDs.size(), (int)dustPositions.size());
+}
+
+
+double rot[3][3];
+
+void rotmat(double i,double w)
+{
+    rot[0][0] = cos(w);
+    rot[0][1] = -cos(i)*sin(w);
+    rot[0][2] = -sin(i)*sin(w);
+    rot[1][0] = sin(w);
+    rot[1][1] = cos(i)*cos(w);
+    rot[1][2] = sin(i)*cos(w);
+    rot[2][0] = 0.0;
+    rot[2][1] = -sin(i);
+    rot[2][2] = cos(i);
+    fprintf(stderr,"%g %g %g\n",rot[0][0], rot[0][1], rot[0][2]);
+    fprintf(stderr,"%g %g %g\n",rot[1][0], rot[1][1], rot[1][2]);
+    fprintf(stderr,"%g %g %g\n",rot[2][0], rot[2][1], rot[2][2]);
+}
+
+void rotate(double rot[3][3],float *vin)
+{
+    static double vout[3];
+
+    for(int i=0; i<3; i++) {
+      vout[i] = 0;
+      for(int j=0; j<3; j++)
+        vout[i] += rot[i][j] * vin[j]; 
+      /* Remember the rotation matrix is the transpose of rot */
+    }
+    for(int i=0; i<3; i++)
+            vin[i] = (float) vout[i];
+}
+
+void euler(vector<real4> &bodyPositions,
+           vector<real4> &bodyVelocities,
+           double inc, double omega)
+{
+  rotmat(inc,omega);
+  size_t nobj = bodyPositions.size();
+  for(uint i=0; i < nobj; i++)
+  {
+      float r[3], v[3];
+      r[0] = bodyPositions[i].x;
+      r[1] = bodyPositions[i].y;
+      r[2] = bodyPositions[i].z;
+      v[0] = bodyVelocities[i].x;
+      v[1] = bodyVelocities[i].y;
+      v[2] = bodyVelocities[i].z;
+
+      rotate(rot,r);
+      rotate(rot,v);
+
+      bodyPositions[i].x = r[0]; 
+      bodyPositions[i].y = r[1]; 
+      bodyPositions[i].z = r[2]; 
+      bodyVelocities[i].x = v[0];
+      bodyVelocities[i].y = v[1];
+      bodyVelocities[i].z = v[2];
+  }
+}
+
+
+
+double centerGalaxy(vector<real4> &bodyPositions,
+                    vector<real4> &bodyVelocities)
+{
+    size_t nobj = bodyPositions.size();
+    float xc, yc, zc, vxc, vyc, vzc, mtot;
+  
+
+    mtot = 0;
+    xc = yc = zc = vxc = vyc = vzc = 0;
+    for(uint i=0; i< nobj; i++) {
+            xc   += bodyPositions[i].w*bodyPositions[i].x;
+            yc   += bodyPositions[i].w*bodyPositions[i].y;
+            zc   += bodyPositions[i].w*bodyPositions[i].z;
+            vxc  += bodyPositions[i].w*bodyVelocities[i].x;
+            vyc  += bodyPositions[i].w*bodyVelocities[i].y;
+            vzc  += bodyPositions[i].w*bodyVelocities[i].z;
+            mtot += bodyPositions[i].w;
+    }
+    xc /= mtot;
+    yc /= mtot;
+    zc /= mtot;
+    vxc /= mtot;
+    vyc /= mtot;
+    vzc /= mtot;
+    for(uint i=0; i< nobj; i++)
+    {
+      bodyPositions[i].x  -= xc;
+      bodyPositions[i].y  -= yc;
+      bodyPositions[i].z  -= zc;
+      bodyVelocities[i].x -= vxc;
+      bodyVelocities[i].y -= vyc;
+      bodyVelocities[i].z -= vzc;
+    }
+    
+    return mtot;
+}
+
+
+
+
+int setupMergerModel(vector<real4> &bodyPositions1,
+                     vector<real4> &bodyVelocities1,
+                     vector<int>   &bodyIDs1,
+                     vector<real4> &bodyPositions2,
+                     vector<real4> &bodyVelocities2,
+                     vector<int>   &bodyIDs2){
+        uint i;
+        double ds=1.0, vs, ms=1.0;
+        double mu1, mu2, vp;
+        double b=1.0, rsep=10.0;
+        double x, y, vx, vy, x1, y1, vx1, vy1 ,  x2, y2, vx2, vy2;
+        double theta, tcoll;
+        double inc1=0, omega1=0;
+        double inc2=0, omega2=0;
+        
+        
+        ds = 1.52;
+        ms = 1.0;
+        b = 10;
+        rsep = 168;
+        inc1 = 0;
+        omega1 = 0;
+        inc2 = 180;
+        omega2 = 0;
+
+
+        if(ds < 0)
+        {
+          cout << "Enter size ratio (for gal2): ";
+          cin >> ds;
+          cout << "Enter mass ratio (for gal2): ";
+          cin >> ms;
+          cout << "Enter relative impact parameter: ";
+          cin >> b;
+          
+          cout << "Enter initial separation: ";
+          cin >> rsep;
+          cout << "Enter Euler angles for first galaxy:\n";
+          cout << "Enter inclination: ";
+          cin >> inc1;
+          cout << "Enter omega: ";
+          cin >> omega1;
+          cout << "Enter Euler angles for second galaxy:\n";
+          cout << "Enter inclination: ";
+          cin >> inc2;
+          cout << "Enter omega: ";
+          cin >> omega2;
+        }
+
+
+        double inc1_inp, inc2_inp, om2_inp, om1_inp;
+        
+        inc1_inp = inc1;
+        inc2_inp = inc2;
+        om1_inp = omega1;
+        om2_inp = omega1;
+
+
+        inc1   *= M_PI/180.;
+        inc2   *= M_PI/180.;
+        omega1 *= M_PI/180.;
+        omega2 *= M_PI/180.;
+        omega1 += M_PI;
+
+        fprintf(stderr,"Size ratio: %f Mass ratio: %f \n", ds, ms);
+        fprintf(stderr,"Relative impact par: %f Initial sep: %f \n", b, rsep);
+        fprintf(stderr,"Euler angles first: %f %f Second: %f %f \n",
+                        inc1, omega1,inc2,omega2);
+
+        vs = sqrt(ms/ds); /* adjustment for internal velocities */
+
+
+        //Center everything in galaxy 1 and galaxy 2
+        double galaxyMass1 = centerGalaxy(bodyPositions1, bodyVelocities1);
+        double galaxyMass2 = centerGalaxy(bodyPositions2, bodyVelocities2);
+
+
+        galaxyMass2 = ms*galaxyMass2;             //Adjust total mass
+
+        mu1 =  galaxyMass2/(galaxyMass1 + galaxyMass2);
+        mu2 = -galaxyMass1/(galaxyMass1 + galaxyMass2);
+        
+        double m1 = galaxyMass1;
+        double m2 = galaxyMass2;
+
+        
+        /* Relative Parabolic orbit - anti-clockwise */
+        if( b > 0 ) {
+                vp = sqrt(2.0*(m1 + m2)/b);
+                x = 2*b - rsep;  y = -2*sqrt(b*(rsep-b));
+                vx = sqrt(b*(rsep-b))*vp/rsep; vy = b*vp/rsep;
+        }
+        else {
+                b = 0;
+                x = - rsep; y = 0.0;
+                vx = sqrt(2.0*(m1 + m2)/rsep); vy = 0.0;
+        }
+
+        /* Calculate collison time */
+        if( b > 0 ) {
+                theta = atan2(y,x);
+                tcoll = (0.5*tan(0.5*theta) + pow(tan(0.5*theta),3.0)/6.)*4*b/vp;
+                fprintf(stderr,"Collision time is t=%g\n",tcoll);
+        }
+        else {
+                tcoll = -pow(rsep,1.5)/(1.5*sqrt(2.0*(m1+m2)));
+                fprintf(stderr,"Collision time is t=%g\n",tcoll);
+        }
+
+        /* These are the orbital adjustments for a parabolic encounter */
+        /* Change to centre of mass frame */
+        x1  =  mu1*x;  x2   =  mu2*x;     
+        y1  =  mu1*y;  y2   =  mu2*y;
+        vx1 =  mu1*vx; vx2  =  mu2*vx;
+        vy1 =  mu1*vy; vy2  =  mu2*vy;
+
+
+        /* Rotate the galaxies */
+        euler(bodyPositions1, bodyVelocities1, inc1,omega1);
+        euler(bodyPositions2, bodyVelocities2, inc2,omega2);
+
+        for(i=0; i< bodyPositions1.size(); i++) {
+                bodyPositions1[i].x  = (float) (bodyPositions1[i].x  + x1);
+                bodyPositions1[i].y  = (float) (bodyPositions1[i].y  + y1);
+                bodyVelocities1[i].x = (float) (bodyVelocities1[i].x + vx1);
+                bodyVelocities1[i].y = (float) (bodyVelocities1[i].y + vy1);
+        }
+        /* Rescale and reset the second galaxy */
+        for(i=0; i< bodyPositions2.size(); i++) {
+                bodyPositions2[i].w = (float) ms*bodyPositions2[i].w;
+                bodyPositions2[i].x = (float) (ds*bodyPositions2[i].x + x2);
+                bodyPositions2[i].y = (float) (ds*bodyPositions2[i].y + y2);
+                bodyPositions2[i].z = (float) ds*bodyPositions2[i].z;
+                bodyVelocities2[i].x = (float) (vs*bodyVelocities2[i].x + vx2);
+                bodyVelocities2[i].y = (float) (vs*bodyVelocities2[i].y + vy2);
+                bodyVelocities2[i].z = (float) vs*bodyVelocities2[i].z;
+        }
+
+
+        //Put them into one 
+        bodyPositions1.insert(bodyPositions1.end(),  bodyPositions2.begin(), bodyPositions2.end());
+        bodyVelocities1.insert(bodyVelocities1.end(), bodyVelocities2.begin(), bodyVelocities2.end());
+        bodyIDs1.insert(bodyIDs1.end(), bodyIDs2.begin(), bodyIDs2.end());
+  
+
+        return 0;
+}
+
+
+long long my_dev::base_mem::currentMemUsage;
+long long my_dev::base_mem::maxMemUsage;
+
+int main(int argc, char** argv)
+{
+  my_dev::base_mem::currentMemUsage = 0;
+  my_dev::base_mem::maxMemUsage     = 0;
+
+  vector<real4> bodyPositions;
+  vector<real4> bodyVelocities;
+  vector<int>   bodyIDs;
+
+  vector<real4> dustPositions;
+  vector<real4> dustVelocities;
+  vector<int>   dustIDs;  
+  
+
+  float eps      = 0.05f;
+  float theta    = 0.75f;
+  float timeStep = 1.0f / 16.0f;
+  float tEnd     = 1;
+  int   iterEnd  = (1 << 30);
+  devID      = 0;
+  renderDevID = 0;
+
+  string fileName       =  "";
+  string logFileName    = "gpuLog.log";
+  string snapshotFile   = "snapshot_";
+  float snapshotIter     = -1;
+  float  remoDistance   = -1.0;
+  int    snapShotAdd    =  0;
+  int rebuild_tree_rate = 2;
+  int reduce_bodies_factor = 1;
+  int reduce_dust_factor = 1;
+  string fullScreenMode = "";
+  bool direct = false;
+  bool fullscreen = false;
+  bool displayFPS = false;
+  bool diskmode = false;
+  bool stereo   = false;
+
+#if ENABLE_LOG
+  ENABLE_RUNTIME_LOG = false;
+  PREPEND_RANK       = false;
+#endif
+
+#ifdef USE_OPENGL
+	TstartGlow = 0.0;
+	dTstartGlow = 1.0;
+#endif
+
+  int nPlummer  = -1;
+  int nSphere   = -1;
+  int nMilkyWay = -1;
+  int nMWfork   =  4;
+	/************** beg - command line arguments ********/
+#if 1
+	{
+		AnyOption opt;
+
+#define ADDUSAGE(line) {{std::stringstream oss; oss << line; opt.addUsage(oss.str());}}
+
+		ADDUSAGE(" ");
+		ADDUSAGE("Usage");
+		ADDUSAGE(" ");
+		ADDUSAGE(" -h  --help             Prints this help ");
+		ADDUSAGE(" -i  --infile #         Input snapshot filename ");
+		ADDUSAGE("     --logfile #        Log filename [" << logFileName << "]");
+		ADDUSAGE("     --dev #            Device ID [" << devID << "]");
+		ADDUSAGE("     --renderdev #      Rendering Device ID [" << renderDevID << "]");
+		ADDUSAGE(" -t  --dt #             time step [" << timeStep << "]");
+		ADDUSAGE(" -T  --tend #           N-body end time [" << tEnd << "]");
+		ADDUSAGE(" -I  --iend #           N-body end iteration [" << iterEnd << "]");
+		ADDUSAGE(" -e  --eps #            softening (will be squared) [" << eps << "]");
+		ADDUSAGE(" -o  --theta #          opening angle (theta) [" <<theta << "]");
+		ADDUSAGE("     --snapname #       snapshot base name (N-body time is appended in 000000 format) [" << snapshotFile << "]");
+		ADDUSAGE("     --snapiter #       snapshot iteration (N-body time) [" << snapshotIter << "]");
+		ADDUSAGE("     --rmdist #         Particle removal distance (-1 to disable) [" << remoDistance << "]");
+		ADDUSAGE("     --valueadd #       value to add to the snapshot [" << snapShotAdd << "]");
+		ADDUSAGE(" -r  --rebuild #        rebuild tree every # steps [" << rebuild_tree_rate << "]");
+		ADDUSAGE("     --reducebodies #   cut down bodies dataset by # factor ");
+#ifdef USE_DUST
+    ADDUSAGE("     --reducedust #     cut down dust dataset by # factor ");
+#endif
+#if ENABLE_LOG
+    ADDUSAGE("     --log              enable logging ");
+    ADDUSAGE("     --prepend-rank     prepend the MPI rank in front of the log-lines ");
+#endif
+        ADDUSAGE("     --direct           enable N^2 direct gravitation [" << (direct ? "on" : "off") << "]");
+#ifdef USE_OPENGL
+		ADDUSAGE("     --fullscreen #     set fullscreen mode string");
+    ADDUSAGE("     --displayfps       enable on-screen FPS display");
+		ADDUSAGE("     --Tglow  #         enable glow @ # Myr [" << TstartGlow << "]");
+		ADDUSAGE("     --dTglow  #        reach full brightness in @ # Myr [" << dTstartGlow << "]");
+		ADDUSAGE("     --stereo           enable stereo rendering");
+#endif
+
+
+		ADDUSAGE("     --plummer  #      use plummer model with # particles per proc");
+#ifdef GALACTICS
+		ADDUSAGE("     --milkyway #      use Milky Way model with # particles per proc");
+		ADDUSAGE("     --mwfork   #      fork Milky Way generator into # processes [" << nMWfork << "]");
+#endif
+		ADDUSAGE("     --sphere   #      use spherical model with # particles per proc");
+    ADDUSAGE("     --diskmode        use diskmode to read same input file all MPI taks and randomly shuffle its positions");
+		ADDUSAGE(" ");
+
+
+		opt.setFlag( "help" ,   'h');
+		opt.setFlag( "diskmode");
+		opt.setOption( "infile",  'i');
+		opt.setOption( "dt",      't' );
+		opt.setOption( "tend",    'T' );
+		opt.setOption( "iend",    'I' );
+		opt.setOption( "eps",     'e' );
+		opt.setOption( "theta",   'o' );
+		opt.setOption( "rebuild", 'r' );
+    opt.setOption( "plummer");
+#ifdef GALACTICS
+    opt.setOption( "milkyway");
+    opt.setOption( "mwfork");
+#endif
+    opt.setOption( "sphere");
+    opt.setOption( "dev" );
+    opt.setOption( "renderdev" );
+    opt.setOption( "logfile" );
+    opt.setOption( "snapname");
+    opt.setOption( "snapiter");
+    opt.setOption( "rmdist");
+    opt.setOption( "valueadd");
+    opt.setOption( "reducebodies");
+#ifdef USE_DUST
+    opt.setOption( "reducedust");
+#endif /* USE_DUST */
+#if ENABLE_LOG
+    opt.setFlag("log");
+    opt.setFlag("prepend-rank");
+#endif
+    opt.setFlag("direct");
+#ifdef USE_OPENGL
+    opt.setOption( "fullscreen");
+    opt.setOption( "Tglow");
+    opt.setOption( "dTglow");
+    opt.setFlag("displayfps");
+    opt.setFlag("stereo");
+#endif
+
+    opt.processCommandArgs( argc, argv );
+
+
+    if( ! opt.hasOptions()) { /* print usage if no options */
+      opt.printUsage();
+      exit(0);
+    }
+
+    if( opt.getFlag( "help" ) || opt.getFlag( 'h' ) ) 
+    {
+      opt.printUsage();
+      exit(0);
+    }
+
+    if (opt.getFlag("direct")) direct = true;
+    if (opt.getFlag("displayfps")) displayFPS = true;
+    if (opt.getFlag("diskmode")) diskmode = true;
+    if(opt.getFlag("stereo"))   stereo = true;
+
+#if ENABLE_LOG
+    if (opt.getFlag("log"))           ENABLE_RUNTIME_LOG = true;
+    if (opt.getFlag("prepend-rank"))  PREPEND_RANK       = true;
+#endif    
+    char *optarg = NULL;
+    if ((optarg = opt.getValue("infile")))       fileName           = string(optarg);
+    if ((optarg = opt.getValue("plummer")))      nPlummer           = atoi(optarg);
+    if ((optarg = opt.getValue("milkyway")))     nMilkyWay          = atoi(optarg);
+    if ((optarg = opt.getValue("mwfork")))       nMWfork            = atoi(optarg);
+    if ((optarg = opt.getValue("sphere")))       nSphere            = atoi(optarg);
+    if ((optarg = opt.getValue("logfile")))      logFileName        = string(optarg);
+    if ((optarg = opt.getValue("dev")))          devID              = atoi  (optarg);
+    renderDevID = devID;
+    if ((optarg = opt.getValue("renderdev")))    renderDevID        = atoi  (optarg);
+    if ((optarg = opt.getValue("dt")))           timeStep           = (float) atof  (optarg);
+    if ((optarg = opt.getValue("tend")))         tEnd               = (float) atof  (optarg);
+    if ((optarg = opt.getValue("iend")))         iterEnd            = atoi  (optarg);
+    if ((optarg = opt.getValue("eps")))          eps                = (float) atof  (optarg);
+    if ((optarg = opt.getValue("theta")))        theta              = (float) atof  (optarg);
+    if ((optarg = opt.getValue("snapname")))     snapshotFile       = string(optarg);
+    if ((optarg = opt.getValue("snapiter")))     snapshotIter       = (float) atof  (optarg);
+    if ((optarg = opt.getValue("rmdist")))       remoDistance       = (float) atof  (optarg);
+    if ((optarg = opt.getValue("valueadd")))     snapShotAdd        = atoi  (optarg);
+    if ((optarg = opt.getValue("rebuild")))      rebuild_tree_rate  = atoi  (optarg);
+    if ((optarg = opt.getValue("reducebodies"))) reduce_bodies_factor = atoi  (optarg);
+    if ((optarg = opt.getValue("reducedust")))	 reduce_dust_factor = atoi  (optarg);
+#if USE_OPENGL
+    if ((optarg = opt.getValue("fullscreen")))	 fullScreenMode     = string(optarg);
+    if ((optarg = opt.getValue("Tglow")))	 TstartGlow  = (float)atof(optarg);
+    if ((optarg = opt.getValue("dTglow")))	 dTstartGlow  = (float)atof(optarg);
+    dTstartGlow = std::max(dTstartGlow, 1.0f);
+#endif
+    if (fileName.empty() && nPlummer == -1 && nSphere == -1 && nMilkyWay == -1)
+    {
+      opt.printUsage();
+      exit(0);
+    }
+
+#undef ADDUSAGE
+  }
+#endif
+  /************** end - command line arguments ********/
+
+
+  int NTotal, NFirst, NSecond, NThird;
+  NTotal = NFirst = NSecond = NThird = 0;
+
+#ifdef USE_OPENGL
+  // create OpenGL context first, and register for interop
+  initGL(argc, argv, fullScreenMode.c_str(), stereo);
+  cudaGLSetGLDevice(devID);
+#endif
+
+  initTimers();
+
+  int pid = -1;
+#ifdef WIN32
+  pid = _getpid();
+#else
+  pid = (int)getpid();
+#endif
+  //Used for profiler, note this has to be done before initing to
+  //octree otherwise it has no effect...Therefore use pid instead of mpi procId
+  char *gpu_prof_log;
+  gpu_prof_log=getenv("CUDA_PROFILE_LOG");
+  if(gpu_prof_log){
+    char tmp[50];
+    sprintf(tmp,"process_%d_%s",pid,gpu_prof_log);
+#ifdef WIN32
+    //        SetEnvironmentVariable("CUDA_PROFILE_LOG", tmp);
+#else
+    //        setenv("CUDA_PROFILE_LOG",tmp,1);
+    LOGF(stderr, "TESTING log on proc: %d val: %s \n", pid, tmp);
+#endif
+  }
+
+
+  //Creat the octree class and set the properties
+  octree *tree = new octree(argv, devID, theta, eps, snapshotFile, snapshotIter,  timeStep, tEnd, iterEnd, (int)remoDistance, snapShotAdd, rebuild_tree_rate, direct);
+
+  double tStartup = tree->get_time();
+
+  //Get parallel processing information  
+  int procId = tree->mpiGetRank();
+  int nProcs = tree->mpiGetNProcs();
+
+  if (procId == 0)
+  {
+    //NOte cant use LOGF here since MPI isnt initialized yet
+    cerr << "[INIT]\tUsed settings: \n";
+    cerr << "[INIT]\tInput filename " << fileName << endl;
+    cerr << "[INIT]\tLog filename " << logFileName << endl;
+    cerr << "[INIT]\tTheta: \t\t"             << theta        << "\t\teps: \t\t"          << eps << endl;
+    cerr << "[INIT]\tTimestep: \t"          << timeStep     << "\t\ttEnd: \t\t"         << tEnd << endl;
+    cerr << "[INIT]\titerEnd: \t" << iterEnd << endl;
+    cerr << "[INIT]\tsnapshotFile: \t"      << snapshotFile << "\tsnapshotIter: \t" << snapshotIter << endl;
+    cerr << "[INIT]\tInput file: \t"        << fileName     << "\t\tdevID: \t\t"        << devID << endl;
+    cerr << "[INIT]\tRemove dist: \t"   << remoDistance << endl;
+    cerr << "[INIT]\tSnapshot Addition: \t"  << snapShotAdd << endl;
+    cerr << "[INIT]\tRebuild tree every " << rebuild_tree_rate << " timestep\n";
+
+
+    if( reduce_bodies_factor > 1 )
+      cout << "[INIT]\tReduce number of non-dust bodies by " << reduce_bodies_factor << " \n";
+    if( reduce_dust_factor > 1 )
+      cout << "[INIT]\tReduce number of dust bodies by " << reduce_dust_factor << " \n";
+
+#if ENABLE_LOG
+    if (ENABLE_RUNTIME_LOG)
+      cerr << "[INIT]\tRuntime logging is ENABLED \n";
+    else
+      cerr << "[INIT]\tRuntime logging is DISABLED \n";
+#endif
+    cerr << "[INIT]\tDirect gravitation is " << (direct ? "ENABLED" : "DISABLED") << endl;
+#if USE_OPENGL
+    cerr << "[INIT]\tTglow = " << TstartGlow << endl;
+    cerr << "[INIT]\tdTglow = " << dTstartGlow << endl;
+    cerr << "[INIT]\tstereo = " << stereo << endl;
+#endif
+#ifdef USE_MPI                
+    cerr << "[INIT]\tCode is built WITH MPI Support \n";
+#else
+    cerr << "[INIT]\tCode is built WITHOUT MPI Support \n";
+#endif
+  }
+
+#ifdef USE_MPI
+#if 1
+  omp_set_num_threads(16);
+#pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+    cpu_set_t cpuset;
+    CPU_ZERO(&cpuset);
+    pthread_getaffinity_np(pthread_self()  , sizeof( cpu_set_t ), &cpuset );
+
+
+    int num_cores = sysconf(_SC_NPROCESSORS_ONLN);
+
+    int i, set=-1;
+    for (i = 0; i < CPU_SETSIZE; i++)
+      if (CPU_ISSET(i, &cpuset))
+        set = i;
+    //    fprintf(stderr,"[Proc: %d ] Thread %d bound to: %d Total cores: %d\n",
+    //        procId, tid,  set, num_cores);
+  }
+#endif
+
+
+
+
+#if 0
+  omp_set_num_threads(4);
+  //default
+  // int cpulist[] = {0,1,2,3,8,9,10,11};
+  int cpulist[] = {0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15}; //HA-PACS
+  //int cpulist[] = {0,1,2,3,4,5,6,7};
+  //int cpulist[] = {0,2,4,6, 8,10,12,14};
+  //int cpulist[] = {1,3,5,7, 9,11,13,15};
+  //int cpulist[] = {1,9,5,11, 3,7,13,15};
+  //int cpulist[] = {1,15,3,13, 2,4,6,8};
+  //int cpulist[] = {1,1,1,1, 1,1,1,1};
+
+
+#pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+    //int core_id = procId*4+tid;
+    int core_id = (procId%4)*4+tid;
+    core_id = cpulist[core_id];
+
+    int num_cores = sysconf(_SC_NPROCESSORS_ONLN);
+    //    if (core_id >= num_cores)
+
+    cpu_set_t cpuset;
+    CPU_ZERO(&cpuset);
+    CPU_SET(core_id, &cpuset);
+    pthread_t current_thread = pthread_self();
+    int return_val = pthread_setaffinity_np(current_thread, sizeof(cpu_set_t), &cpuset);
+
+    CPU_ZERO(&cpuset);
+    pthread_getaffinity_np(pthread_self()  , sizeof( cpu_set_t ), &cpuset );
+
+    int i, set=-1;
+    for (i = 0; i < CPU_SETSIZE; i++)
+      if (CPU_ISSET(i, &cpuset))
+        set = i;
+    //printf("CPU2: CPU %d\n", i);
+
+
+    fprintf(stderr,"Binding thread: %d of rank: %d to cpu: %d CHECK: %d Total cores: %d\n",
+        tid, procId, core_id, set, num_cores);
+
+  }
+#endif
+#endif
+
+
+
+
+
+#if ENABLE_LOG
+#ifdef USE_MPI
+  PREPEND_RANK_PROCID = procId;
+  PREPEND_RANK_NPROCS = nProcs;
+#endif
+#endif
+
+
+  if(nProcs > 1)
+  {
+    logFileName.append("-");
+
+    char buff[16];
+    sprintf(buff,"%d-%d", nProcs, procId);
+    logFileName.append(buff);
+  }
+
+  ofstream logFile(logFileName.c_str());
+
+  tree->set_context(logFile, false); //Do logging to file and enable timing (false = enabled)
+
+  if (nPlummer == -1 && nSphere == -1 && !diskmode && nMilkyWay == -1)
+  {
+    if(procId == 0)
+    {
+#ifdef TIPSYOUTPUT
+      read_tipsy_file_parallel(bodyPositions, bodyVelocities, bodyIDs, eps, fileName, 
+          procId, nProcs, NTotal, NFirst, NSecond, NThird, tree,
+          dustPositions, dustVelocities, dustIDs, reduce_bodies_factor, reduce_dust_factor);    
+
+      //      read_generate_cube(bodyPositions, bodyVelocities, bodyIDs, eps, fileName, 
+      //                               procId, nProcs, NTotal, NFirst, NSecond, NThird, tree,
+      //                              dustPositions, dustVelocities, dustIDs, reduce_bodies_factor, reduce_dust_factor);    
+
+#else
+      read_dumbp_file_parallel(bodyPositions, bodyVelocities, bodyIDs, eps, fileName, procId, nProcs, NTotal, NFirst, NSecond, NThird, tree, reduce_bodies_factor);
+#endif
+    }
+    else
+    {
+      tree->ICRecv(0, bodyPositions, bodyVelocities,  bodyIDs);
+    }
+  }
+  else if(nMilkyWay >= 0)
+  {
+#ifdef GALACTICS
+    if (procId == 0) printf("Using MilkyWay model with n= %d per proc, forked %d times \n", nMilkyWay, nMWfork);
+    assert(nMilkyWay > 0);
+    assert(nMWfork > 0);
+ 
+
+#if 0 /* in this setup all particles will be of equal mass (exact number are galactic-depednant)  */
+    const float fdisk  = 15.1; 
+    const float fbulge = 5.1;   
+    const float fhalo  = 242.31; 
+#else  /* here, bulge & mw particles have the same mass, but halo particles is 32x heavier */
+    const float fdisk  = 15.1; 
+    const float fbulge = 5.1; 
+    const float fhalo  = 7.5; 
+#endif
+
+    const float fsum = fdisk + fhalo + fbulge;
+
+    const int ndisk  = (int)(nMilkyWay * fdisk/fsum);
+    const int nbulge = (int)(nMilkyWay * fbulge/fsum);
+    const int nhalo  = (int)(nMilkyWay * fhalo/fsum);
+
+    assert(ndisk  > 0);
+    assert(nbulge > 0);
+    assert(nhalo  > 0);
+
+    const double t0 = tree->get_time();
+    const Galactics g(procId, nProcs, ndisk, nbulge, nhalo, nMWfork);
+    const double dt = tree->get_time() - t0;
+    if (procId == 0)
+      printf("  ndisk= %d  nbulge= %d  nhalo= %d :: ntotal= %d in %g sec\n",
+          g.get_ndisk(), g.get_nbulge(), g.get_nhalo(), g.get_ntot(), dt);
+
+    const int ntot = g.get_ntot();
+    bodyPositions.resize(ntot);
+    bodyVelocities.resize(ntot);
+    bodyIDs.resize(ntot);
+    for (int i= 0; i < ntot; i++)
+    {
+      assert(!std::isnan(g[i].x));
+      assert(!std::isnan(g[i].y));
+      assert(!std::isnan(g[i].z));
+      assert(g[i].mass > 0.0);
+      bodyIDs[i] = g[i].id;
+
+      bodyPositions[i].x = g[i].x;
+      bodyPositions[i].y = g[i].y;
+      bodyPositions[i].z = g[i].z;
+      bodyPositions[i].w = g[i].mass * 1.0/(double)nProcs;
+      
+      assert(!std::isnan(g[i].vx));
+      assert(!std::isnan(g[i].vy));
+      assert(!std::isnan(g[i].vz));
+
+      bodyVelocities[i].x = g[i].vx;
+      bodyVelocities[i].y = g[i].vy;
+      bodyVelocities[i].z = g[i].vz;
+      bodyVelocities[i].w = 0.0;
+    }
+#else
+    assert(0);
+#endif
+  }
+  else if(nPlummer >= 0)
+  {
+    if (procId == 0) printf("Using plummer model with n= %d per proc \n", nPlummer);
+    assert(nPlummer > 0);
+    const int seed = 19810614 + procId;
+    const Plummer m(nPlummer, procId, seed);
+    bodyPositions.resize(nPlummer);
+    bodyVelocities.resize(nPlummer);
+    bodyIDs.resize(nPlummer);
+    for (int i= 0; i < nPlummer; i++)
+    {
+
+      assert(!std::isnan(m.pos[i].x));
+      assert(!std::isnan(m.pos[i].y));
+      assert(!std::isnan(m.pos[i].z));
+      assert(m.mass[i] > 0.0);
+      bodyIDs[i]   = nPlummer*procId + i;
+
+      bodyPositions[i].x = m.pos[i].x;
+      bodyPositions[i].y = m.pos[i].y;
+      bodyPositions[i].z = m.pos[i].z;
+      bodyPositions[i].w = m.mass[i] * 1.0/nProcs;
+
+      bodyVelocities[i].x = m.vel[i].x;
+      bodyVelocities[i].y = m.vel[i].y;
+      bodyVelocities[i].z = m.vel[i].z;
+      bodyVelocities[i].w = 0;
+    }
+  }
+  else if (nSphere >= 0)
+  {
+    //Sphere
+    if (procId == 0) printf("Using Spherical model with n= %d per proc \n", nSphere);
+    assert(nSphere >= 0);
+    bodyPositions.resize(nSphere);
+    bodyVelocities.resize(nSphere);
+    bodyIDs.resize(nSphere);
+
+    srand48(procId+19840501);
+
+    /* generate uniform sphere */
+    int np = 0;
+    while (np < nSphere)
+    {
+      const double x = 2.0*drand48()-1.0;
+      const double y = 2.0*drand48()-1.0;
+      const double z = 2.0*drand48()-1.0;
+      const double r2 = x*x+y*y+z*z;
+      if (r2 < 1)
+      {
+        bodyIDs[np]   = nSphere*procId + np;
+
+        bodyPositions[np].x = x;
+        bodyPositions[np].y = y;
+        bodyPositions[np].z = z;
+        bodyPositions[np].w = (1.0/nSphere) * 1.0/nProcs;
+
+        bodyVelocities[np].x = 0;
+        bodyVelocities[np].y = 0;
+        bodyVelocities[np].z = 0;
+        bodyVelocities[np].w = 0;
+        np++;
+      }//if
+    }//while
+  }//else
+  else if (diskmode)
+  {
+    if (procId == 0) printf("Using diskmode with filename %s\n", fileName.c_str());
+    const int seed = procId+19840501;
+    srand48(seed);
+    const DiskShuffle disk(fileName);
+    const int np = disk.get_ntot();
+    bodyPositions.resize(np);
+    bodyVelocities.resize(np);
+    bodyIDs.resize(np);
+    for (int i= 0; i < np; i++)
+    {
+      bodyIDs[i]   = np*procId + i;
+
+      bodyPositions[i].x = disk.pos(i).x;
+      bodyPositions[i].y = disk.pos (i).y;
+      bodyPositions[i].z = disk.pos (i).z;
+      bodyPositions[i].w = disk.mass(i) * 1.0/nProcs;
+
+      bodyVelocities[i].x = disk.vel(i).x;
+      bodyVelocities[i].y = disk.vel(i).y;
+      bodyVelocities[i].z = disk.vel(i).z;
+      bodyVelocities[i].w = 0;
+    }
+  }
+  else
+    assert(0);
+
+  tree->mpiSync();
+
+
+#ifdef TIPSYOUTPUT
+  LOGF(stderr, " t_current = %g\n", tree->get_t_current());
+#endif
+
+
+  //#define SETUP_MERGER
+#ifdef SETUP_MERGER
+  vector<real4> bodyPositions2;
+  vector<real4> bodyVelocities2;
+  vector<int>   bodyIDs2;  
+
+  bodyPositions2.insert(bodyPositions2.begin(),   bodyPositions.begin(),  bodyPositions.end());
+  bodyVelocities2.insert(bodyVelocities2.begin(), bodyVelocities.begin(), bodyVelocities.end());
+  bodyIDs2.insert(bodyIDs2.begin(), bodyIDs.begin(), bodyIDs.end());
+
+
+  setupMergerModel(bodyPositions,  bodyVelocities,  bodyIDs,
+      bodyPositions2, bodyVelocities2, bodyIDs2);
+
+  NTotal *= 2;
+  NFirst *= 2;
+  NSecond *= 2;
+  NThird *= 2;
+#endif
+
+
+  //Set the properties of the data set, it only is really used by process 0, which does the 
+  //actual file I/O  
+  tree->setDataSetProperties(NTotal, NFirst, NSecond, NThird);
+
+  if(procId == 0)  
+    LOG("Dataset particle information: Ntotal: %d\tNFirst: %d\tNSecond: %d\tNThird: %d \n",
+        NTotal, NFirst, NSecond, NThird);
+
+
+  //Sanity check for standard plummer spheres
+  double mass = 0, totalMass;
+  for(unsigned int i=0; i < bodyPositions.size(); i++)
+  {
+    mass += bodyPositions[i].w;
+  }
+
+  tree->load_kernels();
+
+#ifdef USE_MPI
+  MPI_Reduce(&mass,&totalMass,1, MPI_DOUBLE, MPI_SUM,0, MPI_COMM_WORLD);
+#else
+  totalMass = mass;
+#endif
+
+  if(procId == 0)   LOGF(stderr, "Combined Mass: %f \tNTotal: %d \n", totalMass, NTotal);
+
+  LOG("Starting! Bootup time: %lg \n", tree->get_time()-tStartup);
+
+
+  double t0 = tree->get_time();
+
+  tree->localTree.setN((int)bodyPositions.size());
+  tree->allocateParticleMemory(tree->localTree);
+
+  //Load data onto the device
+  for(uint i=0; i < bodyPositions.size(); i++)
+  {
+    tree->localTree.bodies_pos[i] = bodyPositions[i];
+    tree->localTree.bodies_vel[i] = bodyVelocities[i];
+    tree->localTree.bodies_ids[i] = bodyIDs[i];
+
+    tree->localTree.bodies_Ppos[i] = bodyPositions[i];
+    tree->localTree.bodies_Pvel[i] = bodyVelocities[i];
+    tree->localTree.bodies_time[i] = make_float2(tree->get_t_current(), tree->get_t_current());
+  }
+
+  tree->localTree.bodies_time.h2d();
+  tree->localTree.bodies_pos.h2d();
+  tree->localTree.bodies_vel.h2d();
+  tree->localTree.bodies_Ppos.h2d();
+  tree->localTree.bodies_Pvel.h2d();
+  tree->localTree.bodies_ids.h2d();
+
+  //fprintf(stderr,"Send data to device proc: %d \n", procId);
+  //  tree->devContext.writeLogEvent("Send data to device\n");
+
+
+
+#ifdef USE_MPI
+  //Use sampling particles, determine frequency
+  tree->mpiSumParticleCount(tree->localTree.n); //Determine initial frequency
+#endif
+
+
+  //If required set the dust particles
+#ifdef USE_DUST
+  if( (int)dustPositions.size() > 0)
+  {
+    LOGF(stderr, "Allocating dust properties for %d dust particles \n",
+        (int)dustPositions.size());   
+    tree->localTree.setNDust((int)dustPositions.size());
+    tree->allocateDustMemory(tree->localTree);
+
+    //Load dust data onto the device
+    for(uint i=0; i < dustPositions.size(); i++)
+    {
+      tree->localTree.dust_pos[i] = dustPositions[i];
+      tree->localTree.dust_vel[i] = dustVelocities[i];
+      tree->localTree.dust_ids[i] = dustIDs[i];
+    }
+
+    tree->localTree.dust_pos.h2d();
+    tree->localTree.dust_vel.h2d();
+    tree->localTree.dust_ids.h2d();    
+  }
+#endif //ifdef USE_DUST
+
+
+#ifdef USE_MPI
+  //Startup the OMP threads
+  omp_set_num_threads(4);
+#endif
+
+
+  //Start the integration
+#ifdef USE_OPENGL
+  octree::IterationData idata;
+  initAppRenderer(argc, argv, tree, idata, displayFPS, stereo);
+  LOG("Finished!!! Took in total: %lg sec\n", tree->get_time()-t0);
+#else
+  tree->mpiSync();
+  if (procId==0)
+    fprintf(stderr, " Starting iterating\n");
+  tree->mpiSync();
+  tree->iterate(); 
+
+  LOG("Finished!!! Took in total: %lg sec\n", tree->get_time()-t0);
+
+  logFile.close();
+
+#ifdef USE_MPI
+  MPI_Finalize();
+#endif
+
+  if(tree->procId == 0)
+  {
+    LOGF(stderr, "TOTAL:   Time spent between the start of 'iterate' and the final time-step (very first step is not accounted)\n",0);
+    LOGF(stderr, "Grav:    Time spent to compute gravity, including communication (wall-clock time)\n",0);
+    LOGF(stderr, "GPUgrav: Time spent ON the GPU to compute local and LET gravity\n",0);
+    LOGF(stderr, "LET Com: Time spent in exchanging and building LET data\n",0);
+    LOGF(stderr, "Build:   Time spent in constructing the tree (incl sorting, making groups, etc.)\n",0);
+    LOGF(stderr, "Domain:  Time spent in computing new domain decomposition and exchanging particles between nodes.\n",0);
+    LOGF(stderr, "Wait:    Time spent in waiting on other processes after the gravity part.\n",0);
+  }
+
+
+  delete tree;
+  tree = NULL;
+#endif
+
+  displayTimers();
+  return 0;
+}
diff --git a/ptxtools/ptx.ll b/ptxtools/ptx.ll
new file mode 100644
index 00000000..0c3a0fd4
--- /dev/null
+++ b/ptxtools/ptx.ll
@@ -0,0 +1,79 @@
+%option yylineno
+%option noyywrap
+%option yyclass="parser::PTXLexer"
+%option prefix="ptx"
+%option c++
+
+%{
+#include "PTXLexer.h"
+#include <cassert>
+#include <sstream>
+#include <cstring>
+#ifdef LLSETTOKEN
+#error "TOKEN is defined"
+#endif
+#define LLSETTOKEN(tok) yylval->ivalue = tok; return tok;
+%}
+
+COMMENT ("//"[^\n]*)
+TAB [\t]*
+
+%%
+{COMMENT}       {nextColumn += strlen(yytext); /* lCppComment(&yylloc); */ }
+".version"      { return TOKEN_VERSION; }
+".target"       { return TOKEN_TARGET; }
+".address_size" { return TOKEN_ADDRESS_SIZE; }
+".func"         { return TOKEN_FUNC; }
+".entry"        { return TOKEN_ENTRY; }
+".align"        { return TOKEN_ALIGN; }
+".visible"      { return TOKEN_VISIBLE; }
+".global"       { return TOKEN_GLOBAL; }
+".param"        { return TOKEN_PARAM; }
+".b0"           { LLSETTOKEN( TOKEN_B32);}   /* fix for buggy llvm-ptx generator */
+".b8"           { LLSETTOKEN( TOKEN_B8);}
+".b16"          { LLSETTOKEN( TOKEN_B16);}
+".b32"          { LLSETTOKEN( TOKEN_B32);}
+".b64"          { LLSETTOKEN( TOKEN_B64);}
+".u8"           { LLSETTOKEN( TOKEN_U8);}
+".u16"          { LLSETTOKEN( TOKEN_U16);}
+".u32"          { LLSETTOKEN( TOKEN_U32);}
+".u64"          { LLSETTOKEN( TOKEN_U64);}
+".s8"           { LLSETTOKEN( TOKEN_S8);}
+".s16"          { LLSETTOKEN( TOKEN_S16);}
+".s32"          { LLSETTOKEN( TOKEN_S32);}
+".s64"          { LLSETTOKEN( TOKEN_S64);}
+".f32"          { LLSETTOKEN( TOKEN_F32);}
+".f64"          { LLSETTOKEN( TOKEN_F64);}
+"["             { return '[';}
+"]"             { return ']';}
+"("             { return '(';}
+")"             { return ')';}
+","             { return ',';}
+";"             { return ';';}
+"="             { return '=';}
+[0-9]+\.[0-9]+ { yylval->fvalue = atof(yytext); return TOKEN_FLOAT; }
+[0-9]+   { yylval->ivalue = atoi(yytext); return TOKEN_INT; }
+[a-zA-Z0-9_]+   { strcpy(yylval->svalue, yytext); return TOKEN_STRING;}
+\n {
+ //   yylloc.last_line++;
+//    yylloc.last_column = 1;
+    nextColumn = 1;
+}
+.              ;
+%%
+
+/** Handle a C++-style comment--eat everything up until the end of the line.
+ */
+#if 0
+static void
+lCppComment(SourcePos *pos) {
+    char c;
+    do {
+        c = yyinput();
+    } while (c != 0 && c != '\n');
+    if (c == '\n') {
+        pos->last_line++;
+        pos->last_column = 1;
+    }
+}
+#endif
diff --git a/ptxtools/ptxcc.cpp b/ptxtools/ptxcc.cpp
new file mode 100644
index 00000000..dad7452d
--- /dev/null
+++ b/ptxtools/ptxcc.cpp
@@ -0,0 +1,275 @@
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <fstream>
+#include <cassert>
+#include <algorithm>
+#include <sys/time.h>
+#include "PTXParser.h"
+
+
+/*
+ * The C++ code below is based on the following bash-script:
+      #!/bin/sh
+
+      PTXSRC=$1__tmp_ptx.ptx
+      PTXCU=$1___tmp_ptx.cu
+      PTXSH=$1___tmp_ptx.sh
+
+      NVCCPARM=${@:2}
+
+      DEPTX=dePTX
+      NVCC=nvcc
+
+      $(cat $1 | sed 's/\.b0/\.b32/g' > $PTXSRC) &&
+      $DEPTX < $PTXSRC > $PTXCU &&
+      $NVCC -arch=sm_35 -dc $NVCCPARM -dryrun $PTXCU 2>&1 | \
+        sed 's/\#\$//g'| \
+        awk '{ if ($1 == "LIBRARIES=") print $1$2; else if ($1 == "cicc") print "cp '$PTXSRC'", $NF; else print $0 }' > $PTXSH &&
+      sh $PTXSH
+
+      # rm $PTXCU $PTXSH
+ *
+ */
+
+static char lRandomAlNum()
+{
+  const char charset[] =
+    "0123456789"
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+    "abcdefghijklmnopqrstuvwxyz";
+  const size_t max_index = (sizeof(charset) - 1);
+  return charset[ rand() % max_index ];
+}
+
+static std::string lRandomString(const size_t length)
+{
+  timeval t1;
+  gettimeofday(&t1, NULL);
+  srand(t1.tv_usec * t1.tv_sec);
+  std::string str(length,0);
+  std::generate_n( str.begin(), length, lRandomAlNum);
+  return str;
+}
+
+static void lGetAllArgs(int Argc, char *Argv[], int &argc, char *argv[128]) 
+{
+  // Copy over the command line arguments (passed in)
+  for (int i = 0; i < Argc; ++i)
+    argv[i] = Argv[i];
+  argc = Argc;
+}
+const char *lGetExt (const char *fspec) 
+{
+  char *e = strrchr (fspec, '.');
+  return e;
+}
+
+static std::vector<std::string> lSplitString(const std::string &s, char delim)
+{
+  std::vector<std::string> elems;
+  std::stringstream ss(s);
+  std::string item;
+  while (std::getline(ss, item, delim)) {
+    if (!item.empty())
+      elems.push_back(item);
+  }
+  return elems;
+}
+
+static void lUsage(const int ret)
+{
+  fprintf(stdout, "\nusage: ptxcc [options] file.ptx \n");
+  fprintf(stdout, "    [--help]\t\t\t\t This help\n");
+  fprintf(stdout, "    [--verbose]\t\t\t\t Be verbose\n");
+  fprintf(stdout, "    [--arch={%s}]\t\t\t GPU target architecture\n", "sm_35");
+  fprintf(stdout, "    [-o <name>]\t\t\t\t Output file name\n");
+  fprintf(stdout, "    [-Xnvcc=<arguments>]\t\t Arguments to pass through to \"nvcc\"\n");
+  fprintf(stdout, " \n");
+  exit(ret);
+}
+
+int main(int _argc, char * _argv[])
+{
+  int argc;
+  char *argv[128];
+  lGetAllArgs(_argc, _argv, argc, argv);
+
+  std::string arch="sm_35";
+  std::string filePTX;
+  std::string fileOBJ;
+  std::string extString = ".ptx";
+  bool keepTemporaries = false;
+  bool verbose = false;
+  std::string nvccArguments;
+
+  for (int i = 1; i < argc; ++i) 
+  {
+    if (!strcmp(argv[i], "--help"))
+      lUsage(0);
+    else if (!strncmp(argv[i], "--arch=", 7))
+      arch = std::string(argv[i]+7);
+    else if (!strncmp(argv[i], "--keep-temporaries", 11))
+      keepTemporaries = true;
+    else if (!strncmp(argv[i], "--verbose", 9))
+      verbose = true;
+    else if (!strncmp(argv[i], "-Xnvcc=", 7))
+      nvccArguments = std::string(argv[i]+7);
+    else if (!strcmp(argv[i], "-o"))
+    {
+      if (++i == argc)
+      {
+        fprintf(stderr, "No output file specified after -o option.\n");
+        lUsage(1);
+      }
+      fileOBJ = std::string(argv[i]);
+    }
+    else 
+    {
+      const char * ext = strrchr(argv[i], '.');
+      if (ext == NULL)
+      {
+        fprintf(stderr, " Unknown argument: %s \n", argv[i]);
+        lUsage(1);
+      }
+      else if (strncmp(ext, extString.c_str(), 4))
+      {
+        fprintf(stderr, " Unkown extension of the input file: %s \n", ext);
+        lUsage(1);
+      }
+      else if (filePTX.empty())
+      {
+        filePTX = std::string(argv[i]);
+        if (fileOBJ.empty())
+        {
+          char * baseName = argv[i];
+          while (baseName != ext)
+            fileOBJ += std::string(baseName++,1);
+        }
+        fileOBJ += ".o";
+      }
+    }
+  }
+#if 0
+  fprintf(stderr, " fileOBJ= %s\n", fileOBJ.c_str());
+  fprintf(stderr, " arch= %s\n", arch.c_str());
+  fprintf(stderr, " file= %s\n", filePTX.empty() ? "$stdin" : filePTX.c_str());
+  fprintf(stderr, " num_args= %d\n", (int)nvccArgumentList.size());
+  for (int i= 0; i < (int)nvccArgumentList.size(); i++)
+    fprintf(stderr, " arg= %d : %s \n", i, nvccArgumentList[i].c_str());
+#endif
+  assert(arch == std::string("sm_35"));
+  if (filePTX.empty())
+  {
+    fprintf(stderr, "ptxcc fatal : No input file specified; use option --help for more information\n");
+    exit(1);
+  }
+
+	// open a file handle to a particular file:
+  std::ifstream inputPTX(filePTX.c_str());
+  if (!inputPTX)
+  {
+    fprintf(stderr, "ptxcc: error: %s: No such file\n", filePTX.c_str());
+    exit(1);
+  }
+
+  std::string randomBaseName = std::string("/tmp/") + lRandomString(8) + "_" + lSplitString(lSplitString(filePTX,'/').back(),'.')[0];
+  if (verbose)
+    fprintf(stderr, "baseFileName= %s\n", randomBaseName.c_str());
+
+  std::string fileCU= randomBaseName + ".cu";
+  std::ofstream outputCU(fileCU.c_str());
+  assert(outputCU);
+
+  std::istream &  input = inputPTX;
+  std::ostream & output = outputCU;
+  std::ostream &  error = std::cerr;
+  parser::PTXLexer lexer(&input, &error);
+  parser::PTXParser state(output);
+
+	// parse through the input until there is no more:
+  //
+
+  do {
+    ptx::yyparse(lexer, state);
+  }
+  while (!input.eof());
+
+  inputPTX.close();
+  outputCU.close();
+
+  // process output from nvcc
+  //
+  /* nvcc -dc -arch=$arch -dryrun -argumentlist fileCU */
+
+  std::string fileSH= randomBaseName + ".sh";
+
+  std::string nvccExe("nvcc");
+  std::string nvccCmd;
+  nvccCmd += nvccExe + std::string(" ");
+  nvccCmd += "-dc ";
+  nvccCmd += std::string("-arch=") + arch + std::string(" ");
+  nvccCmd += "-dryrun ";
+  nvccCmd += nvccArguments + std::string(" ");
+  nvccCmd += std::string("-o ") + fileOBJ + std::string(" ");
+  nvccCmd += fileCU + std::string(" ");
+  nvccCmd += std::string("2> ") + fileSH;
+  if (verbose)
+    fprintf(stderr , "%s\n", nvccCmd.c_str());
+  const int nvccRet = std::system(nvccCmd.c_str());
+  if (nvccRet)
+    fprintf(stderr, "FAIL: %s\n", nvccCmd.c_str());
+
+
+  std::ifstream inputSH(fileSH.c_str());
+  assert(inputSH);
+  std::vector<std::string> nvccSteps;
+  while (!inputSH.eof())
+  {
+    nvccSteps.push_back(std::string());
+    std::getline(inputSH, nvccSteps.back());
+    if (nvccRet)
+      fprintf(stderr, " %s\n", nvccSteps.back().c_str());
+  }
+  inputSH.close();
+  if (nvccRet)
+    exit(-1);
+
+
+  for (int i = 0; i < (int)nvccSteps.size(); i++)
+  {
+    std::string cmd = nvccSteps[i];
+    for (int j = 0; j < (int)cmd.size()-1; j++)
+      if (cmd[j] == '#' && cmd[j+1] == '$')
+        cmd[j] = cmd[j+1] = ' ';
+    std::vector<std::string> splitCmd = lSplitString(cmd, ' ');
+
+    if (!splitCmd.empty())
+    {
+      if (splitCmd[0] == std::string("cicc"))
+        cmd = std::string("   cp ") + filePTX + std::string(" ") + splitCmd.back();
+      if (splitCmd[0] == std::string("LIBRARIES="))
+        cmd = "";
+    }
+    nvccSteps[i] = cmd;
+    if (verbose)
+      fprintf(stderr, "%3d: %s\n", i, cmd.c_str());
+    const int ret = std::system(cmd.c_str());
+    if (ret)
+    {
+      fprintf(stderr, " Something went wrong .. \n");
+      for (int j = 0; j < i; j++)
+        fprintf(stderr, "PASS: %s\n", nvccSteps[j].c_str());
+      fprintf(stderr, "FAIL: %s\n", nvccSteps[i].c_str());
+      exit(-1);
+    }
+  }
+
+  if (!keepTemporaries)
+  {
+    /* remove temporaries */
+  }
+  
+
+	
+}
diff --git a/ptxtools/ptxgen.cpp b/ptxtools/ptxgen.cpp
new file mode 100644
index 00000000..a05124b6
--- /dev/null
+++ b/ptxtools/ptxgen.cpp
@@ -0,0 +1,408 @@
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cassert>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include <nvvm.h>
+#include <sys/stat.h>
+
+
+template<typename T>
+static std::string lValueToString(const T& value)
+{
+  std::ostringstream oss;
+  oss << value;
+  return oss.str();
+}
+
+typedef struct stat Stat;
+
+
+#define PTXGENStatus int
+enum {
+  PTXGEN_SUCCESS                    = 0x0000,
+  PTXGEN_FILE_IO_ERROR              = 0x0001,
+  PTXGEN_BAD_ALLOC_ERROR            = 0x0002,
+  PTXGEN_LIBNVVM_COMPILATION_ERROR  = 0x0004,
+  PTXGEN_LIBNVVM_ERROR              = 0x0008,
+  PTXGEN_INVALID_USAGE              = 0x0010,
+  PTXGEN_LIBNVVM_HOME_UNDEFINED     = 0x0020,
+  PTXGEN_LIBNVVM_VERIFICATION_ERROR = 0x0040
+};
+
+static PTXGENStatus getLibDeviceName(const int computeArch, std::string &libDeviceName)
+{
+  const char *env = getenv("LIBNVVM_HOME");
+#ifdef LIBNVVM_HOME
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+  const std::string libnvvmPath(env ? env : TOSTRING(LIBNVVM_HOME));
+#undef TOSTRING
+#undef STRINGIFY
+#else
+  const std::string libnvvmPath(env);
+#endif
+
+  if (libnvvmPath.empty())
+  {
+    fprintf(stderr, "The environment variable LIBNVVM_HOME is undefined\n");
+    return PTXGEN_LIBNVVM_HOME_UNDEFINED;
+  }
+
+  /* Use libdevice for compute_20, if the target is not compute_20, compute_30,
+   * or compute_35. */
+  const std::string libdevice = 
+    std::string("/libdevice/libdevice.compute_") +
+    lValueToString(computeArch)+ "." +
+    lValueToString(LIBDEVICE_MAJOR_VERSION) + 
+    lValueToString(LIBDEVICE_MINOR_VERSION) +
+    ".bc";
+
+  libDeviceName = libnvvmPath + libdevice;
+
+  return PTXGEN_SUCCESS;
+}
+
+static PTXGENStatus addFileToProgram(const std::string &filename, nvvmProgram prog)
+{
+  char        *buffer;
+  size_t       size;
+  Stat         fileStat;
+
+  /* Open the input file. */
+  FILE *f = fopen(filename.c_str(), "rb");
+  if (f == NULL) {
+    fprintf(stderr, "Failed to open %s\n", filename.c_str());
+    return PTXGEN_FILE_IO_ERROR;
+  }
+
+  /* Allocate buffer for the input. */
+  fstat(fileno(f), &fileStat);
+  buffer = (char *) malloc(fileStat.st_size);
+  if (buffer == NULL) {
+    fprintf(stderr, "Failed to allocate memory\n");
+    return PTXGEN_BAD_ALLOC_ERROR;
+  }
+  size = fread(buffer, 1, fileStat.st_size, f);
+  if (ferror(f)) {
+    fprintf(stderr, "Failed to read %s\n", filename.c_str());
+    fclose(f);
+    free(buffer);
+    return PTXGEN_FILE_IO_ERROR;
+  }
+  fclose(f);
+
+  if (nvvmAddModuleToProgram(prog, buffer, size, filename.c_str()) != NVVM_SUCCESS) {
+    fprintf(stderr,
+            "Failed to add the module %s to the compilation unit\n",
+            filename.c_str());
+    free(buffer);
+    return PTXGEN_LIBNVVM_ERROR;
+  }
+
+  free(buffer);
+  return PTXGEN_SUCCESS;
+}
+
+static PTXGENStatus generatePTX(
+    std::vector<std::string> nvvmOptions, 
+    std::vector<std::string> nvvmFiles, 
+    std::ostream &out,
+    const int computeArch)
+{
+  nvvmProgram prog;
+  PTXGENStatus status;
+
+  /* Create the compiliation unit. */
+  if (nvvmCreateProgram(&prog) != NVVM_SUCCESS) 
+  {
+    fprintf(stderr, "Failed to create the compilation unit\n");
+    return PTXGEN_LIBNVVM_ERROR;
+  }
+  
+
+  /* Add libdevice. */
+  std::string libDeviceName;
+  status = getLibDeviceName(computeArch, libDeviceName);
+  if (status != PTXGEN_SUCCESS) 
+  {
+    nvvmDestroyProgram(&prog);
+    return status;
+  }
+  status = addFileToProgram(libDeviceName, prog);
+  if (status != PTXGEN_SUCCESS) 
+  {
+    fprintf(stderr, "Please double-check LIBNVVM_HOME environmental variable.\n");
+    nvvmDestroyProgram(&prog);
+    return status;
+  }
+
+  /* Add the module to the compilation unit. */
+  for (int i = 0; i < (int)nvvmFiles.size(); ++i) 
+  {
+    status = addFileToProgram(nvvmFiles[i], prog);
+    if (status != PTXGEN_SUCCESS) 
+    {
+      nvvmDestroyProgram(&prog);
+      return status;
+    }
+  }
+
+  const int numOptions = nvvmOptions.size();
+  std::vector<const char*> options(numOptions);
+  for (int i = 0; i < numOptions; i++)
+    options[i] = nvvmOptions[i].c_str();
+
+  /* Verify the compilation unit. */
+  if (nvvmVerifyProgram(prog, numOptions, &options[0]) != NVVM_SUCCESS) 
+  {
+    fprintf(stderr, "Failed to verify the compilation unit\n");
+    status |= PTXGEN_LIBNVVM_VERIFICATION_ERROR;
+  }
+
+  /* Print warnings and errors. */
+  {
+    size_t logSize;
+    if (nvvmGetProgramLogSize(prog, &logSize) != NVVM_SUCCESS) 
+    {
+      fprintf(stderr, "Failed to get the compilation log size\n");
+      status |= PTXGEN_LIBNVVM_ERROR;
+    } 
+    else 
+    {
+      std::string log(logSize,0);
+      if (nvvmGetProgramLog(prog, &log[0]) != NVVM_SUCCESS) 
+      {
+        fprintf(stderr, "Failed to get the compilation log\n");
+        status |= PTXGEN_LIBNVVM_ERROR;
+      } 
+      else 
+      {
+        fprintf(stderr, "%s\n", log.c_str());
+      }
+    }
+  }
+
+  if (status & PTXGEN_LIBNVVM_VERIFICATION_ERROR) 
+  {
+    nvvmDestroyProgram(&prog);
+    return status;
+  }
+  
+  /* Compile the compilation unit. */
+  if (nvvmCompileProgram(prog, numOptions, &options[0]) != NVVM_SUCCESS) 
+  {
+    fprintf(stderr, "Failed to generate PTX from the compilation unit\n");
+    status |= PTXGEN_LIBNVVM_COMPILATION_ERROR;
+  } 
+  else 
+  {
+    size_t ptxSize;
+    if (nvvmGetCompiledResultSize(prog, &ptxSize) != NVVM_SUCCESS) 
+    {
+      fprintf(stderr, "Failed to get the PTX output size\n");
+      status |= PTXGEN_LIBNVVM_ERROR;
+    } 
+    else 
+    {
+      std::string ptx(ptxSize,0);
+      if (nvvmGetCompiledResult(prog, &ptx[0]) != NVVM_SUCCESS) 
+      {
+        fprintf(stderr, "Failed to get the PTX output\n");
+        status |= PTXGEN_LIBNVVM_ERROR;
+      } 
+      else 
+      {
+        out << ptx;
+      }
+    }
+  }
+
+  /* Print warnings and errors. */
+  {
+    size_t logSize;
+    if (nvvmGetProgramLogSize(prog, &logSize) != NVVM_SUCCESS) 
+    {
+      fprintf(stderr, "Failed to get the compilation log size\n");
+      status |= PTXGEN_LIBNVVM_ERROR;
+    } 
+    else 
+    {
+      std::string log(logSize,0);
+      if (nvvmGetProgramLog(prog, &log[0]) != NVVM_SUCCESS) 
+      {
+        fprintf(stderr, "Failed to get the compilation log\n");
+        status |= PTXGEN_LIBNVVM_ERROR;
+      } 
+      else 
+      {
+        fprintf(stderr, "%s\n", log.c_str());
+      }
+    }
+  }
+
+  /* Release the resources. */
+  nvvmDestroyProgram(&prog);
+
+  return PTXGEN_SUCCESS;
+}
+
+static void showUsage()
+{
+  fprintf(stderr,"Usage: ptxgen [OPTION]... [FILE]...\n"
+                 "  [FILE] could be a .bc file or a .ll file\n");
+}
+
+static void lUsage(const int ret)
+{
+  fprintf(stdout, "\nusage: ptxgen [options] file.[ll,bc] \n");
+  fprintf(stdout, "    [--help]\t\t This help\n");
+  fprintf(stdout, "    [--verbose]\t\t Be verbose\n");
+  fprintf(stdout, "    [--arch={%s}]\t GPU target architecture\n", "sm_35");
+  fprintf(stdout, "    [-o <name>]\t\t Output file name\n");
+  fprintf(stdout, "    [-g]\t\t Enable generation of debuggin information \n");
+  fprintf(stdout, "    [--opt=]\t\t Optimization parameters \n");
+  fprintf(stdout, "     \t\t\t    0 - disable optimizations \n");
+  fprintf(stdout, "     \t\t\t    3 - defalt, enable optimizations \n");
+  fprintf(stdout, "    [--ftz=]\t\t Flush-to-zero mode when performsing single-precision floating-point operations\n");
+  fprintf(stdout, "     \t\t\t    0 - default, preserve denormal values\n");
+  fprintf(stdout, "     \t\t\t    1 - flush denormal values to zero\n");
+  fprintf(stdout, "    [--prec-sqrt=]\t Precision mode for single-precision floating-point square root\n");
+  fprintf(stdout, "     \t\t\t    0 - use a faster approximation\n");
+  fprintf(stdout, "     \t\t\t    1 - default, use IEEE round-to-nearest mode\n");
+  fprintf(stdout, "    [--prec-div=]\t Precision mode for single-precision floating-point division and reciprocals\n");
+  fprintf(stdout, "     \t\t\t    0 - use a faster approximation\n");
+  fprintf(stdout, "     \t\t\t    1 - default, use IEEE round-to-nearest mode\n");
+  fprintf(stdout, "    [--fma=]\t\t FMA contraction mode \n");
+  fprintf(stdout, "     \t\t\t    0 - disable\n");
+  fprintf(stdout, "     \t\t\t    1 - default, enable\n");
+  fprintf(stdout, "    [--use_fast_math]\t Make use of fast maih. Implies --ftz=1 --prec-div=0 --prec-sqrt=0\n");
+  fprintf(stdout, " \n");
+  exit(ret);
+}
+
+int main(int argc, char *argv[])
+{
+  int _opt      = 3;
+  int _ftz      = 0;
+  int _precSqrt = 1;
+  int _precDiv  = 1;
+  int _fma      = 1;
+  bool _useFastMath = false;
+  bool _debug       = false;
+  bool _verbose     = false;
+  std::string _arch = "sm_35";
+  std::string fileIR, filePTX;
+
+  for (int i = 1; i < argc; ++i) 
+  {
+    if (!strcmp(argv[i], "--help"))
+      lUsage(0);
+    else if (!strncmp(argv[i], "--arch=", 7))
+      _arch = std::string(argv[i]+7);
+    else if (!strncmp(argv[i], "-g", 2))
+      _debug = true;
+    else if (!strncmp(argv[i], "--verbose", 9))
+      _verbose = true;
+    else if (!strncmp(argv[i], "--opt=", 6))
+      _opt = atoi(argv[i]+6);
+    else if (!strncmp(argv[i], "--ftz=", 6))
+      _ftz = atoi(argv[i]+6);
+    else if (!strncmp(argv[i], "--prec-sqrt=", 12))
+      _precSqrt = atoi(argv[i]+12);
+    else if (!strncmp(argv[i], "--prec-div=", 11))
+      _precDiv = atoi(argv[i]+11);
+    else if (!strncmp(argv[i], "--fma=", 6))
+      _fma = atoi(argv[i]+6);
+    else if (!strncmp(argv[i], "--use_fast_math", 15))
+      _useFastMath = true;
+    else if (!strcmp(argv[i], "-o"))
+    {
+      if (++i == argc)
+      {
+        fprintf(stderr, "No output file specified after -o option.\n");
+        lUsage(1);
+      }
+      filePTX = std::string(argv[i]);
+    }
+    else 
+    {
+      const char * ext = strrchr(argv[i], '.');
+      if (ext == NULL)
+      {
+        fprintf(stderr, " Unknown argument: %s \n", argv[i]);
+        lUsage(1);
+      }
+      else if (strncmp(ext, ".ll", 3) && strncmp(ext, ".bc", 3))
+      {
+        fprintf(stderr, " Unkown extension of the input file: %s \n", ext);
+        lUsage(1);
+      }
+      else if (filePTX.empty())
+      {
+        fileIR = std::string(argv[i]);
+        if (filePTX.empty())
+        {
+          char * baseName = argv[i];
+          while (baseName != ext)
+            filePTX += std::string(baseName++,1);
+        }
+        filePTX += ".ptx";
+      }
+    }
+  }
+  
+  if (fileIR.empty())
+  {
+    fprintf(stderr, "ptxgen fatal : No input file specified; use option --help for more information\n");
+    exit(1);
+  }
+
+#if 0
+  fprintf(stderr, "fileIR= %s\n", fileIR.c_str());
+  fprintf(stderr, "filePTX= %s\n", filePTX.c_str());
+  fprintf(stderr, "arch= %s\n", _arch.c_str());
+  fprintf(stderr, "debug= %s\n", _debug ? "true" : "false");
+  fprintf(stderr, "verbose= %s\n", _verbose ? "true" : "false");
+  fprintf(stderr, "opt= %d\n", _opt);
+  fprintf(stderr, "ftz= %d\n", _ftz);
+  fprintf(stderr, "prec-sqrt= %d\n", _precSqrt);
+  fprintf(stderr, "prec-div= %d\n", _precDiv);
+  fprintf(stderr, "fma= %d\n", _fma);
+  fprintf(stderr, "use_fast_math= %s\n", _useFastMath ? "true" : "false");
+#endif
+
+  int computeArch = 35;
+  assert(_arch == std::string("sm_35"));
+
+  if (_useFastMath)
+  {
+    _ftz = 1;
+    _precSqrt = _precDiv = 0;
+  }
+
+  std::vector<std::string> nvvmOptions;
+  nvvmOptions.push_back("-arch=compute_35");
+  nvvmOptions.push_back("-ftz="       + lValueToString(_ftz));
+  nvvmOptions.push_back("-prec-sqrt=" + lValueToString(_precSqrt));
+  nvvmOptions.push_back("-prec-div="  + lValueToString(_precDiv));
+  nvvmOptions.push_back("-fma="       + lValueToString(_fma));
+  if (_debug)
+    nvvmOptions.push_back("-g");
+
+  std::vector<std::string> nvvmFiles;
+  nvvmFiles.push_back(fileIR);
+
+  std::ofstream outputPTX(filePTX.c_str());
+  assert(outputPTX);
+
+  const int ret = generatePTX(nvvmOptions, nvvmFiles, outputPTX, computeArch);
+    outputPTX.open(filePTX.c_str());
+  return ret;
+}
+
diff --git a/ptxtools/ptxgrammar.cpp b/ptxtools/ptxgrammar.cpp
new file mode 100644
index 00000000..2419e654
--- /dev/null
+++ b/ptxtools/ptxgrammar.cpp
@@ -0,0 +1,1929 @@
+
+/* A Bison parser, made by GNU Bison 2.4.1.  */
+
+/* Skeleton implementation for Bison's Yacc-like parsers in C
+   
+      Copyright (C) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004, 2005, 2006
+   Free Software Foundation, Inc.
+   
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* As a special exception, you may create a larger work that contains
+   part or all of the Bison parser skeleton and distribute that work
+   under terms of your choice, so long as that work isn't itself a
+   parser generator using the skeleton or a modified version thereof
+   as a parser skeleton.  Alternatively, if you modify or redistribute
+   the parser skeleton itself, you may (at your option) remove this
+   special exception, which will cause the skeleton and the resulting
+   Bison output files to be licensed under the GNU General Public
+   License without this special exception.
+   
+   This special exception was added by the Free Software Foundation in
+   version 2.2 of Bison.  */
+
+/* C LALR(1) parser skeleton written by Richard Stallman, by
+   simplifying the original so-called "semantic" parser.  */
+
+/* All symbols defined below should begin with yy or YY, to avoid
+   infringing on user name space.  This should be done even for local
+   variables, as they might otherwise be expanded by user macros.
+   There are some unavoidable exceptions within include files to
+   define necessary library symbols; they are noted "INFRINGES ON
+   USER NAME SPACE" below.  */
+
+/* Identify Bison output.  */
+#define YYBISON 1
+
+/* Bison version.  */
+#define YYBISON_VERSION "2.4.1"
+
+/* Skeleton name.  */
+#define YYSKELETON_NAME "yacc.c"
+
+/* Pure parsers.  */
+#define YYPURE 1
+
+/* Push parsers.  */
+#define YYPUSH 0
+
+/* Pull parsers.  */
+#define YYPULL 1
+
+/* Using locations.  */
+#define YYLSP_NEEDED 1
+
+
+
+/* Copy the first part of user declarations.  */
+
+/* Line 189 of yacc.c  */
+#line 3 "ptxgrammar.yy"
+
+	#include <iostream>
+	#include "PTXParser.h"
+	#include "PTXLexer.h"
+	#include <cassert>
+	#include <cstring>
+  #include <sstream>
+
+	#define YYERROR_VERBOSE 1
+
+	#ifdef REPORT_BASE
+	#undef REPORT_BASE
+	#endif
+
+	#define REPORT_BASE 0
+
+	namespace ptx
+	{
+	
+    int yylex( YYSTYPE* token, YYLTYPE* location, parser::PTXLexer& lexer, 
+      parser::PTXParser& state );
+    void yyerror( YYLTYPE* location, parser::PTXLexer& lexer, 
+      parser::PTXParser& state, char const* message );
+    
+    std::string yyTypeToString( int );
+	
+
+
+/* Line 189 of yacc.c  */
+#line 102 "ptxgrammar.cc"
+
+/* Enabling traces.  */
+#ifndef YYDEBUG
+# define YYDEBUG 1
+#endif
+
+/* Enabling verbose error messages.  */
+#ifdef YYERROR_VERBOSE
+# undef YYERROR_VERBOSE
+# define YYERROR_VERBOSE 1
+#else
+# define YYERROR_VERBOSE 0
+#endif
+
+/* Enabling the token table.  */
+#ifndef YYTOKEN_TABLE
+# define YYTOKEN_TABLE 0
+#endif
+
+
+/* Tokens.  */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+   /* Put the tokens into the symbol table, so that GDB and other debuggers
+      know about them.  */
+   enum yytokentype {
+     TOKEN_VERSION = 258,
+     TOKEN_TARGET = 259,
+     TOKEN_ADDRESS_SIZE = 260,
+     TOKEN_VISIBLE = 261,
+     TOKEN_FUNC = 262,
+     TOKEN_ENTRY = 263,
+     TOKEN_PARAM = 264,
+     TOKEN_ALIGN = 265,
+     TOKEN_GLOBAL = 266,
+     TOKEN_B8 = 267,
+     TOKEN_B16 = 268,
+     TOKEN_B32 = 269,
+     TOKEN_B64 = 270,
+     TOKEN_U8 = 271,
+     TOKEN_U16 = 272,
+     TOKEN_U32 = 273,
+     TOKEN_U64 = 274,
+     TOKEN_S8 = 275,
+     TOKEN_S16 = 276,
+     TOKEN_S32 = 277,
+     TOKEN_S64 = 278,
+     TOKEN_F32 = 279,
+     TOKEN_F64 = 280,
+     TOKEN_INT = 281,
+     TOKEN_FLOAT = 282,
+     TOKEN_STRING = 283
+   };
+#endif
+
+
+
+#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
+typedef union YYSTYPE
+{
+
+/* Line 214 of yacc.c  */
+#line 32 "ptxgrammar.yy"
+
+	char         svalue[1024];
+	double       fvalue;
+	int          ivalue;
+	unsigned int uvalue;
+
+
+
+/* Line 214 of yacc.c  */
+#line 175 "ptxgrammar.cc"
+} YYSTYPE;
+# define YYSTYPE_IS_TRIVIAL 1
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+#endif
+
+#if ! defined YYLTYPE && ! defined YYLTYPE_IS_DECLARED
+typedef struct YYLTYPE
+{
+  int first_line;
+  int first_column;
+  int last_line;
+  int last_column;
+} YYLTYPE;
+# define yyltype YYLTYPE /* obsolescent; will be withdrawn */
+# define YYLTYPE_IS_DECLARED 1
+# define YYLTYPE_IS_TRIVIAL 1
+#endif
+
+
+/* Copy the second part of user declarations.  */
+
+
+/* Line 264 of yacc.c  */
+#line 200 "ptxgrammar.cc"
+
+#ifdef short
+# undef short
+#endif
+
+#ifdef YYTYPE_UINT8
+typedef YYTYPE_UINT8 yytype_uint8;
+#else
+typedef unsigned char yytype_uint8;
+#endif
+
+#ifdef YYTYPE_INT8
+typedef YYTYPE_INT8 yytype_int8;
+#elif (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+typedef signed char yytype_int8;
+#else
+typedef short int yytype_int8;
+#endif
+
+#ifdef YYTYPE_UINT16
+typedef YYTYPE_UINT16 yytype_uint16;
+#else
+typedef unsigned short int yytype_uint16;
+#endif
+
+#ifdef YYTYPE_INT16
+typedef YYTYPE_INT16 yytype_int16;
+#else
+typedef short int yytype_int16;
+#endif
+
+#ifndef YYSIZE_T
+# ifdef __SIZE_TYPE__
+#  define YYSIZE_T __SIZE_TYPE__
+# elif defined size_t
+#  define YYSIZE_T size_t
+# elif ! defined YYSIZE_T && (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+#  include <stddef.h> /* INFRINGES ON USER NAME SPACE */
+#  define YYSIZE_T size_t
+# else
+#  define YYSIZE_T unsigned int
+# endif
+#endif
+
+#define YYSIZE_MAXIMUM ((YYSIZE_T) -1)
+
+#ifndef YY_
+# if YYENABLE_NLS
+#  if ENABLE_NLS
+#   include <libintl.h> /* INFRINGES ON USER NAME SPACE */
+#   define YY_(msgid) dgettext ("bison-runtime", msgid)
+#  endif
+# endif
+# ifndef YY_
+#  define YY_(msgid) msgid
+# endif
+#endif
+
+/* Suppress unused-variable warnings by "using" E.  */
+#if ! defined lint || defined __GNUC__
+# define YYUSE(e) ((void) (e))
+#else
+# define YYUSE(e) /* empty */
+#endif
+
+/* Identity function, used to suppress warnings about constant conditions.  */
+#ifndef lint
+# define YYID(n) (n)
+#else
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static int
+YYID (int yyi)
+#else
+static int
+YYID (yyi)
+    int yyi;
+#endif
+{
+  return yyi;
+}
+#endif
+
+#if ! defined yyoverflow || YYERROR_VERBOSE
+
+/* The parser invokes alloca or malloc; define the necessary symbols.  */
+
+# ifdef YYSTACK_USE_ALLOCA
+#  if YYSTACK_USE_ALLOCA
+#   ifdef __GNUC__
+#    define YYSTACK_ALLOC __builtin_alloca
+#   elif defined __BUILTIN_VA_ARG_INCR
+#    include <alloca.h> /* INFRINGES ON USER NAME SPACE */
+#   elif defined _AIX
+#    define YYSTACK_ALLOC __alloca
+#   elif defined _MSC_VER
+#    include <malloc.h> /* INFRINGES ON USER NAME SPACE */
+#    define alloca _alloca
+#   else
+#    define YYSTACK_ALLOC alloca
+#    if ! defined _ALLOCA_H && ! defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+#     include <stdlib.h> /* INFRINGES ON USER NAME SPACE */
+#     ifndef _STDLIB_H
+#      define _STDLIB_H 1
+#     endif
+#    endif
+#   endif
+#  endif
+# endif
+
+# ifdef YYSTACK_ALLOC
+   /* Pacify GCC's `empty if-body' warning.  */
+#  define YYSTACK_FREE(Ptr) do { /* empty */; } while (YYID (0))
+#  ifndef YYSTACK_ALLOC_MAXIMUM
+    /* The OS might guarantee only one guard page at the bottom of the stack,
+       and a page size can be as small as 4096 bytes.  So we cannot safely
+       invoke alloca (N) if N exceeds 4096.  Use a slightly smaller number
+       to allow for a few compiler-allocated temporary stack slots.  */
+#   define YYSTACK_ALLOC_MAXIMUM 4032 /* reasonable circa 2006 */
+#  endif
+# else
+#  define YYSTACK_ALLOC YYMALLOC
+#  define YYSTACK_FREE YYFREE
+#  ifndef YYSTACK_ALLOC_MAXIMUM
+#   define YYSTACK_ALLOC_MAXIMUM YYSIZE_MAXIMUM
+#  endif
+#  if (defined __cplusplus && ! defined _STDLIB_H \
+       && ! ((defined YYMALLOC || defined malloc) \
+	     && (defined YYFREE || defined free)))
+#   include <stdlib.h> /* INFRINGES ON USER NAME SPACE */
+#   ifndef _STDLIB_H
+#    define _STDLIB_H 1
+#   endif
+#  endif
+#  ifndef YYMALLOC
+#   define YYMALLOC malloc
+#   if ! defined malloc && ! defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+void *malloc (YYSIZE_T); /* INFRINGES ON USER NAME SPACE */
+#   endif
+#  endif
+#  ifndef YYFREE
+#   define YYFREE free
+#   if ! defined free && ! defined _STDLIB_H && (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+void free (void *); /* INFRINGES ON USER NAME SPACE */
+#   endif
+#  endif
+# endif
+#endif /* ! defined yyoverflow || YYERROR_VERBOSE */
+
+
+#if (! defined yyoverflow \
+     && (! defined __cplusplus \
+	 || (defined YYLTYPE_IS_TRIVIAL && YYLTYPE_IS_TRIVIAL \
+	     && defined YYSTYPE_IS_TRIVIAL && YYSTYPE_IS_TRIVIAL)))
+
+/* A type that is properly aligned for any stack member.  */
+union yyalloc
+{
+  yytype_int16 yyss_alloc;
+  YYSTYPE yyvs_alloc;
+  YYLTYPE yyls_alloc;
+};
+
+/* The size of the maximum gap between one aligned stack and the next.  */
+# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1)
+
+/* The size of an array large to enough to hold all stacks, each with
+   N elements.  */
+# define YYSTACK_BYTES(N) \
+     ((N) * (sizeof (yytype_int16) + sizeof (YYSTYPE) + sizeof (YYLTYPE)) \
+      + 2 * YYSTACK_GAP_MAXIMUM)
+
+/* Copy COUNT objects from FROM to TO.  The source and destination do
+   not overlap.  */
+# ifndef YYCOPY
+#  if defined __GNUC__ && 1 < __GNUC__
+#   define YYCOPY(To, From, Count) \
+      __builtin_memcpy (To, From, (Count) * sizeof (*(From)))
+#  else
+#   define YYCOPY(To, From, Count)		\
+      do					\
+	{					\
+	  YYSIZE_T yyi;				\
+	  for (yyi = 0; yyi < (Count); yyi++)	\
+	    (To)[yyi] = (From)[yyi];		\
+	}					\
+      while (YYID (0))
+#  endif
+# endif
+
+/* Relocate STACK from its old location to the new one.  The
+   local variables YYSIZE and YYSTACKSIZE give the old and new number of
+   elements in the stack, and YYPTR gives the new location of the
+   stack.  Advance YYPTR to a properly aligned location for the next
+   stack.  */
+# define YYSTACK_RELOCATE(Stack_alloc, Stack)				\
+    do									\
+      {									\
+	YYSIZE_T yynewbytes;						\
+	YYCOPY (&yyptr->Stack_alloc, Stack, yysize);			\
+	Stack = &yyptr->Stack_alloc;					\
+	yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \
+	yyptr += yynewbytes / sizeof (*yyptr);				\
+      }									\
+    while (YYID (0))
+
+#endif
+
+/* YYFINAL -- State number of the termination state.  */
+#define YYFINAL  6
+/* YYLAST -- Last index in YYTABLE.  */
+#define YYLAST   97
+
+/* YYNTOKENS -- Number of terminals.  */
+#define YYNTOKENS  34
+/* YYNNTS -- Number of nonterminals.  */
+#define YYNNTS  28
+/* YYNRULES -- Number of rules.  */
+#define YYNRULES  66
+/* YYNRULES -- Number of states.  */
+#define YYNSTATES  92
+
+/* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX.  */
+#define YYUNDEFTOK  2
+#define YYMAXUTOK   283
+
+#define YYTRANSLATE(YYX)						\
+  ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK)
+
+/* YYTRANSLATE[YYLEX] -- Bison symbol number corresponding to YYLEX.  */
+static const yytype_uint8 yytranslate[] =
+{
+       0,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+      31,    32,     2,     2,    33,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,    29,     2,    30,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     1,     2,     3,     4,
+       5,     6,     7,     8,     9,    10,    11,    12,    13,    14,
+      15,    16,    17,    18,    19,    20,    21,    22,    23,    24,
+      25,    26,    27,    28
+};
+
+#if YYDEBUG
+/* YYPRHS[YYN] -- Index of the first RHS symbol of rule number YYN in
+   YYRHS.  */
+static const yytype_uint8 yyprhs[] =
+{
+       0,     0,     3,     6,    10,    13,    16,    19,    21,    23,
+      25,    27,    29,    31,    33,    35,    37,    39,    41,    43,
+      45,    47,    49,    51,    53,    55,    57,    59,    61,    63,
+      65,    67,    69,    71,    73,    75,    77,    80,    82,    85,
+      87,    90,    92,    95,    97,   101,   102,   104,   106,   108,
+     111,   113,   116,   121,   123,   125,   127,   128,   132,   136,
+     141,   143,   145,   149,   151,   152,   158
+};
+
+/* YYRHS -- A `-1'-separated list of the rules' RHS.  */
+static const yytype_int8 yyrhs[] =
+{
+      35,     0,    -1,    36,    43,    -1,    37,    38,    39,    -1,
+       3,    27,    -1,     4,    28,    -1,     5,    26,    -1,    16,
+      -1,    17,    -1,    18,    -1,    19,    -1,    20,    -1,    21,
+      -1,    22,    -1,    23,    -1,    12,    -1,    13,    -1,    14,
+      -1,    15,    -1,    24,    -1,    25,    -1,    40,    -1,    10,
+      -1,     9,    -1,    40,    -1,    28,    -1,    27,    -1,    26,
+      -1,     7,    -1,     8,    -1,    11,    -1,    29,    -1,    30,
+      -1,    31,    -1,    32,    -1,    33,    -1,    43,    60,    -1,
+      60,    -1,    43,    55,    -1,    55,    -1,    43,    61,    -1,
+      61,    -1,    43,    42,    -1,    42,    -1,    29,    26,    30,
+      -1,    -1,    44,    -1,    28,    -1,     9,    -1,    10,    26,
+      -1,    41,    -1,    48,    41,    -1,    47,    49,    46,    45,
+      -1,    31,    -1,    32,    -1,    50,    -1,    -1,    53,    33,
+      50,    -1,    51,    53,    52,    -1,     6,     8,    46,    54,
+      -1,    31,    -1,    32,    -1,    56,    53,    57,    -1,    58,
+      -1,    -1,     6,     7,    59,    46,    54,    -1,     6,    11,
+      49,    46,    44,    -1
+};
+
+/* YYRLINE[YYN] -- source line where rule number YYN was defined.  */
+static const yytype_uint8 yyrline[] =
+{
+       0,    71,    71,    74,    81,    83,    85,    89,    89,    89,
+      89,    90,    90,    90,    90,    91,    91,    91,    91,    92,
+      92,    94,    97,    98,    99,   100,   100,   100,   101,   101,
+     102,   103,   104,   105,   106,   107,   110,   110,   111,   111,
+     112,   112,   113,   113,   117,   120,   121,   123,   124,   126,
+     127,   128,   130,   136,   137,   138,   139,   140,   141,   143,
+     148,   149,   150,   151,   151,   152,   158
+};
+#endif
+
+#if YYDEBUG || YYERROR_VERBOSE || YYTOKEN_TABLE
+/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM.
+   First, the terminals, then, starting at YYNTOKENS, nonterminals.  */
+static const char *const yytname[] =
+{
+  "$end", "error", "$undefined", "TOKEN_VERSION", "TOKEN_TARGET",
+  "TOKEN_ADDRESS_SIZE", "TOKEN_VISIBLE", "TOKEN_FUNC", "TOKEN_ENTRY",
+  "TOKEN_PARAM", "TOKEN_ALIGN", "TOKEN_GLOBAL", "TOKEN_B8", "TOKEN_B16",
+  "TOKEN_B32", "TOKEN_B64", "TOKEN_U8", "TOKEN_U16", "TOKEN_U32",
+  "TOKEN_U64", "TOKEN_S8", "TOKEN_S16", "TOKEN_S32", "TOKEN_S64",
+  "TOKEN_F32", "TOKEN_F64", "TOKEN_INT", "TOKEN_FLOAT", "TOKEN_STRING",
+  "'['", "']'", "'('", "')'", "','", "$accept", "ptxsource", "header",
+  "version", "target", "address_size", "dataTypeId", "dataType",
+  "anytoken", "ptxbody", "arrayDimensionSet", "arrayDimensions",
+  "identifier", "parameter", "alignment", "addressableVariablePrefix",
+  "argumentDeclaration", "argumentListBegin", "argumentListEnd",
+  "argumentListBody", "argumentList", "visibleEntryDeclaration",
+  "returnArgumentListBegin", "returnArgumentListEnd", "returnArgumentList",
+  "optionalReturnArgumentList", "visibleFunctionDeclaration",
+  "visibleInitializableDeclaration", 0
+};
+#endif
+
+# ifdef YYPRINT
+/* YYTOKNUM[YYLEX-NUM] -- Internal token number corresponding to
+   token YYLEX-NUM.  */
+static const yytype_uint16 yytoknum[] =
+{
+       0,   256,   257,   258,   259,   260,   261,   262,   263,   264,
+     265,   266,   267,   268,   269,   270,   271,   272,   273,   274,
+     275,   276,   277,   278,   279,   280,   281,   282,   283,    91,
+      93,    40,    41,    44
+};
+# endif
+
+/* YYR1[YYN] -- Symbol number of symbol that rule YYN derives.  */
+static const yytype_uint8 yyr1[] =
+{
+       0,    34,    35,    36,    37,    38,    39,    40,    40,    40,
+      40,    40,    40,    40,    40,    40,    40,    40,    40,    40,
+      40,    41,    42,    42,    42,    42,    42,    42,    42,    42,
+      42,    42,    42,    42,    42,    42,    43,    43,    43,    43,
+      43,    43,    43,    43,    44,    45,    45,    46,    47,    48,
+      49,    49,    50,    51,    52,    53,    53,    53,    54,    55,
+      56,    57,    58,    59,    59,    60,    61
+};
+
+/* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN.  */
+static const yytype_uint8 yyr2[] =
+{
+       0,     2,     2,     3,     2,     2,     2,     1,     1,     1,
+       1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
+       1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
+       1,     1,     1,     1,     1,     1,     2,     1,     2,     1,
+       2,     1,     2,     1,     3,     0,     1,     1,     1,     2,
+       1,     2,     4,     1,     1,     1,     0,     3,     3,     4,
+       1,     1,     3,     1,     0,     5,     5
+};
+
+/* YYDEFACT[STATE-NAME] -- Default rule to reduce with in state
+   STATE-NUM when YYTABLE doesn't specify something else to do.  Zero
+   means the default is an error.  */
+static const yytype_uint8 yydefact[] =
+{
+       0,     0,     0,     0,     0,     4,     1,     0,    28,    29,
+      23,    22,    30,    15,    16,    17,    18,     7,     8,     9,
+      10,    11,    12,    13,    14,    19,    20,    27,    26,    25,
+      31,    32,    33,    34,    35,    24,    43,     2,    39,    37,
+      41,     0,     0,    64,     0,     0,    42,    38,    36,    40,
+       5,     0,     3,    60,    56,    63,     0,    47,     0,     0,
+      21,    50,     0,     0,     6,    48,     0,    55,     0,     0,
+      53,    56,    59,    49,    51,     0,     0,    61,     0,    62,
+      65,     0,     0,    66,    45,    57,    54,    58,     0,    46,
+      52,    44
+};
+
+/* YYDEFGOTO[NTERM-NUM].  */
+static const yytype_int8 yydefgoto[] =
+{
+      -1,     2,     3,     4,    42,    52,    60,    61,    36,    37,
+      83,    90,    58,    66,    62,    63,    67,    71,    87,    68,
+      72,    38,    54,    79,    55,    56,    39,    40
+};
+
+/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing
+   STATE-NUM.  */
+#define YYPACT_NINF -28
+static const yytype_int8 yypact[] =
+{
+      27,     6,    41,    -5,    39,   -28,   -28,    24,   -28,   -28,
+     -28,   -28,   -28,   -28,   -28,   -28,   -28,   -28,   -28,   -28,
+     -28,   -28,   -28,   -28,   -28,   -28,   -28,   -28,   -28,   -28,
+     -28,   -28,   -28,   -28,   -28,   -28,   -28,    -5,   -28,   -28,
+     -28,    14,    40,    13,    18,    38,   -28,   -28,   -28,   -28,
+     -28,    21,   -28,   -28,    69,   -28,    18,   -28,    48,    54,
+     -28,   -28,    52,    18,   -28,   -28,    38,   -28,     5,    48,
+     -28,    69,   -28,   -28,   -28,    53,    18,   -28,    69,   -28,
+     -28,     7,    55,   -28,    53,   -28,   -28,   -28,    56,   -28,
+     -28,   -28
+};
+
+/* YYPGOTO[NTERM-NUM].  */
+static const yytype_int8 yypgoto[] =
+{
+     -28,   -28,   -28,   -28,   -28,   -28,    -3,    22,    46,   -28,
+       1,   -28,   -27,   -28,   -28,    23,     9,   -28,   -28,    17,
+      25,    58,   -28,   -28,   -28,   -28,    59,    60
+};
+
+/* YYTABLE[YYPACT[STATE-NUM]].  What to do in state STATE-NUM.  If
+   positive, shift that token.  If negative, reduce the rule which
+   number is the opposite.  If zero, do what YYDEFACT says.
+   If YYTABLE_NINF, syntax error.  */
+#define YYTABLE_NINF -1
+static const yytype_uint8 yytable[] =
+{
+      35,     7,     8,     9,    10,    11,    12,    13,    14,    15,
+      16,    17,    18,    19,    20,    21,    22,    23,    24,    25,
+      26,    27,    28,    29,    30,    31,    32,    33,    34,    69,
+       1,    43,    44,     5,    35,    45,    75,    77,    78,    86,
+      78,     6,    50,    41,    53,    51,    57,    64,    59,    84,
+      13,    14,    15,    16,    17,    18,    19,    20,    21,    22,
+      23,    24,    25,    26,    13,    14,    15,    16,    17,    18,
+      19,    20,    21,    22,    23,    24,    25,    26,    65,    70,
+      73,    88,    82,    46,    74,    89,    91,    85,    81,    76,
+       0,     0,     0,     0,    80,    47,    48,    49
+};
+
+static const yytype_int8 yycheck[] =
+{
+       3,     6,     7,     8,     9,    10,    11,    12,    13,    14,
+      15,    16,    17,    18,    19,    20,    21,    22,    23,    24,
+      25,    26,    27,    28,    29,    30,    31,    32,    33,    56,
+       3,     7,     8,    27,    37,    11,    63,    32,    33,    32,
+      33,     0,    28,     4,    31,     5,    28,    26,    10,    76,
+      12,    13,    14,    15,    16,    17,    18,    19,    20,    21,
+      22,    23,    24,    25,    12,    13,    14,    15,    16,    17,
+      18,    19,    20,    21,    22,    23,    24,    25,     9,    31,
+      26,    26,    29,    37,    62,    84,    30,    78,    71,    66,
+      -1,    -1,    -1,    -1,    69,    37,    37,    37
+};
+
+/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing
+   symbol of state STATE-NUM.  */
+static const yytype_uint8 yystos[] =
+{
+       0,     3,    35,    36,    37,    27,     0,     6,     7,     8,
+       9,    10,    11,    12,    13,    14,    15,    16,    17,    18,
+      19,    20,    21,    22,    23,    24,    25,    26,    27,    28,
+      29,    30,    31,    32,    33,    40,    42,    43,    55,    60,
+      61,     4,    38,     7,     8,    11,    42,    55,    60,    61,
+      28,     5,    39,    31,    56,    58,    59,    28,    46,    10,
+      40,    41,    48,    49,    26,     9,    47,    50,    53,    46,
+      31,    51,    54,    26,    41,    46,    49,    32,    33,    57,
+      54,    53,    29,    44,    46,    50,    32,    52,    26,    44,
+      45,    30
+};
+
+#define yyerrok		(yyerrstatus = 0)
+#define yyclearin	(yychar = YYEMPTY)
+#define YYEMPTY		(-2)
+#define YYEOF		0
+
+#define YYACCEPT	goto yyacceptlab
+#define YYABORT		goto yyabortlab
+#define YYERROR		goto yyerrorlab
+
+
+/* Like YYERROR except do call yyerror.  This remains here temporarily
+   to ease the transition to the new meaning of YYERROR, for GCC.
+   Once GCC version 2 has supplanted version 1, this can go.  */
+
+#define YYFAIL		goto yyerrlab
+
+#define YYRECOVERING()  (!!yyerrstatus)
+
+#define YYBACKUP(Token, Value)					\
+do								\
+  if (yychar == YYEMPTY && yylen == 1)				\
+    {								\
+      yychar = (Token);						\
+      yylval = (Value);						\
+      yytoken = YYTRANSLATE (yychar);				\
+      YYPOPSTACK (1);						\
+      goto yybackup;						\
+    }								\
+  else								\
+    {								\
+      yyerror (&yylloc, lexer, state, YY_("syntax error: cannot back up")); \
+      YYERROR;							\
+    }								\
+while (YYID (0))
+
+
+#define YYTERROR	1
+#define YYERRCODE	256
+
+
+/* YYLLOC_DEFAULT -- Set CURRENT to span from RHS[1] to RHS[N].
+   If N is 0, then set CURRENT to the empty location which ends
+   the previous symbol: RHS[0] (always defined).  */
+
+#define YYRHSLOC(Rhs, K) ((Rhs)[K])
+#ifndef YYLLOC_DEFAULT
+# define YYLLOC_DEFAULT(Current, Rhs, N)				\
+    do									\
+      if (YYID (N))                                                    \
+	{								\
+	  (Current).first_line   = YYRHSLOC (Rhs, 1).first_line;	\
+	  (Current).first_column = YYRHSLOC (Rhs, 1).first_column;	\
+	  (Current).last_line    = YYRHSLOC (Rhs, N).last_line;		\
+	  (Current).last_column  = YYRHSLOC (Rhs, N).last_column;	\
+	}								\
+      else								\
+	{								\
+	  (Current).first_line   = (Current).last_line   =		\
+	    YYRHSLOC (Rhs, 0).last_line;				\
+	  (Current).first_column = (Current).last_column =		\
+	    YYRHSLOC (Rhs, 0).last_column;				\
+	}								\
+    while (YYID (0))
+#endif
+
+
+/* YY_LOCATION_PRINT -- Print the location on the stream.
+   This macro was not mandated originally: define only if we know
+   we won't break user code: when these are the locations we know.  */
+
+#ifndef YY_LOCATION_PRINT
+# if YYLTYPE_IS_TRIVIAL
+#  define YY_LOCATION_PRINT(File, Loc)			\
+     fprintf (File, "%d.%d-%d.%d",			\
+	      (Loc).first_line, (Loc).first_column,	\
+	      (Loc).last_line,  (Loc).last_column)
+# else
+#  define YY_LOCATION_PRINT(File, Loc) ((void) 0)
+# endif
+#endif
+
+
+/* YYLEX -- calling `yylex' with the right arguments.  */
+
+#ifdef YYLEX_PARAM
+# define YYLEX yylex (&yylval, &yylloc, YYLEX_PARAM)
+#else
+# define YYLEX yylex (&yylval, &yylloc, lexer, state)
+#endif
+
+/* Enable debugging if requested.  */
+#if YYDEBUG
+
+# ifndef YYFPRINTF
+#  include <stdio.h> /* INFRINGES ON USER NAME SPACE */
+#  define YYFPRINTF fprintf
+# endif
+
+# define YYDPRINTF(Args)			\
+do {						\
+  if (yydebug)					\
+    YYFPRINTF Args;				\
+} while (YYID (0))
+
+# define YY_SYMBOL_PRINT(Title, Type, Value, Location)			  \
+do {									  \
+  if (yydebug)								  \
+    {									  \
+      YYFPRINTF (stderr, "%s ", Title);					  \
+      yy_symbol_print (stderr,						  \
+		  Type, Value, Location, lexer, state); \
+      YYFPRINTF (stderr, "\n");						  \
+    }									  \
+} while (YYID (0))
+
+
+/*--------------------------------.
+| Print this symbol on YYOUTPUT.  |
+`--------------------------------*/
+
+/*ARGSUSED*/
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yy_symbol_value_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep, YYLTYPE const * const yylocationp, parser::PTXLexer& lexer, parser::PTXParser& state)
+#else
+static void
+yy_symbol_value_print (yyoutput, yytype, yyvaluep, yylocationp, lexer, state)
+    FILE *yyoutput;
+    int yytype;
+    YYSTYPE const * const yyvaluep;
+    YYLTYPE const * const yylocationp;
+    parser::PTXLexer& lexer;
+    parser::PTXParser& state;
+#endif
+{
+  if (!yyvaluep)
+    return;
+  YYUSE (yylocationp);
+  YYUSE (lexer);
+  YYUSE (state);
+# ifdef YYPRINT
+  if (yytype < YYNTOKENS)
+    YYPRINT (yyoutput, yytoknum[yytype], *yyvaluep);
+# else
+  YYUSE (yyoutput);
+# endif
+  switch (yytype)
+    {
+      default:
+	break;
+    }
+}
+
+
+/*--------------------------------.
+| Print this symbol on YYOUTPUT.  |
+`--------------------------------*/
+
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yy_symbol_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep, YYLTYPE const * const yylocationp, parser::PTXLexer& lexer, parser::PTXParser& state)
+#else
+static void
+yy_symbol_print (yyoutput, yytype, yyvaluep, yylocationp, lexer, state)
+    FILE *yyoutput;
+    int yytype;
+    YYSTYPE const * const yyvaluep;
+    YYLTYPE const * const yylocationp;
+    parser::PTXLexer& lexer;
+    parser::PTXParser& state;
+#endif
+{
+  if (yytype < YYNTOKENS)
+    YYFPRINTF (yyoutput, "token %s (", yytname[yytype]);
+  else
+    YYFPRINTF (yyoutput, "nterm %s (", yytname[yytype]);
+
+  YY_LOCATION_PRINT (yyoutput, *yylocationp);
+  YYFPRINTF (yyoutput, ": ");
+  yy_symbol_value_print (yyoutput, yytype, yyvaluep, yylocationp, lexer, state);
+  YYFPRINTF (yyoutput, ")");
+}
+
+/*------------------------------------------------------------------.
+| yy_stack_print -- Print the state stack from its BOTTOM up to its |
+| TOP (included).                                                   |
+`------------------------------------------------------------------*/
+
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yy_stack_print (yytype_int16 *yybottom, yytype_int16 *yytop)
+#else
+static void
+yy_stack_print (yybottom, yytop)
+    yytype_int16 *yybottom;
+    yytype_int16 *yytop;
+#endif
+{
+  YYFPRINTF (stderr, "Stack now");
+  for (; yybottom <= yytop; yybottom++)
+    {
+      int yybot = *yybottom;
+      YYFPRINTF (stderr, " %d", yybot);
+    }
+  YYFPRINTF (stderr, "\n");
+}
+
+# define YY_STACK_PRINT(Bottom, Top)				\
+do {								\
+  if (yydebug)							\
+    yy_stack_print ((Bottom), (Top));				\
+} while (YYID (0))
+
+
+/*------------------------------------------------.
+| Report that the YYRULE is going to be reduced.  |
+`------------------------------------------------*/
+
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yy_reduce_print (YYSTYPE *yyvsp, YYLTYPE *yylsp, int yyrule, parser::PTXLexer& lexer, parser::PTXParser& state)
+#else
+static void
+yy_reduce_print (yyvsp, yylsp, yyrule, lexer, state)
+    YYSTYPE *yyvsp;
+    YYLTYPE *yylsp;
+    int yyrule;
+    parser::PTXLexer& lexer;
+    parser::PTXParser& state;
+#endif
+{
+  int yynrhs = yyr2[yyrule];
+  int yyi;
+  unsigned long int yylno = yyrline[yyrule];
+  YYFPRINTF (stderr, "Reducing stack by rule %d (line %lu):\n",
+	     yyrule - 1, yylno);
+  /* The symbols being reduced.  */
+  for (yyi = 0; yyi < yynrhs; yyi++)
+    {
+      YYFPRINTF (stderr, "   $%d = ", yyi + 1);
+      yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi],
+		       &(yyvsp[(yyi + 1) - (yynrhs)])
+		       , &(yylsp[(yyi + 1) - (yynrhs)])		       , lexer, state);
+      YYFPRINTF (stderr, "\n");
+    }
+}
+
+# define YY_REDUCE_PRINT(Rule)		\
+do {					\
+  if (yydebug)				\
+    yy_reduce_print (yyvsp, yylsp, Rule, lexer, state); \
+} while (YYID (0))
+
+/* Nonzero means print parse trace.  It is left uninitialized so that
+   multiple parsers can coexist.  */
+int yydebug;
+#else /* !YYDEBUG */
+# define YYDPRINTF(Args)
+# define YY_SYMBOL_PRINT(Title, Type, Value, Location)
+# define YY_STACK_PRINT(Bottom, Top)
+# define YY_REDUCE_PRINT(Rule)
+#endif /* !YYDEBUG */
+
+
+/* YYINITDEPTH -- initial size of the parser's stacks.  */
+#ifndef	YYINITDEPTH
+# define YYINITDEPTH 200
+#endif
+
+/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only
+   if the built-in stack extension method is used).
+
+   Do not make this value too large; the results are undefined if
+   YYSTACK_ALLOC_MAXIMUM < YYSTACK_BYTES (YYMAXDEPTH)
+   evaluated with infinite-precision integer arithmetic.  */
+
+#ifndef YYMAXDEPTH
+# define YYMAXDEPTH 10000
+#endif
+
+
+
+#if YYERROR_VERBOSE
+
+# ifndef yystrlen
+#  if defined __GLIBC__ && defined _STRING_H
+#   define yystrlen strlen
+#  else
+/* Return the length of YYSTR.  */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static YYSIZE_T
+yystrlen (const char *yystr)
+#else
+static YYSIZE_T
+yystrlen (yystr)
+    const char *yystr;
+#endif
+{
+  YYSIZE_T yylen;
+  for (yylen = 0; yystr[yylen]; yylen++)
+    continue;
+  return yylen;
+}
+#  endif
+# endif
+
+# ifndef yystpcpy
+#  if defined __GLIBC__ && defined _STRING_H && defined _GNU_SOURCE
+#   define yystpcpy stpcpy
+#  else
+/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in
+   YYDEST.  */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static char *
+yystpcpy (char *yydest, const char *yysrc)
+#else
+static char *
+yystpcpy (yydest, yysrc)
+    char *yydest;
+    const char *yysrc;
+#endif
+{
+  char *yyd = yydest;
+  const char *yys = yysrc;
+
+  while ((*yyd++ = *yys++) != '\0')
+    continue;
+
+  return yyd - 1;
+}
+#  endif
+# endif
+
+# ifndef yytnamerr
+/* Copy to YYRES the contents of YYSTR after stripping away unnecessary
+   quotes and backslashes, so that it's suitable for yyerror.  The
+   heuristic is that double-quoting is unnecessary unless the string
+   contains an apostrophe, a comma, or backslash (other than
+   backslash-backslash).  YYSTR is taken from yytname.  If YYRES is
+   null, do not copy; instead, return the length of what the result
+   would have been.  */
+static YYSIZE_T
+yytnamerr (char *yyres, const char *yystr)
+{
+  if (*yystr == '"')
+    {
+      YYSIZE_T yyn = 0;
+      char const *yyp = yystr;
+
+      for (;;)
+	switch (*++yyp)
+	  {
+	  case '\'':
+	  case ',':
+	    goto do_not_strip_quotes;
+
+	  case '\\':
+	    if (*++yyp != '\\')
+	      goto do_not_strip_quotes;
+	    /* Fall through.  */
+	  default:
+	    if (yyres)
+	      yyres[yyn] = *yyp;
+	    yyn++;
+	    break;
+
+	  case '"':
+	    if (yyres)
+	      yyres[yyn] = '\0';
+	    return yyn;
+	  }
+    do_not_strip_quotes: ;
+    }
+
+  if (! yyres)
+    return yystrlen (yystr);
+
+  return yystpcpy (yyres, yystr) - yyres;
+}
+# endif
+
+/* Copy into YYRESULT an error message about the unexpected token
+   YYCHAR while in state YYSTATE.  Return the number of bytes copied,
+   including the terminating null byte.  If YYRESULT is null, do not
+   copy anything; just return the number of bytes that would be
+   copied.  As a special case, return 0 if an ordinary "syntax error"
+   message will do.  Return YYSIZE_MAXIMUM if overflow occurs during
+   size calculation.  */
+static YYSIZE_T
+yysyntax_error (char *yyresult, int yystate, int yychar)
+{
+  int yyn = yypact[yystate];
+
+  if (! (YYPACT_NINF < yyn && yyn <= YYLAST))
+    return 0;
+  else
+    {
+      int yytype = YYTRANSLATE (yychar);
+      YYSIZE_T yysize0 = yytnamerr (0, yytname[yytype]);
+      YYSIZE_T yysize = yysize0;
+      YYSIZE_T yysize1;
+      int yysize_overflow = 0;
+      enum { YYERROR_VERBOSE_ARGS_MAXIMUM = 5 };
+      char const *yyarg[YYERROR_VERBOSE_ARGS_MAXIMUM];
+      int yyx;
+
+# if 0
+      /* This is so xgettext sees the translatable formats that are
+	 constructed on the fly.  */
+      YY_("syntax error, unexpected %s");
+      YY_("syntax error, unexpected %s, expecting %s");
+      YY_("syntax error, unexpected %s, expecting %s or %s");
+      YY_("syntax error, unexpected %s, expecting %s or %s or %s");
+      YY_("syntax error, unexpected %s, expecting %s or %s or %s or %s");
+# endif
+      char *yyfmt;
+      char const *yyf;
+      static char const yyunexpected[] = "syntax error, unexpected %s";
+      static char const yyexpecting[] = ", expecting %s";
+      static char const yyor[] = " or %s";
+      char yyformat[sizeof yyunexpected
+		    + sizeof yyexpecting - 1
+		    + ((YYERROR_VERBOSE_ARGS_MAXIMUM - 2)
+		       * (sizeof yyor - 1))];
+      char const *yyprefix = yyexpecting;
+
+      /* Start YYX at -YYN if negative to avoid negative indexes in
+	 YYCHECK.  */
+      int yyxbegin = yyn < 0 ? -yyn : 0;
+
+      /* Stay within bounds of both yycheck and yytname.  */
+      int yychecklim = YYLAST - yyn + 1;
+      int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS;
+      int yycount = 1;
+
+      yyarg[0] = yytname[yytype];
+      yyfmt = yystpcpy (yyformat, yyunexpected);
+
+      for (yyx = yyxbegin; yyx < yyxend; ++yyx)
+	if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR)
+	  {
+	    if (yycount == YYERROR_VERBOSE_ARGS_MAXIMUM)
+	      {
+		yycount = 1;
+		yysize = yysize0;
+		yyformat[sizeof yyunexpected - 1] = '\0';
+		break;
+	      }
+	    yyarg[yycount++] = yytname[yyx];
+	    yysize1 = yysize + yytnamerr (0, yytname[yyx]);
+	    yysize_overflow |= (yysize1 < yysize);
+	    yysize = yysize1;
+	    yyfmt = yystpcpy (yyfmt, yyprefix);
+	    yyprefix = yyor;
+	  }
+
+      yyf = YY_(yyformat);
+      yysize1 = yysize + yystrlen (yyf);
+      yysize_overflow |= (yysize1 < yysize);
+      yysize = yysize1;
+
+      if (yysize_overflow)
+	return YYSIZE_MAXIMUM;
+
+      if (yyresult)
+	{
+	  /* Avoid sprintf, as that infringes on the user's name space.
+	     Don't have undefined behavior even if the translation
+	     produced a string with the wrong number of "%s"s.  */
+	  char *yyp = yyresult;
+	  int yyi = 0;
+	  while ((*yyp = *yyf) != '\0')
+	    {
+	      if (*yyp == '%' && yyf[1] == 's' && yyi < yycount)
+		{
+		  yyp += yytnamerr (yyp, yyarg[yyi++]);
+		  yyf += 2;
+		}
+	      else
+		{
+		  yyp++;
+		  yyf++;
+		}
+	    }
+	}
+      return yysize;
+    }
+}
+#endif /* YYERROR_VERBOSE */
+
+
+/*-----------------------------------------------.
+| Release the memory associated to this symbol.  |
+`-----------------------------------------------*/
+
+/*ARGSUSED*/
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+static void
+yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep, YYLTYPE *yylocationp, parser::PTXLexer& lexer, parser::PTXParser& state)
+#else
+static void
+yydestruct (yymsg, yytype, yyvaluep, yylocationp, lexer, state)
+    const char *yymsg;
+    int yytype;
+    YYSTYPE *yyvaluep;
+    YYLTYPE *yylocationp;
+    parser::PTXLexer& lexer;
+    parser::PTXParser& state;
+#endif
+{
+  YYUSE (yyvaluep);
+  YYUSE (yylocationp);
+  YYUSE (lexer);
+  YYUSE (state);
+
+  if (!yymsg)
+    yymsg = "Deleting";
+  YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp);
+
+  switch (yytype)
+    {
+
+      default:
+	break;
+    }
+}
+
+/* Prevent warnings from -Wmissing-prototypes.  */
+#ifdef YYPARSE_PARAM
+#if defined __STDC__ || defined __cplusplus
+int yyparse (void *YYPARSE_PARAM);
+#else
+int yyparse ();
+#endif
+#else /* ! YYPARSE_PARAM */
+#if defined __STDC__ || defined __cplusplus
+int yyparse (parser::PTXLexer& lexer, parser::PTXParser& state);
+#else
+int yyparse ();
+#endif
+#endif /* ! YYPARSE_PARAM */
+
+
+
+
+
+/*-------------------------.
+| yyparse or yypush_parse.  |
+`-------------------------*/
+
+#ifdef YYPARSE_PARAM
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+int
+yyparse (void *YYPARSE_PARAM)
+#else
+int
+yyparse (YYPARSE_PARAM)
+    void *YYPARSE_PARAM;
+#endif
+#else /* ! YYPARSE_PARAM */
+#if (defined __STDC__ || defined __C99__FUNC__ \
+     || defined __cplusplus || defined _MSC_VER)
+int
+yyparse (parser::PTXLexer& lexer, parser::PTXParser& state)
+#else
+int
+yyparse (lexer, state)
+    parser::PTXLexer& lexer;
+    parser::PTXParser& state;
+#endif
+#endif
+{
+/* The lookahead symbol.  */
+int yychar;
+
+/* The semantic value of the lookahead symbol.  */
+YYSTYPE yylval;
+
+/* Location data for the lookahead symbol.  */
+YYLTYPE yylloc;
+
+    /* Number of syntax errors so far.  */
+    int yynerrs;
+
+    int yystate;
+    /* Number of tokens to shift before error messages enabled.  */
+    int yyerrstatus;
+
+    /* The stacks and their tools:
+       `yyss': related to states.
+       `yyvs': related to semantic values.
+       `yyls': related to locations.
+
+       Refer to the stacks thru separate pointers, to allow yyoverflow
+       to reallocate them elsewhere.  */
+
+    /* The state stack.  */
+    yytype_int16 yyssa[YYINITDEPTH];
+    yytype_int16 *yyss;
+    yytype_int16 *yyssp;
+
+    /* The semantic value stack.  */
+    YYSTYPE yyvsa[YYINITDEPTH];
+    YYSTYPE *yyvs;
+    YYSTYPE *yyvsp;
+
+    /* The location stack.  */
+    YYLTYPE yylsa[YYINITDEPTH];
+    YYLTYPE *yyls;
+    YYLTYPE *yylsp;
+
+    /* The locations where the error started and ended.  */
+    YYLTYPE yyerror_range[2];
+
+    YYSIZE_T yystacksize;
+
+  int yyn;
+  int yyresult;
+  /* Lookahead token as an internal (translated) token number.  */
+  int yytoken;
+  /* The variables used to return semantic value and location from the
+     action routines.  */
+  YYSTYPE yyval;
+  YYLTYPE yyloc;
+
+#if YYERROR_VERBOSE
+  /* Buffer for error messages, and its allocated size.  */
+  char yymsgbuf[128];
+  char *yymsg = yymsgbuf;
+  YYSIZE_T yymsg_alloc = sizeof yymsgbuf;
+#endif
+
+#define YYPOPSTACK(N)   (yyvsp -= (N), yyssp -= (N), yylsp -= (N))
+
+  /* The number of symbols on the RHS of the reduced rule.
+     Keep to zero when no symbol should be popped.  */
+  int yylen = 0;
+
+  yytoken = 0;
+  yyss = yyssa;
+  yyvs = yyvsa;
+  yyls = yylsa;
+  yystacksize = YYINITDEPTH;
+
+  YYDPRINTF ((stderr, "Starting parse\n"));
+
+  yystate = 0;
+  yyerrstatus = 0;
+  yynerrs = 0;
+  yychar = YYEMPTY; /* Cause a token to be read.  */
+
+  /* Initialize stack pointers.
+     Waste one element of value and location stack
+     so that they stay on the same level as the state stack.
+     The wasted elements are never initialized.  */
+  yyssp = yyss;
+  yyvsp = yyvs;
+  yylsp = yyls;
+
+#if YYLTYPE_IS_TRIVIAL
+  /* Initialize the default location before parsing starts.  */
+  yylloc.first_line   = yylloc.last_line   = 1;
+  yylloc.first_column = yylloc.last_column = 1;
+#endif
+
+  goto yysetstate;
+
+/*------------------------------------------------------------.
+| yynewstate -- Push a new state, which is found in yystate.  |
+`------------------------------------------------------------*/
+ yynewstate:
+  /* In all cases, when you get here, the value and location stacks
+     have just been pushed.  So pushing a state here evens the stacks.  */
+  yyssp++;
+
+ yysetstate:
+  *yyssp = yystate;
+
+  if (yyss + yystacksize - 1 <= yyssp)
+    {
+      /* Get the current used size of the three stacks, in elements.  */
+      YYSIZE_T yysize = yyssp - yyss + 1;
+
+#ifdef yyoverflow
+      {
+	/* Give user a chance to reallocate the stack.  Use copies of
+	   these so that the &'s don't force the real ones into
+	   memory.  */
+	YYSTYPE *yyvs1 = yyvs;
+	yytype_int16 *yyss1 = yyss;
+	YYLTYPE *yyls1 = yyls;
+
+	/* Each stack pointer address is followed by the size of the
+	   data in use in that stack, in bytes.  This used to be a
+	   conditional around just the two extra args, but that might
+	   be undefined if yyoverflow is a macro.  */
+	yyoverflow (YY_("memory exhausted"),
+		    &yyss1, yysize * sizeof (*yyssp),
+		    &yyvs1, yysize * sizeof (*yyvsp),
+		    &yyls1, yysize * sizeof (*yylsp),
+		    &yystacksize);
+
+	yyls = yyls1;
+	yyss = yyss1;
+	yyvs = yyvs1;
+      }
+#else /* no yyoverflow */
+# ifndef YYSTACK_RELOCATE
+      goto yyexhaustedlab;
+# else
+      /* Extend the stack our own way.  */
+      if (YYMAXDEPTH <= yystacksize)
+	goto yyexhaustedlab;
+      yystacksize *= 2;
+      if (YYMAXDEPTH < yystacksize)
+	yystacksize = YYMAXDEPTH;
+
+      {
+	yytype_int16 *yyss1 = yyss;
+	union yyalloc *yyptr =
+	  (union yyalloc *) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize));
+	if (! yyptr)
+	  goto yyexhaustedlab;
+	YYSTACK_RELOCATE (yyss_alloc, yyss);
+	YYSTACK_RELOCATE (yyvs_alloc, yyvs);
+	YYSTACK_RELOCATE (yyls_alloc, yyls);
+#  undef YYSTACK_RELOCATE
+	if (yyss1 != yyssa)
+	  YYSTACK_FREE (yyss1);
+      }
+# endif
+#endif /* no yyoverflow */
+
+      yyssp = yyss + yysize - 1;
+      yyvsp = yyvs + yysize - 1;
+      yylsp = yyls + yysize - 1;
+
+      YYDPRINTF ((stderr, "Stack size increased to %lu\n",
+		  (unsigned long int) yystacksize));
+
+      if (yyss + yystacksize - 1 <= yyssp)
+	YYABORT;
+    }
+
+  YYDPRINTF ((stderr, "Entering state %d\n", yystate));
+
+  if (yystate == YYFINAL)
+    YYACCEPT;
+
+  goto yybackup;
+
+/*-----------.
+| yybackup.  |
+`-----------*/
+yybackup:
+
+  /* Do appropriate processing given the current state.  Read a
+     lookahead token if we need one and don't already have one.  */
+
+  /* First try to decide what to do without reference to lookahead token.  */
+  yyn = yypact[yystate];
+  if (yyn == YYPACT_NINF)
+    goto yydefault;
+
+  /* Not known => get a lookahead token if don't already have one.  */
+
+  /* YYCHAR is either YYEMPTY or YYEOF or a valid lookahead symbol.  */
+  if (yychar == YYEMPTY)
+    {
+      YYDPRINTF ((stderr, "Reading a token: "));
+      yychar = YYLEX;
+    }
+
+  if (yychar <= YYEOF)
+    {
+      yychar = yytoken = YYEOF;
+      YYDPRINTF ((stderr, "Now at end of input.\n"));
+    }
+  else
+    {
+      yytoken = YYTRANSLATE (yychar);
+      YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc);
+    }
+
+  /* If the proper action on seeing token YYTOKEN is to reduce or to
+     detect an error, take that action.  */
+  yyn += yytoken;
+  if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken)
+    goto yydefault;
+  yyn = yytable[yyn];
+  if (yyn <= 0)
+    {
+      if (yyn == 0 || yyn == YYTABLE_NINF)
+	goto yyerrlab;
+      yyn = -yyn;
+      goto yyreduce;
+    }
+
+  /* Count tokens shifted since error; after three, turn off error
+     status.  */
+  if (yyerrstatus)
+    yyerrstatus--;
+
+  /* Shift the lookahead token.  */
+  YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc);
+
+  /* Discard the shifted token.  */
+  yychar = YYEMPTY;
+
+  yystate = yyn;
+  *++yyvsp = yylval;
+  *++yylsp = yylloc;
+  goto yynewstate;
+
+
+/*-----------------------------------------------------------.
+| yydefault -- do the default action for the current state.  |
+`-----------------------------------------------------------*/
+yydefault:
+  yyn = yydefact[yystate];
+  if (yyn == 0)
+    goto yyerrlab;
+  goto yyreduce;
+
+
+/*-----------------------------.
+| yyreduce -- Do a reduction.  |
+`-----------------------------*/
+yyreduce:
+  /* yyn is the number of a rule to reduce with.  */
+  yylen = yyr2[yyn];
+
+  /* If YYLEN is nonzero, implement the default value of the action:
+     `$$ = $1'.
+
+     Otherwise, the following line sets YYVAL to garbage.
+     This behavior is undocumented and Bison
+     users should not rely upon it.  Assigning to YYVAL
+     unconditionally makes the parser a bit smaller, and it avoids a
+     GCC warning that YYVAL may be used uninitialized.  */
+  yyval = yyvsp[1-yylen];
+
+  /* Default location.  */
+  YYLLOC_DEFAULT (yyloc, (yylsp - yylen), yylen);
+  YY_REDUCE_PRINT (yyn);
+  switch (yyn)
+    {
+        case 3:
+
+/* Line 1455 of yacc.c  */
+#line 75 "ptxgrammar.yy"
+    { 
+  std::cerr << "Done reading PTX \n" << std::endl; 
+   state.printHeader(); 
+;}
+    break;
+
+  case 4:
+
+/* Line 1455 of yacc.c  */
+#line 81 "ptxgrammar.yy"
+    { std::cerr << "Reading PTX version " << (yyvsp[(2) - (2)].fvalue)  << std::endl; ;}
+    break;
+
+  case 5:
+
+/* Line 1455 of yacc.c  */
+#line 83 "ptxgrammar.yy"
+    { std::cerr << "Target " << (yyvsp[(2) - (2)].svalue)  << std::endl; ;}
+    break;
+
+  case 6:
+
+/* Line 1455 of yacc.c  */
+#line 85 "ptxgrammar.yy"
+    { std::cerr << "Address_Size " << (yyvsp[(2) - (2)].ivalue)  << std::endl; ;}
+    break;
+
+  case 21:
+
+/* Line 1455 of yacc.c  */
+#line 94 "ptxgrammar.yy"
+    { state.dataTypeId((yyvsp[(1) - (1)].ivalue)); ;}
+    break;
+
+  case 44:
+
+/* Line 1455 of yacc.c  */
+#line 117 "ptxgrammar.yy"
+    { (yyval.ivalue) = (yyvsp[(2) - (3)].ivalue); state.arrayDimensions((yyvsp[(2) - (3)].ivalue)); ;}
+    break;
+
+  case 47:
+
+/* Line 1455 of yacc.c  */
+#line 123 "ptxgrammar.yy"
+    { strcpy((yyval.svalue), (yyvsp[(1) - (1)].svalue)); state.identifier((yyvsp[(1) - (1)].svalue)); ;}
+    break;
+
+  case 49:
+
+/* Line 1455 of yacc.c  */
+#line 126 "ptxgrammar.yy"
+    {(yyval.ivalue) = (yyvsp[(2) - (2)].ivalue); state.alignment((yyvsp[(2) - (2)].ivalue));;}
+    break;
+
+  case 50:
+
+/* Line 1455 of yacc.c  */
+#line 127 "ptxgrammar.yy"
+    { state.alignment(0); ;}
+    break;
+
+  case 52:
+
+/* Line 1455 of yacc.c  */
+#line 131 "ptxgrammar.yy"
+    {
+  state.argumentDeclaration((yylsp[(1) - (4)]));
+;}
+    break;
+
+  case 53:
+
+/* Line 1455 of yacc.c  */
+#line 136 "ptxgrammar.yy"
+    { state.argumentListBegin((yylsp[(1) - (1)])); ;}
+    break;
+
+  case 54:
+
+/* Line 1455 of yacc.c  */
+#line 137 "ptxgrammar.yy"
+    {state.argumentListEnd((yylsp[(1) - (1)])); ;}
+    break;
+
+  case 59:
+
+/* Line 1455 of yacc.c  */
+#line 144 "ptxgrammar.yy"
+    {
+   state.visibleEntryDeclaration((yyvsp[(3) - (4)].svalue), (yylsp[(1) - (4)]));
+;}
+    break;
+
+  case 60:
+
+/* Line 1455 of yacc.c  */
+#line 148 "ptxgrammar.yy"
+    { state.returnArgumentListBegin((yylsp[(1) - (1)])); ;}
+    break;
+
+  case 61:
+
+/* Line 1455 of yacc.c  */
+#line 149 "ptxgrammar.yy"
+    {state.returnArgumentListEnd((yylsp[(1) - (1)])); ;}
+    break;
+
+  case 65:
+
+/* Line 1455 of yacc.c  */
+#line 153 "ptxgrammar.yy"
+    {
+   state.visibleFunctionDeclaration((yyvsp[(4) - (5)].svalue), (yylsp[(1) - (5)]));
+;}
+    break;
+
+  case 66:
+
+/* Line 1455 of yacc.c  */
+#line 159 "ptxgrammar.yy"
+    {
+  state.visibleInitializableDeclaration((yyvsp[(4) - (5)].svalue),(yylsp[(1) - (5)]));
+;}
+    break;
+
+
+
+/* Line 1455 of yacc.c  */
+#line 1664 "ptxgrammar.cc"
+      default: break;
+    }
+  YY_SYMBOL_PRINT ("-> $$ =", yyr1[yyn], &yyval, &yyloc);
+
+  YYPOPSTACK (yylen);
+  yylen = 0;
+  YY_STACK_PRINT (yyss, yyssp);
+
+  *++yyvsp = yyval;
+  *++yylsp = yyloc;
+
+  /* Now `shift' the result of the reduction.  Determine what state
+     that goes to, based on the state we popped back to and the rule
+     number reduced by.  */
+
+  yyn = yyr1[yyn];
+
+  yystate = yypgoto[yyn - YYNTOKENS] + *yyssp;
+  if (0 <= yystate && yystate <= YYLAST && yycheck[yystate] == *yyssp)
+    yystate = yytable[yystate];
+  else
+    yystate = yydefgoto[yyn - YYNTOKENS];
+
+  goto yynewstate;
+
+
+/*------------------------------------.
+| yyerrlab -- here on detecting error |
+`------------------------------------*/
+yyerrlab:
+  /* If not already recovering from an error, report this error.  */
+  if (!yyerrstatus)
+    {
+      ++yynerrs;
+#if ! YYERROR_VERBOSE
+      yyerror (&yylloc, lexer, state, YY_("syntax error"));
+#else
+      {
+	YYSIZE_T yysize = yysyntax_error (0, yystate, yychar);
+	if (yymsg_alloc < yysize && yymsg_alloc < YYSTACK_ALLOC_MAXIMUM)
+	  {
+	    YYSIZE_T yyalloc = 2 * yysize;
+	    if (! (yysize <= yyalloc && yyalloc <= YYSTACK_ALLOC_MAXIMUM))
+	      yyalloc = YYSTACK_ALLOC_MAXIMUM;
+	    if (yymsg != yymsgbuf)
+	      YYSTACK_FREE (yymsg);
+	    yymsg = (char *) YYSTACK_ALLOC (yyalloc);
+	    if (yymsg)
+	      yymsg_alloc = yyalloc;
+	    else
+	      {
+		yymsg = yymsgbuf;
+		yymsg_alloc = sizeof yymsgbuf;
+	      }
+	  }
+
+	if (0 < yysize && yysize <= yymsg_alloc)
+	  {
+	    (void) yysyntax_error (yymsg, yystate, yychar);
+	    yyerror (&yylloc, lexer, state, yymsg);
+	  }
+	else
+	  {
+	    yyerror (&yylloc, lexer, state, YY_("syntax error"));
+	    if (yysize != 0)
+	      goto yyexhaustedlab;
+	  }
+      }
+#endif
+    }
+
+  yyerror_range[0] = yylloc;
+
+  if (yyerrstatus == 3)
+    {
+      /* If just tried and failed to reuse lookahead token after an
+	 error, discard it.  */
+
+      if (yychar <= YYEOF)
+	{
+	  /* Return failure if at end of input.  */
+	  if (yychar == YYEOF)
+	    YYABORT;
+	}
+      else
+	{
+	  yydestruct ("Error: discarding",
+		      yytoken, &yylval, &yylloc, lexer, state);
+	  yychar = YYEMPTY;
+	}
+    }
+
+  /* Else will try to reuse lookahead token after shifting the error
+     token.  */
+  goto yyerrlab1;
+
+
+/*---------------------------------------------------.
+| yyerrorlab -- error raised explicitly by YYERROR.  |
+`---------------------------------------------------*/
+yyerrorlab:
+
+  /* Pacify compilers like GCC when the user code never invokes
+     YYERROR and the label yyerrorlab therefore never appears in user
+     code.  */
+  if (/*CONSTCOND*/ 0)
+     goto yyerrorlab;
+
+  yyerror_range[0] = yylsp[1-yylen];
+  /* Do not reclaim the symbols of the rule which action triggered
+     this YYERROR.  */
+  YYPOPSTACK (yylen);
+  yylen = 0;
+  YY_STACK_PRINT (yyss, yyssp);
+  yystate = *yyssp;
+  goto yyerrlab1;
+
+
+/*-------------------------------------------------------------.
+| yyerrlab1 -- common code for both syntax error and YYERROR.  |
+`-------------------------------------------------------------*/
+yyerrlab1:
+  yyerrstatus = 3;	/* Each real token shifted decrements this.  */
+
+  for (;;)
+    {
+      yyn = yypact[yystate];
+      if (yyn != YYPACT_NINF)
+	{
+	  yyn += YYTERROR;
+	  if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR)
+	    {
+	      yyn = yytable[yyn];
+	      if (0 < yyn)
+		break;
+	    }
+	}
+
+      /* Pop the current state because it cannot handle the error token.  */
+      if (yyssp == yyss)
+	YYABORT;
+
+      yyerror_range[0] = *yylsp;
+      yydestruct ("Error: popping",
+		  yystos[yystate], yyvsp, yylsp, lexer, state);
+      YYPOPSTACK (1);
+      yystate = *yyssp;
+      YY_STACK_PRINT (yyss, yyssp);
+    }
+
+  *++yyvsp = yylval;
+
+  yyerror_range[1] = yylloc;
+  /* Using YYLLOC is tempting, but would change the location of
+     the lookahead.  YYLOC is available though.  */
+  YYLLOC_DEFAULT (yyloc, (yyerror_range - 1), 2);
+  *++yylsp = yyloc;
+
+  /* Shift the error token.  */
+  YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp);
+
+  yystate = yyn;
+  goto yynewstate;
+
+
+/*-------------------------------------.
+| yyacceptlab -- YYACCEPT comes here.  |
+`-------------------------------------*/
+yyacceptlab:
+  yyresult = 0;
+  goto yyreturn;
+
+/*-----------------------------------.
+| yyabortlab -- YYABORT comes here.  |
+`-----------------------------------*/
+yyabortlab:
+  yyresult = 1;
+  goto yyreturn;
+
+#if !defined(yyoverflow) || YYERROR_VERBOSE
+/*-------------------------------------------------.
+| yyexhaustedlab -- memory exhaustion comes here.  |
+`-------------------------------------------------*/
+yyexhaustedlab:
+  yyerror (&yylloc, lexer, state, YY_("memory exhausted"));
+  yyresult = 2;
+  /* Fall through.  */
+#endif
+
+yyreturn:
+  if (yychar != YYEMPTY)
+     yydestruct ("Cleanup: discarding lookahead",
+		 yytoken, &yylval, &yylloc, lexer, state);
+  /* Do not reclaim the symbols of the rule which action triggered
+     this YYABORT or YYACCEPT.  */
+  YYPOPSTACK (yylen);
+  YY_STACK_PRINT (yyss, yyssp);
+  while (yyssp != yyss)
+    {
+      yydestruct ("Cleanup: popping",
+		  yystos[*yyssp], yyvsp, yylsp, lexer, state);
+      YYPOPSTACK (1);
+    }
+#ifndef yyoverflow
+  if (yyss != yyssa)
+    YYSTACK_FREE (yyss);
+#endif
+#if YYERROR_VERBOSE
+  if (yymsg != yymsgbuf)
+    YYSTACK_FREE (yymsg);
+#endif
+  /* Make sure YYID is used.  */
+  return YYID (yyresult);
+}
+
+
+
+/* Line 1675 of yacc.c  */
+#line 164 "ptxgrammar.yy"
+
+
+int yylex( YYSTYPE* token, YYLTYPE* location, parser::PTXLexer& lexer, 
+	parser::PTXParser& state )
+{
+	lexer.yylval = token;
+	
+	int tokenValue         = lexer.yylexPosition();
+	location->first_line   = lexer.lineno();
+	location->first_column = lexer.column;
+
+#if 0
+	report( " Lexer (" << location->first_line << ","
+		<< location->first_column 
+		<< "): " << parser::PTXLexer::toString( tokenValue ) << " \"" 
+		<< lexer.YYText() << "\"");
+#endif
+	
+	return tokenValue;
+}
+	
+static std::string toString( YYLTYPE& location, parser::PTXParser& state )
+{
+  std::stringstream stream;
+  stream 
+#if 0
+  << state.fileName 
+#else
+  << "ptx "
+#endif
+  << " (" << location.first_line << ", " 
+    << location.first_column << "): ";
+  return stream.str();
+}
+
+void yyerror( YYLTYPE* location, parser::PTXLexer& lexer, 
+	parser::PTXParser& state, char const* message )
+{
+	std::stringstream stream;
+	stream << toString( *location, state ) 
+		<< " " << message;
+  fprintf(stderr, "--ERROR-- %s %s \n", toString(*location, state).c_str(), message);
+  assert(0);
+}
+
+}
+
diff --git a/ptxtools/ptxgrammar.yy b/ptxtools/ptxgrammar.yy
new file mode 100644
index 00000000..482340af
--- /dev/null
+++ b/ptxtools/ptxgrammar.yy
@@ -0,0 +1,215 @@
+%locations
+
+%{
+	#include <iostream>
+	#include "PTXParser.h"
+	#include "PTXLexer.h"
+	#include <cassert>
+	#include <cstring>
+  #include <sstream>
+  #include <cstdio>
+
+	#define YYERROR_VERBOSE 1
+
+	#ifdef REPORT_BASE
+	#undef REPORT_BASE
+	#endif
+
+	#define REPORT_BASE 0
+
+	namespace ptx
+	{
+	
+    int yylex( YYSTYPE* token, YYLTYPE* location, parser::PTXLexer& lexer, 
+      parser::PTXParser& state );
+    void yyerror( YYLTYPE* location, parser::PTXLexer& lexer, 
+      parser::PTXParser& state, char const* message );
+    
+    std::string yyTypeToString( int );
+	
+%}
+
+%union
+{
+	char         svalue[1024];
+	double       fvalue;
+	int          ivalue;
+	unsigned int uvalue;
+}
+
+%parse-param {parser::PTXLexer& lexer}
+%parse-param {parser::PTXParser& state}
+%lex-param   {parser::PTXLexer& lexer}
+%lex-param   {parser::PTXParser& state}
+%pure-parser
+
+// define the constant-string tokens:
+%token TOKEN_VERSION TOKEN_TARGET TOKEN_ADDRESS_SIZE
+%token TOKEN_VISIBLE TOKEN_FUNC TOKEN_ENTRY
+%token TOKEN_PARAM TOKEN_ALIGN 
+%token TOKEN_GLOBAL
+%token<ivalue> TOKEN_B8 TOKEN_B16 TOKEN_B32 TOKEN_B64
+%token<ivalue> TOKEN_U8 TOKEN_U16 TOKEN_U32 TOKEN_U64
+%token<ivalue> TOKEN_S8 TOKEN_S16 TOKEN_S32 TOKEN_S64
+%token<ivalue> TOKEN_F32 TOKEN_F64
+
+// define the "terminal symbol" token types I'm going to use (in CAPS
+// by convention), and associate each with a field of the union:
+%token <ivalue> TOKEN_INT
+%token <fvalue> TOKEN_FLOAT
+%token <svalue> TOKEN_STRING
+
+%type<svalue> identifier
+%type<ivalue> arrayDimensionSet
+%type<ivalue> alignment
+
+%start ptxsource
+
+%%
+// the first rule defined is the highest-level rule, which in our
+// case is just the concept of a whole "snazzle file":
+ptxsource:
+  header ptxbody;
+
+header:
+  version target  address_size 
+{ 
+//  std::cerr << "Done reading PTX \n" << std::endl; 
+   state.printHeader(); 
+};
+
+version:
+  TOKEN_VERSION TOKEN_FLOAT  { assert($2 >= 3.0); } ;//std::cerr << "Reading PTX version " << $2  << std::endl; };
+target:
+  TOKEN_TARGET TOKEN_STRING  { assert(std::string($2) == std::string("sm_35")); } //std::cerr << "Target " << $2  << std::endl; };
+address_size:
+  TOKEN_ADDRESS_SIZE TOKEN_INT  { assert($2 == 64); } //std::cerr << "Address_Size " << $2  << std::endl; };
+
+
+dataTypeId : 
+    TOKEN_U8 | TOKEN_U16 | TOKEN_U32 | TOKEN_U64 
+  | TOKEN_S8 | TOKEN_S16 | TOKEN_S32 | TOKEN_S64 
+  | TOKEN_B8 | TOKEN_B16 | TOKEN_B32 | TOKEN_B64 
+  | TOKEN_F32 | TOKEN_F64;
+
+dataType: dataTypeId { state.dataTypeId($<ivalue>1); }
+
+anytoken: 
+  TOKEN_ALIGN 
+| TOKEN_PARAM 
+| dataTypeId
+| TOKEN_STRING | TOKEN_FLOAT | TOKEN_INT
+| TOKEN_FUNC | TOKEN_ENTRY
+| TOKEN_GLOBAL
+| '['
+| ']'
+| '('
+| ')'
+| ','
+| ';'
+| '='
+;
+
+ptxbody: 
+    ptxbody visibleFunctionDeclaration | visibleFunctionDeclaration
+  | ptxbody visibleEntryDeclaration| visibleEntryDeclaration
+  | ptxbody visibleInitializableDeclaration| visibleInitializableDeclaration
+  | ptxbody anytoken | anytoken;
+
+
+
+arrayDimensionSet : '[' TOKEN_INT ']' { $$ = $2; state.arrayDimensions($<ivalue>2); }
+// arrayDimensionSet : arrayDimensionSet '[' TOKEN_INT ']' { $$ = $2; }
+// arrayDimensionSet : '[' ']' { $$ = 0; }
+arrayDimensions : /* empty string */;
+arrayDimensions : arrayDimensionSet;
+
+identifier: TOKEN_STRING { strcpy($$, $1); state.identifier($1); }
+parameter : TOKEN_PARAM;
+
+alignment : TOKEN_ALIGN TOKEN_INT {$$ = $2; state.alignment($<ivalue>2);}
+addressableVariablePrefix : dataType { state.alignment(0); }
+addressableVariablePrefix : alignment dataType;
+
+argumentDeclaration : parameter addressableVariablePrefix identifier arrayDimensions
+{
+  state.argumentDeclaration(@1);
+}
+
+
+argumentListBegin : '(' { state.argumentListBegin(@1); };
+argumentListEnd : ')' {state.argumentListEnd(@1); };
+argumentListBody : argumentDeclaration;
+argumentListBody : /* empty string */;
+argumentListBody : argumentListBody ',' argumentDeclaration;
+argumentList: argumentListBegin argumentListBody argumentListEnd;
+
+visibleEntryDeclaration: TOKEN_VISIBLE TOKEN_ENTRY identifier argumentList
+{
+   state.visibleEntryDeclaration($<svalue>3, @1);
+};
+
+returnArgumentListBegin : '(' { state.returnArgumentListBegin(@1); }
+returnArgumentListEnd : ')' {state.returnArgumentListEnd(@1); }
+returnArgumentList : returnArgumentListBegin argumentListBody returnArgumentListEnd;
+optionalReturnArgumentList : returnArgumentList | /* empty string */;
+visibleFunctionDeclaration: TOKEN_VISIBLE TOKEN_FUNC optionalReturnArgumentList identifier argumentList
+{
+   state.visibleFunctionDeclaration($<svalue>4, @1);
+};
+
+visibleInitializableDeclaration :
+  TOKEN_VISIBLE TOKEN_GLOBAL addressableVariablePrefix identifier arrayDimensionSet
+  { state.visibleInitializableDeclaration($<svalue>4,@1); }
+| TOKEN_VISIBLE TOKEN_GLOBAL addressableVariablePrefix identifier ';'
+  {state.arrayDimensions(0); state.visibleInitializableDeclaration($<svalue>4,@1); }
+| TOKEN_VISIBLE TOKEN_GLOBAL addressableVariablePrefix identifier '='
+  {state.arrayDimensions(0); state.visibleInitializableDeclaration($<svalue>4,@1); }
+
+
+%%
+
+int yylex( YYSTYPE* token, YYLTYPE* location, parser::PTXLexer& lexer, 
+	parser::PTXParser& state )
+{
+	lexer.yylval = token;
+	
+	int tokenValue         = lexer.yylexPosition();
+	location->first_line   = lexer.lineno();
+	location->first_column = lexer.column;
+
+#if 0
+	report( " Lexer (" << location->first_line << ","
+		<< location->first_column 
+		<< "): " << parser::PTXLexer::toString( tokenValue ) << " \"" 
+		<< lexer.YYText() << "\"");
+#endif
+	
+	return tokenValue;
+}
+	
+static std::string toString( YYLTYPE& location, parser::PTXParser& state )
+{
+  std::stringstream stream;
+  stream 
+#if 0
+  << state.fileName 
+#else
+  << "ptx "
+#endif
+  << " (" << location.first_line << ", " 
+    << location.first_column << "): ";
+  return stream.str();
+}
+
+void yyerror( YYLTYPE* location, parser::PTXLexer& lexer, 
+	parser::PTXParser& state, char const* message )
+{
+	std::stringstream stream;
+	stream << toString( *location, state ) 
+		<< " " << message;
+  fprintf(stderr, "--Parser ERROR-- %s %s \n", toString(*location, state).c_str(), message);
+  exit(-1);
+}
+
+}
diff --git a/run_tests.py b/run_tests.py
index 671ad416..081c85b5 100755
--- a/run_tests.py
+++ b/run_tests.py
@@ -258,7 +258,7 @@ def run_test(testname):
                 if should_fail:
                     cc_cmd += " -DEXPECT_FAILURE"
                 if is_nvptx_target:
-                  nvptxcc_exe = "nvptxcc"
+                  nvptxcc_exe = "ptxtools/alloy_ptxcc.sh"
                   nvptxcc_exe_rel = add_prefix(nvptxcc_exe)
                   cc_cmd = "%s %s -DTEST_SIG=%d -o %s" % \
                       (nvptxcc_exe_rel, obj_name, match, exe_name)
@@ -270,7 +270,7 @@ def run_test(testname):
             if is_generic_target:
                 ispc_cmd += " --emit-c++ --c++-include-file=%s" % add_prefix(options.include_file)
             if is_nvptx_target:
-                filename4ptx = filename+".ptx.parsed_ispc"
+                filename4ptx = "/tmp/"+os.path.basename(filename)+".parsed.ispc"
                 grep_cmd = "grep -v 'export uniform int width' %s > %s " % \
                     (filename, filename4ptx)
                 if options.verbose: