ispc/cbackend.cpp

//===-- CBackend.cpp - Library for converting LLVM code to C --------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This library converts LLVM code to C code, compilable by GCC and other C
// compilers.
//
//===----------------------------------------------------------------------===//

#include "ispc.h"
#include "module.h"

#include <math.h>
#include <stdio.h>
#include <string.h>
#include <sstream>

#ifndef _MSC_VER
#include <inttypes.h>
#define HAVE_PRINTF_A  1
#define ENABLE_CBE_PRINTF_A 1
#endif

#ifndef PRIx64
#define PRIx64 "llx"
#endif

#include "llvmutil.h"

#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
  #include "llvm/Constants.h"
  #include "llvm/DerivedTypes.h"
  #include "llvm/CallingConv.h"
  #include "llvm/Module.h"
  #include "llvm/Instructions.h"
  #include "llvm/Intrinsics.h"
  #include "llvm/IntrinsicInst.h"
  #include "llvm/InlineAsm.h"
#else // LLVM 3.3+
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/CallingConv.h"
  #include "llvm/IR/Module.h"
  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/Intrinsics.h"
  #include "llvm/IR/IntrinsicInst.h"
  #include "llvm/IR/InlineAsm.h"
#endif
#include "llvm/Pass.h"
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6 // <= 3.6
  #include "llvm/PassManager.h"
#else // LLVM 3.7+
  #include "llvm/IR/LegacyPassManager.h"
#endif
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
  #include "llvm/TypeFinder.h"
#else // LLVM_3_3+
  #include "llvm/IR/TypeFinder.h"
#endif
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/STLExtras.h"
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_4 // 3.2, 3.3, 3.4
  #include "llvm/Support/InstIterator.h"
#else // 3.5+
  #include "llvm/IR/InstIterator.h"
#endif
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5
  #include "llvm/Analysis/FindUsedTypes.h"
#endif
#include "llvm/Analysis/LoopInfo.h"
#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_5
    #include "llvm/IR/Verifier.h"
    #include <llvm/IR/IRPrintingPasses.h>
    #include "llvm/IR/CallSite.h"
    #include "llvm/IR/CFG.h"
    #include "llvm/IR/GetElementPtrTypeIterator.h"
    #include "llvm/Support/FileSystem.h"
#else
    #include "llvm/Analysis/Verifier.h"
    #include <llvm/Assembly/PrintModulePass.h>
    #include "llvm/Support/CallSite.h"
    #include "llvm/Support/CFG.h"
    #include "llvm/Support/GetElementPtrTypeIterator.h"
#endif
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
//#include "llvm/Target/Mangler.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2 // 3.2
  #include "llvm/DataLayout.h"
#else // LLVM 3.3+
  #include "llvm/IR/DataLayout.h"
#endif
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2 // 3.2
  #include "llvm/Support/InstVisitor.h"
#elif ISPC_LLVM_VERSION <= ISPC_LLVM_3_4 // 3.3, 3.4
  #include "llvm/InstVisitor.h"
#else // LLVM 3.5+
  #include "llvm/IR/InstVisitor.h"
#endif
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/Host.h"
#include "llvm/Target/TargetMachine.h"

#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_4 // 3.2, 3.3, 3.4
    #include "llvm/Config/config.h"
#endif

#include <llvm/Transforms/IPO.h>
#include <llvm/Transforms/Utils/BasicBlockUtils.h>
#include <llvm/Support/ToolOutputFile.h>
#include <algorithm>
// Some ms header decided to define setjmp as _setjmp, undo this for this file.
#ifdef _MSC_VER
#undef setjmp
#define snprintf _snprintf
#endif
///////////////////////////////////////////////////////////////////////////////
// This part of code was in LLVM's ConstantsScanner.h,
// but it was removed in revision #232397

namespace constant_scanner {
/// Forward iterator over the constant operands of a function.
///
/// Instructions are visited in inst_iterator order; within an instruction the
/// operands are visited by increasing operand index.  Only operands that are
/// llvm::Constant are yielded.  The end position is (inst_end(F), operand 0).
class constant_iterator : public std::iterator<std::forward_iterator_tag,
                                               const llvm::Constant, ptrdiff_t> {
  llvm::const_inst_iterator InstI; // Instruction currently being scanned
  unsigned OpIdx;                  // Operand of *InstI currently examined

  /// True when the operand at the current position is a constant.  Must only
  /// be called while the position refers to an existing operand.
  bool isAtConstant() const {
    assert(!InstI.atEnd() && OpIdx < InstI->getNumOperands() &&
           "isAtConstant called with invalid arguments!");
    return llvm::isa<llvm::Constant>(InstI->getOperand(OpIdx));
  }

public:
  /// Begin constructor: positions the iterator on the first constant operand
  /// of \p F, or on the end position when there is none.
  constant_iterator(const llvm::Function *F) : InstI(llvm::inst_begin(F)), OpIdx(0) {
    // A function without instructions is already at its end.
    if (InstI == llvm::inst_end(F))
      return;

    // Skip forward unless operand 0 of the first instruction is a constant.
    if (InstI->getNumOperands() == 0 || !isAtConstant())
      operator++();
  }

  /// End constructor; the bool argument only selects this overload.
  constant_iterator(const llvm::Function *F, bool)
      : InstI(llvm::inst_end(F)),
        OpIdx(0) {}

  bool operator==(const constant_iterator &x) const {
    return OpIdx == x.OpIdx && InstI == x.InstI;
  }
  bool operator!=(const constant_iterator &x) const { return !(*this == x); }

  /// Returns the constant at the current position (as a pointer).
  pointer operator*() const {
    assert(isAtConstant() && "Dereferenced an iterator at the end!");
    return llvm::cast<llvm::Constant>(InstI->getOperand(OpIdx));
  }

  /// Pre-increment: advance to the next constant operand, moving on to the
  /// following instructions when the current one is exhausted.
  constant_iterator &operator++() {
    ++OpIdx;
    do {
      const unsigned OperandCount = InstI->getNumOperands();
      for (; OpIdx < OperandCount; ++OpIdx) {
        if (isAtConstant())
          return *this; // Found a constant!
      }

      // Nothing (more) in this instruction; restart at the next one.
      ++InstI;
      OpIdx = 0;
    } while (!InstI.atEnd());

    return *this; // Reached the end of the function
  }
};

/// Returns an iterator positioned on the first constant operand found in
/// \p F; it compares equal to constant_end(F) when F has no such operand.
inline constant_iterator constant_begin(const llvm::Function *F) {
  constant_iterator First(F);
  return First;
}

/// Returns the past-the-end iterator for the constant operands of \p F.
inline constant_iterator constant_end(const llvm::Function *F) {
  constant_iterator Last(F, true);
  return Last;
}

}

///////////////////////////////////////////////////////////////////////////////
// FIXME:
namespace {
  /// TypeFinder - Walk over a module, identifying all of the types that are
  /// used by the module.
  class TypeFinder {
    // To avoid walking constant expressions multiple times and other IR
    // objects, we keep several helper maps.
    llvm::DenseSet<const llvm::Value*> VisitedConstants;
#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_6 // LLVM 3.6+
    llvm::DenseSet<const llvm::Metadata*> VisitedMDNodes;
#endif
    llvm::DenseSet<llvm::Type*> VisitedTypes;
    std::vector<llvm::ArrayType*> &ArrayTypes;
    std::vector<llvm::IntegerType*> &IntegerTypes;
    std::vector<bool> &IsVolatile;
    std::vector<int> &Alignment;
  public:
    TypeFinder(std::vector<llvm::ArrayType*> &t, std::vector<llvm::IntegerType*> &i,
              std::vector<bool> &v, std::vector<int> &a)
      : ArrayTypes(t), IntegerTypes(i) , IsVolatile(v), Alignment(a){}

    void run(const llvm::Module &M) {
      // Get types from global variables.
      for (llvm::Module::const_global_iterator I = M.global_begin(),
           E = M.global_end(); I != E; ++I) {
        incorporateType(I->getType());
        if (I->hasInitializer())
          incorporateValue(I->getInitializer());
      }

      // Get types from aliases.
      for (llvm::Module::const_alias_iterator I = M.alias_begin(),
           E = M.alias_end(); I != E; ++I) {
        incorporateType(I->getType());
        if (const llvm::Value *Aliasee = I->getAliasee())
          incorporateValue(Aliasee);
      }

      llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 4> MDForInst;

      // Get types from functions.
      for (llvm::Module::const_iterator FI = M.begin(), E = M.end(); FI != E; ++FI) {
        incorporateType(FI->getType());

        for (llvm::Function::const_iterator BB = FI->begin(), E = FI->end();
             BB != E;++BB)
          for (llvm::BasicBlock::const_iterator II = BB->begin(),
               E = BB->end(); II != E; ++II) {
            const llvm::Instruction &I = *II;

            // Operands of SwitchInsts changed format after 3.1
            // Seems like there ought to be better way to do what we
            // want here.  For now, punt on SwitchInsts.
            if (llvm::isa<llvm::SwitchInst>(&I)) continue;

            // Incorporate the type of the instruction and all its operands.
            incorporateType(I.getType());
            if (llvm::isa<llvm::StoreInst>(&I))
              if (llvm::IntegerType *ITy = llvm::dyn_cast<llvm::IntegerType>(I.getType())) {
                IntegerTypes.push_back(ITy);
                const llvm::StoreInst *St = llvm::dyn_cast<llvm::StoreInst>(&I);
                IsVolatile.push_back(St->isVolatile());
                Alignment.push_back(St->getAlignment());
              }

            if (llvm::isa<llvm::LoadInst>(&I))
              if (llvm::IntegerType *ITy = llvm::dyn_cast<llvm::IntegerType>(I.getType())) {
                IntegerTypes.push_back(ITy);
                const llvm::LoadInst *St = llvm::dyn_cast<llvm::LoadInst>(&I);
                IsVolatile.push_back(St->isVolatile());
                Alignment.push_back(St->getAlignment());
              }

            for (llvm::User::const_op_iterator OI = I.op_begin(), OE = I.op_end();
                 OI != OE; ++OI)
              incorporateValue(*OI);

            // Incorporate types hiding in metadata.
            I.getAllMetadataOtherThanDebugLoc(MDForInst);
            for (unsigned i = 0, e = MDForInst.size(); i != e; ++i)
              incorporateMDNode(MDForInst[i].second);

            MDForInst.clear();
          }
      }

      for (llvm::Module::const_named_metadata_iterator I = M.named_metadata_begin(),
           E = M.named_metadata_end(); I != E; ++I) {
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_7 /* 3.2, 3.3, 3.4, 3.5, 3.6, 3.7 */
        const llvm::NamedMDNode *NMD = I;
#else /* LLVM 3.8+ */
        const llvm::NamedMDNode *NMD = &*I;
#endif
        for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i)
          incorporateMDNode(NMD->getOperand(i));
      }
    }

  private:
    void incorporateType(llvm::Type *Ty) {
      // Check to see if we're already visited this type.
      if (!VisitedTypes.insert(Ty).second)
        return;

      if (llvm::ArrayType *ATy = llvm::dyn_cast<llvm::ArrayType>(Ty))
          ArrayTypes.push_back(ATy);

      // Recursively walk all contained types.
      for (llvm::Type::subtype_iterator I = Ty->subtype_begin(),
           E = Ty->subtype_end(); I != E; ++I)
        incorporateType(*I);
    }

    /// incorporateValue - This method is used to walk operand lists finding
    /// types hiding in constant expressions and other operands that won't be
    /// walked in other ways.  GlobalValues, basic blocks, instructions, and
    /// inst operands are all explicitly enumerated.
    void incorporateValue(const llvm::Value *V) {
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5 // 3.2, 3.3, 3.4, 3.5
      if (const llvm::MDNode *M = llvm::dyn_cast<llvm::MDNode>(V)) {
        incorporateMDNode(M);
        return;
      }
#else /* LLVN 3.6+ */
      if (const llvm::MetadataAsValue *MV = llvm::dyn_cast<llvm::MetadataAsValue>(V)) {
        incorporateMDNode(MV->getMetadata());
        return;
      }
#endif
      if (!llvm::isa<llvm::Constant>(V) || llvm::isa<llvm::GlobalValue>(V)) return;

      // Already visited?
      if (!VisitedConstants.insert(V).second)
        return;

      // Check this type.
      incorporateType(V->getType());

      // Look in operands for types.
      const llvm::User *U = llvm::cast<llvm::User>(V);
      for (llvm::Constant::const_op_iterator I = U->op_begin(),
           E = U->op_end(); I != E;++I)
        incorporateValue(*I);
    }

#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5 // 3.2, 3.3, 3.4, 3.5
    void incorporateMDNode(const llvm::MDNode *V) {

      // Already visited?
      if (!VisitedConstants.insert(V).second)
        return;

      // Look in operands for types.
      for (unsigned i = 0, e = V->getNumOperands(); i != e; ++i)
        if (llvm::Value *Op = V->getOperand(i))
          incorporateValue(Op);
    }
#else // LLVM 3.6+
    void incorporateMDNode(const llvm::Metadata *M) {

      // Already visited?
      if (!VisitedMDNodes.insert(M).second)
        return;

      if (const llvm::MDNode* N = llvm::dyn_cast<llvm::MDNode>(M)) {
        // Look in operands for types.
        for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
          if (const llvm::Metadata *O = N->getOperand(i))
            incorporateMDNode(O);
      } else if (llvm::isa<llvm::MDString>(M)) {
        // Nothing to do with MDString.
      } else if (const llvm::ValueAsMetadata* V = llvm::dyn_cast<llvm::ValueAsMetadata>(M)) {
          incorporateValue(V->getValue());
      } else {
        // Some unknown Metadata subclass - has LLVM introduced something new?
        llvm_unreachable("Unknown Metadata subclass");
      }
    }
#endif

  };
} // end anonymous namespace

/// Runs a TypeFinder over module \p m.  The array types it reaches are
/// appended to \p t; the integer types of loads/stores it records are
/// appended to \p i, with the matching volatile flags and alignments appended
/// to \p IsVolatile and \p Alignment.
static void findUsedArrayAndLongIntTypes(const llvm::Module *m, std::vector<llvm::ArrayType*> &t,
                               std::vector<llvm::IntegerType*> &i, std::vector<bool> &IsVolatile,
                               std::vector<int> &Alignment) {
  TypeFinder Finder(t, i, IsVolatile, Alignment);
  Finder.run(*m);
}


/// Returns true when \p Ty is a vector type whose elements are 64-bit
/// integers.  Note that, despite the name, the number of vector elements is
/// not examined here.
static bool is_vec16_i64_ty(llvm::Type *Ty) {
  llvm::VectorType *VTy = llvm::dyn_cast<llvm::VectorType>(Ty);
  if (VTy == NULL)
    return false;

  if (!VTy->getElementType()->isIntegerTy())
    return false;

  return VTy->getElementType()->getPrimitiveSizeInBits() == 64;
}

namespace {
  /// MCAsmInfo specialization for the C backend: it only blanks out the
  /// symbol prefixes inherited from llvm::MCAsmInfo.
  class CBEMCAsmInfo : public llvm::MCAsmInfo {
  public:
    CBEMCAsmInfo() {
      // Private globals get no prefix.
      PrivateGlobalPrefix = "";

#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_4 // 3.2, 3.3, 3.4
      // GlobalPrefix is only assigned when building against LLVM <= 3.4.
      GlobalPrefix = "";
#endif
    }
  };

  /// CWriter - This class is the main chunk of code that converts an LLVM
  /// module to a C translation unit.
  ///
  /// It is a FunctionPass that requires LoopInfo and declares that it
  /// preserves all analyses; runOnFunction() always returns false.  It also
  /// derives from InstVisitor<CWriter>, which reaches the private visit*
  /// members through the friend declaration below.
  class CWriter : public llvm::FunctionPass, public llvm::InstVisitor<CWriter> {
    // Destination stream handed to the constructor.
    llvm::formatted_raw_ostream &Out;
    // The following pointers start out null; IL, TD, TCtx, TAsm, MRI and MOFI
    // are deleted in doFinalization().  LI is refreshed in runOnFunction().
    llvm::IntrinsicLowering *IL;
    //llvm::Mangler *Mang;
    llvm::LoopInfo *LI;
    const llvm::Module *TheModule;
    const llvm::MCAsmInfo* TAsm;
    const llvm::MCRegisterInfo *MRI;
    const llvm::MCObjectFileInfo *MOFI;
    llvm::MCContext *TCtx;

    // FIXME: it's ugly to have the name be "TD" here, but it saves us
    // lots of ifdefs in the below since the new DataLayout and the old
    // TargetData have generally similar interfaces...
    const llvm::DataLayout* TD;

    // Bookkeeping maps/sets; the ones cleared in doFinalization() are
    // FPConstantMap, VectorConstantMap, ByValParams and
    // intrinsicPrototypesAlreadyGenerated.
    std::map<const llvm::ConstantFP *, unsigned> FPConstantMap;
    std::map<const llvm::ConstantDataVector *, unsigned> VectorConstantMap;
    unsigned VectorConstantIndex;
    std::set<llvm::Function*> intrinsicPrototypesAlreadyGenerated;
    // Arguments for which isAddressExposed() reports true.
    std::set<const llvm::Argument*> ByValParams;
    unsigned FPCounter;
    unsigned OpaqueCounter;
    llvm::DenseMap<const llvm::Value*, unsigned> AnonValueNumbers;
    unsigned NextAnonValueNumber;

    // Constructor arguments: include file name (defaults to
    // "generic_defs.h" when none is given) and vector width.
    std::string includeName;
    int vectorWidth;

    /// UnnamedStructIDs - This contains a unique ID for each struct that is
    /// either anonymous or has no name.
    llvm::DenseMap<llvm::StructType*, unsigned> UnnamedStructIDs;
    // IDs used by getArrayName() to build "l_array_<id>" names.
    llvm::DenseMap<llvm::ArrayType *, unsigned> ArrayIDs;

  public:
    static char ID;
      /// \param o        stream that receives the output
      /// \param incname  include file name; NULL selects "generic_defs.h"
      /// \param vecwidth stored in vectorWidth
      explicit CWriter(llvm::formatted_raw_ostream &o, const char *incname,
                       int vecwidth)
          : FunctionPass(ID), Out(o), IL(0), /* Mang(0), */ LI(0),
        TheModule(0), TAsm(0), MRI(0), MOFI(0), TCtx(0), TD(0),
        OpaqueCounter(0), NextAnonValueNumber(0),
        includeName(incname ? incname : "generic_defs.h"),
        vectorWidth(vecwidth) {
      // Make sure the loop-info analysis is registered; its name differs
      // between LLVM versions.
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6 // <= 3.6
      initializeLoopInfoPass(*llvm::PassRegistry::getPassRegistry());
#else // LLVM 3.7+
      initializeLoopInfoWrapperPassPass(*llvm::PassRegistry::getPassRegistry());
#endif
      FPCounter = 0;
      VectorConstantIndex = 0;
    }

    // The return type of getPassName() changed in LLVM 4.0.
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9 // <= 3.9
    virtual const char *getPassName() const { return "C backend"; }
#else // LLVM 4.0+
    virtual llvm::StringRef getPassName() const { return "C backend"; }
#endif

    /// Requires loop information and preserves every analysis.
    void getAnalysisUsage(llvm::AnalysisUsage &AU) const {
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6 // <= 3.6
      AU.addRequired<llvm::LoopInfo>();
#else // LLVM 3.7+
      AU.addRequired<llvm::LoopInfoWrapperPass>();
#endif
      AU.setPreservesAll();
    }

    virtual bool doInitialization(llvm::Module &M);

    /// Emits one function: lowers intrinsics, prints its floating point and
    /// vector constants, then prints the function itself.  Functions with
    /// available_externally linkage are skipped.  Always returns false.
    bool runOnFunction(llvm::Function &F) {
     // Do not codegen any 'available_externally' functions at all, they have
     // definitions outside the translation unit.
     if (F.hasAvailableExternallyLinkage())
       return false;

#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6 // <= 3.6
     LI = &getAnalysis<llvm::LoopInfo>();
#else // LLVM 3.7+
     LI = &getAnalysis<llvm::LoopInfoWrapperPass>().getLoopInfo();
#endif

      // Get rid of intrinsics we can't handle.
      lowerIntrinsics(F);

      // Output all floating point constants that cannot be printed accurately.
      printFloatingPointConstants(F);

      // Output all vector constants so they can be accessed with single
      // vector loads
      printVectorConstants(F);

      printFunction(F);
      return false;
    }

    /// Releases the helper objects held by this pass and empties its
    /// bookkeeping containers.  Always returns false.
    /// NOTE(review): the deleted pointers are not reset to null here.
    virtual bool doFinalization(llvm::Module &M) {
      // Free memory...
      delete IL;
      delete TD;
      //delete Mang;
      delete TCtx;
      delete TAsm;
      delete MRI;
      delete MOFI;
      FPConstantMap.clear();
      VectorConstantMap.clear();
      ByValParams.clear();
      intrinsicPrototypesAlreadyGenerated.clear();
      UnnamedStructIDs.clear();
      ArrayIDs.clear();
      return false;
    }

    // The attribute-list type used for PAL is AttrListPtr on LLVM 3.2 and
    // AttributeSet on later versions.
    llvm::raw_ostream &printType(llvm::raw_ostream &Out, llvm::Type *Ty,
                           bool isSigned = false,
                           const std::string &VariableName = "",
                           bool IgnoreName = false,
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
                           const llvm::AttrListPtr &PAL = llvm::AttrListPtr()
#else // LLVM 3.3+
                           const llvm::AttributeSet &PAL = llvm::AttributeSet()
#endif
                                 );
    llvm::raw_ostream &printSimpleType(llvm::raw_ostream &Out, llvm::Type *Ty,
                           bool isSigned,
                           const std::string &NameSoFar = "");

    void printStructReturnPointerFunctionType(llvm::raw_ostream &Out,
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
                                              const llvm::AttrListPtr &PAL,
#else // LLVM 3.3+
                                              const llvm::AttributeSet &PAL,
#endif
                                              llvm::PointerType *Ty);

    std::string getStructName(llvm::StructType *ST);
    std::string getArrayName(llvm::ArrayType *AT);

    /// writeOperandDeref - Print the result of dereferencing the specified
    /// operand with '*'.  This is equivalent to printing '*' then using
    /// writeOperand, but avoids excess syntax in some cases.
    void writeOperandDeref(llvm::Value *Operand) {
      if (isAddressExposed(Operand)) {
        // Already something with an address exposed.
        writeOperandInternal(Operand);
      } else {
        Out << "*(";
        writeOperand(Operand);
        Out << ")";
      }
    }

    void writeOperand(llvm::Value *Operand, bool Static = false);
    void writeInstComputationInline(llvm::Instruction &I);
    void writeOperandInternal(llvm::Value *Operand, bool Static = false);
    void writeOperandWithCast(llvm::Value* Operand, unsigned Opcode);
    void writeOperandWithCast(llvm::Value* Operand, const llvm::ICmpInst &I);
    bool writeInstructionCast(const llvm::Instruction &I);

    void writeMemoryAccess(llvm::Value *Operand, llvm::Type *OperandType,
                           bool IsVolatile, unsigned Alignment);

  private :
    void lowerIntrinsics(llvm::Function &F);
    /// Prints the definition of the intrinsic function F. Supports the
    /// intrinsics which need to be explicitly defined in the CBackend.
    void printIntrinsicDefinition(const llvm::Function &F, llvm::raw_ostream &Out);

    void printModuleTypes();
    void printContainedStructs(llvm::Type *Ty, llvm::SmallPtrSet<llvm::Type *, 16> &);
    void printContainedArrays(llvm::ArrayType *ATy, llvm::SmallPtrSet<llvm::Type *, 16> &);
    void printFloatingPointConstants(llvm::Function &F);
    void printFloatingPointConstants(const llvm::Constant *C);
    void printVectorConstants(llvm::Function &F);
    void printFunctionSignature(const llvm::Function *F, bool Prototype);

    void printFunction(llvm::Function &);
    void printBasicBlock(llvm::BasicBlock *BB);
    void printLoop(llvm::Loop *L);

    bool printCast(unsigned opcode, llvm::Type *SrcTy, llvm::Type *DstTy);
    void printConstant(llvm::Constant *CPV, bool Static);
    void printConstantWithCast(llvm::Constant *CPV, unsigned Opcode);
    bool printConstExprCast(const llvm::ConstantExpr *CE, bool Static);
    void printConstantArray(llvm::ConstantArray *CPA, bool Static);
    void printConstantVector(llvm::ConstantVector *CV, bool Static);
    void printConstantDataSequential(llvm::ConstantDataSequential *CDS, bool Static);

    /// isAddressExposed - Return true if the specified value's name needs to
    /// have its address taken in order to get a C value of the correct type.
    /// This happens for global variables, byval parameters, and direct allocas.
    /// An Argument is decided solely by its membership in ByValParams.
      bool isAddressExposed(const llvm::Value *V) const {
        if (const llvm::Argument *A = llvm::dyn_cast<llvm::Argument>(V))
          return ByValParams.count(A);
        return llvm::isa<llvm::GlobalVariable>(V) || isDirectAlloca(V);
    }

    // isInlinableInst - Attempt to inline instructions into their uses to build
    // trees as much as possible.  To do this, we have to consistently decide
    // what is acceptable to inline, so that variable declarations don't get
    // printed and an extra copy of the expr is not emitted.
    //
    // Returns true for every non-vector compare, and otherwise only for a
    // single-use, non-void instruction that is not in the excluded
    // instruction kinds below, whose user is not one of the excluded user
    // kinds, and whose user lives in the same basic block.
    //
    static bool isInlinableInst(const llvm::Instruction &I) {
      // Always inline cmp instructions, even if they are shared by multiple
      // expressions.  GCC generates horrible code if we don't.
      // (Compares producing a vector type are excluded from this shortcut.)
      if (llvm::isa<llvm::CmpInst>(I) && llvm::isa<llvm::VectorType>(I.getType()) == false)
        return true;

#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_5 // 3.5+
      // cmpxchg returns a struct and can not be inlined.
      // NOTE(review): the original wording said "on LLVM older than 3.4",
      // which contradicts the 3.5+ guard above; presumably the struct result
      // applies to LLVM 3.5 and newer - confirm.
      if (llvm::isa<llvm::AtomicCmpXchgInst>(I))
        return false;
#endif

      // Must be an expression, must be used exactly once.  If it is dead, we
      // emit it inline where it would go.
      if (I.getType() == llvm::Type::getVoidTy(I.getContext()) || !I.hasOneUse() ||
        llvm::isa<llvm::TerminatorInst>(I) || llvm::isa<llvm::CallInst>(I) || llvm::isa<llvm::PHINode>(I) ||
        llvm::isa<llvm::LoadInst>(I) || llvm::isa<llvm::VAArgInst>(I) || llvm::isa<llvm::InsertElementInst>(I) ||
        llvm::isa<llvm::InsertValueInst>(I) || llvm::isa<llvm::ExtractValueInst>(I) || llvm::isa<llvm::SelectInst>(I))
        // Don't inline a load across a store or other bad things!
        return false;

      // Must not be used in inline asm, extractelement, shufflevector, or an
      // atomic rmw / cmpxchg.  (hasOneUse() is already known to hold here
      // because of the check above.)
      if (I.hasOneUse()) {

#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_5 // 3.5+
        const llvm::Instruction &User = llvm::cast<llvm::Instruction>(*I.user_back());
#else
        const llvm::Instruction &User = llvm::cast<llvm::Instruction>(*I.use_back());
#endif
        if (isInlineAsm(User) || llvm::isa<llvm::ExtractElementInst>(User) ||
            llvm::isa<llvm::ShuffleVectorInst>(User) || llvm::isa<llvm::AtomicRMWInst>(User) ||
            llvm::isa<llvm::AtomicCmpXchgInst>(User))
          return false;
      }

      // Only inline the instruction if its use is in the same BB as the inst.
#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_5 // 3.5+
      return I.getParent() == llvm::cast<llvm::Instruction>(I.user_back())->getParent();
#else
      return I.getParent() == llvm::cast<llvm::Instruction>(I.use_back())->getParent();
#endif
    }

    // isDirectAlloca - Define fixed sized allocas in the entry block as direct
    // variables which are accessed with the & operator.  This causes GCC to
    // generate significantly better code than to emit alloca calls directly.
    //
    // Returns the AllocaInst when V is a non-array alloca located in the
    // entry block of its function, and null otherwise.
    //
    static const llvm::AllocaInst *isDirectAlloca(const llvm::Value *V) {
      const llvm::AllocaInst *AI = llvm::dyn_cast<llvm::AllocaInst>(V);
      if (!AI) return 0;
      if (AI->isArrayAllocation())
        return 0;   // FIXME: we can also inline fixed size array allocas!
      if (AI->getParent() != &AI->getParent()->getParent()->getEntryBlock())
        return 0;
      return AI;
    }

    // isInlineAsm - Check if the instruction is a call to an inline asm chunk.
    static bool isInlineAsm(const llvm::Instruction& I) {
      if (const llvm::CallInst *CI = llvm::dyn_cast<llvm::CallInst>(&I))
        return llvm::isa<llvm::InlineAsm>(CI->getCalledValue());
      return false;
    }

    // Instruction visitation functions
    friend class llvm::InstVisitor<CWriter>;

    void visitReturnInst(llvm::ReturnInst &I);
    void visitBranchInst(llvm::BranchInst &I);
    void visitSwitchInst(llvm::SwitchInst &I);
    void visitIndirectBrInst(llvm::IndirectBrInst &I);
    // Invoke and resume instructions are treated as unreachable here; the
    // messages name the passes expected to have removed them.
    void visitInvokeInst(llvm::InvokeInst &I) {
      llvm_unreachable("Lowerinvoke pass didn't work!");
    }
    void visitResumeInst(llvm::ResumeInst &I) {
      llvm_unreachable("DwarfEHPrepare pass didn't work!");
    }
    void visitUnreachableInst(llvm::UnreachableInst &I);

    void visitPHINode(llvm::PHINode &I);
    void visitBinaryOperator(llvm::Instruction &I);
    void visitICmpInst(llvm::ICmpInst &I);
    void visitFCmpInst(llvm::FCmpInst &I);

    void visitCastInst (llvm::CastInst &I);
    void visitSelectInst(llvm::SelectInst &I);
    void visitCallInst (llvm::CallInst &I);
    void visitInlineAsm(llvm::CallInst &I);
    bool visitBuiltinCall(llvm::CallInst &I, llvm::Intrinsic::ID ID, bool &WroteCallee);

    void visitAllocaInst(llvm::AllocaInst &I);
    void visitLoadInst  (llvm::LoadInst   &I);
    void visitStoreInst (llvm::StoreInst  &I);
    void visitGetElementPtrInst(llvm::GetElementPtrInst &I);
    void visitVAArgInst (llvm::VAArgInst &I);

    void visitInsertElementInst(llvm::InsertElementInst &I);
    void visitExtractElementInst(llvm::ExtractElementInst &I);
    void visitShuffleVectorInst(llvm::ShuffleVectorInst &SVI);

    void visitInsertValueInst(llvm::InsertValueInst &I);
    void visitExtractValueInst(llvm::ExtractValueInst &I);

    void visitAtomicRMWInst(llvm::AtomicRMWInst &I);
    void visitAtomicCmpXchgInst(llvm::AtomicCmpXchgInst &I);

    // Fallback for instruction kinds without a dedicated visitor: report the
    // instruction (debug builds only) and abort via llvm_unreachable.
    void visitInstruction(llvm::Instruction &I) {
#ifndef NDEBUG
      llvm::errs() << "C Writer does not know about " << I;
#endif
      llvm_unreachable(0);
    }

    // Writes "  <name> = " for the value defined by I.
    void outputLValue(llvm::Instruction *I) {
      Out << "  " << GetValueName(I) << " = ";
    }

    bool isGotoCodeNecessary(llvm::BasicBlock *From, llvm::BasicBlock *To);
    void printPHICopiesForSuccessor(llvm::BasicBlock *CurBlock,
                                    llvm::BasicBlock *Successor, unsigned Indent);
    void printBranchToBlock(llvm::BasicBlock *CurBlock, llvm::BasicBlock *SuccBlock,
                            unsigned Indent);
    void printGEPExpression(llvm::Value *Ptr, llvm::gep_type_iterator I,
                            llvm::gep_type_iterator E, bool Static);

    std::string GetValueName(const llvm::Value *Operand);
  };
}

char CWriter::ID = 0;


/// CBEMangle - Rewrite \p S so that it only contains characters accepted in
/// the generated names.
///
/// Letters, digits, '_' and single '<' / '>' characters are copied through.
/// Any other character is replaced by "_XY_", where X is 'A' plus the low
/// nibble of the character and Y is 'A' plus its high nibble.  A doubled
/// "<<" or ">>" is collapsed into one such escape for its first character
/// and the second character is skipped.
///
/// \param S  the name to mangle (may be empty)
/// \return   the mangled name
static std::string CBEMangle(const std::string &S) {
  std::string Result;
  Result.reserve(S.size());

  for (std::string::size_type i = 0, e = S.size(); i != e; ++i) {
    const char C = S[i];

    // "<<" and ">>" are escaped as a unit instead of being copied through.
    const bool IsShiftPair =
        (i + 1 != e) && (C == '<' || C == '>') && S[i + 1] == C;

    // isalnum() has undefined behaviour for negative arguments, which a
    // plain (signed) char produces for non-ASCII bytes; go through
    // unsigned char first.
    if (!IsShiftPair &&
        (isalnum(static_cast<unsigned char>(C)) || C == '_' ||
         C == '<' || C == '>')) {
      Result += C;
      continue;
    }

    // Escape: low nibble first, then high nibble.
    Result += '_';
    Result += static_cast<char>('A' + (C & 15));
    Result += static_cast<char>('A' + ((C >> 4) & 15));
    Result += '_';

    if (IsShiftPair)
      ++i; // the second character of the pair is consumed as well
  }
  return Result;
}

/// Returns the name used for struct type \p ST: the mangled "l_<name>" for a
/// non-literal struct that has a name, otherwise "l_unnamed_<id>" with the id
/// looked up in UnnamedStructIDs.
std::string CWriter::getStructName(llvm::StructType *ST) {
  // Literal structs are never asked for their name.
  if (ST->isLiteral() || ST->getName().empty())
    return "l_unnamed_" + llvm::utostr(UnnamedStructIDs[ST]);

  return CBEMangle("l_"+ST->getName().str());
}

/// Returns "l_array_<id>" for array type \p AT, with the id looked up in
/// ArrayIDs.
std::string CWriter::getArrayName(llvm::ArrayType *AT) {
  const unsigned ArrayID = ArrayIDs[AT];
  return "l_array_" + llvm::utostr(ArrayID);
}


/// printStructReturnPointerFunctionType - This is like printType for a struct
/// return type, except, instead of printing the type as void (*)(Struct*, ...)
/// print it as "Struct (*)(...)", for struct return functions.
///
/// \p TheTy must point to a function type whose first parameter is a pointer;
/// the pointee of that first parameter is printed as the return type and the
/// remaining parameters form the argument list.
void CWriter::printStructReturnPointerFunctionType(llvm::raw_ostream &Out,
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
                                                   const llvm::AttrListPtr &PAL,
#else // LLVM 3.3+
                                                   const llvm::AttributeSet &PAL,
#endif
                                                   llvm::PointerType *TheTy) {
  llvm::FunctionType *FTy = llvm::cast<llvm::FunctionType>(TheTy->getElementType());

  // The declarator "(*) (args)" is assembled first and then handed to
  // printType as the name part of the return type.
  std::string DeclaratorText;
  llvm::raw_string_ostream Declarator(DeclaratorText);
  Declarator << " (*) (";

  llvm::FunctionType::param_iterator ParamIt = FTy->param_begin();
  const llvm::FunctionType::param_iterator ParamEnd = FTy->param_end();

  // The first parameter carries the struct return value.
  llvm::Type *RetTy = llvm::cast<llvm::PointerType>(*ParamIt)->getElementType();
  ++ParamIt;

  bool PrintedType = false;
  // Attribute index 0 is the return value and 1 the first parameter, so the
  // remaining parameters start at index 2.
  for (unsigned Idx = 2; ParamIt != ParamEnd; ++ParamIt, ++Idx) {
    // NOTE(review): on LLVM 3.3+ the per-parameter attribute set is queried
    // at FunctionIndex rather than at Idx - confirm this is the intended slot.
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
    const bool IsByVal = PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal);
    const bool IsSExt = PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt);
#else // LLVM 3.3+
    const bool IsByVal = PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal);
    const bool IsSExt = PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt);
#endif

    if (PrintedType)
      Declarator << ", ";

    llvm::Type *ArgTy = *ParamIt;
    if (IsByVal) {
      // byval parameters are printed as the pointed-to type.
      assert(ArgTy->isPointerTy());
      ArgTy = llvm::cast<llvm::PointerType>(ArgTy)->getElementType();
    }
    printType(Declarator, ArgTy, IsSExt, "");
    PrintedType = true;
  }

  if (FTy->isVarArg()) {
    if (!PrintedType)
      Declarator << " int"; //dummy argument for empty vararg functs
    Declarator << ", ...";
  } else if (!PrintedType) {
    Declarator << "void";
  }
  Declarator << ')';

#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
  const bool RetIsSExt = PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt);
#else // LLVM 3.3+
  const bool RetIsSExt = PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt);
#endif
  printType(Out, RetTy, RetIsSExt, Declarator.str());
}

/// printSimpleType - Print the C spelling of a void, integer, floating point,
/// x86 MMX or vector type to \p Out, followed by a space and \p NameSoFar.
///
/// Integers of 2..64 bits map to the next [u]intN_t (unsigned unless
/// \p isSigned), 1-bit integers to "bool", wider ones to "iN<bits>".  Vector
/// types are spelled "__vec<count>_<suffix>".  Returns \p Out.
llvm::raw_ostream &
CWriter::printSimpleType(llvm::raw_ostream &Out, llvm::Type *Ty, bool isSigned,
                         const std::string &NameSoFar) {
  assert((Ty->isFloatingPointTy() || Ty->isX86_MMXTy() || Ty->isIntegerTy() || Ty->isVectorTy() || Ty->isVoidTy()) &&
         "Invalid type for printSimpleType");

  switch (Ty->getTypeID()) {
  case llvm::Type::VoidTyID:
    return Out << "void " << NameSoFar;

  case llvm::Type::IntegerTyID: {
    const unsigned NumBits = llvm::cast<llvm::IntegerType>(Ty)->getBitWidth();
    if (NumBits == 1)
      return Out << "bool " << NameSoFar;
    if (NumBits > 64)
      return Out << "iN<" << NumBits << "> " << NameSoFar;

    // Round up to the next standard fixed-width type.
    const char *SignPrefix = isSigned ? "" : "u";
    const char *BaseName;
    if (NumBits <= 8)
      BaseName = "int8_t ";
    else if (NumBits <= 16)
      BaseName = "int16_t ";
    else if (NumBits <= 32)
      BaseName = "int32_t ";
    else
      BaseName = "int64_t ";
    return Out << SignPrefix << BaseName << NameSoFar;
  }

  case llvm::Type::FloatTyID:
    return Out << "float "   << NameSoFar;
  case llvm::Type::DoubleTyID:
    return Out << "double "  << NameSoFar;

  // Lacking emulation of FP80 on PPC, etc., we assume whichever of these is
  // present matches host 'long double'.
  case llvm::Type::X86_FP80TyID:
  case llvm::Type::PPC_FP128TyID:
  case llvm::Type::FP128TyID:
    return Out << "long double " << NameSoFar;

  case llvm::Type::X86_MMXTyID:
    // Printed as a 32-bit integer carrying a vector_size attribute.
    return printSimpleType(Out, llvm::Type::getInt32Ty(Ty->getContext()), isSigned,
                     " __attribute__((vector_size(64))) " + NameSoFar);

  case llvm::Type::VectorTyID: {
    llvm::VectorType *VTy = llvm::cast<llvm::VectorType>(Ty);
#if 1
    // The suffix encodes the element type: f / d for float / double,
    // otherwise i<bits> for the listed integer widths and iN for the rest.
    const llvm::Type *eltTy = VTy->getElementType();
    const char *suffix = NULL;
    if (eltTy->isFloatTy()) {
        suffix = "f";
    } else if (eltTy->isDoubleTy()) {
        suffix = "d";
    } else {
        assert(eltTy->isIntegerTy());
        switch (eltTy->getPrimitiveSizeInBits()) {
        case 1:  suffix = "i1";  break;
        case 8:  suffix = "i8";  break;
        case 16: suffix = "i16"; break;
        case 32: suffix = "i32"; break;
        case 64: suffix = "i64"; break;
        default: suffix = "iN";  break;
        }
    }

    Out << "__vec" << VTy->getNumElements() << "_" << suffix << " ";
    return Out << NameSoFar;
#else
    return printSimpleType(Out, VTy->getElementType(), isSigned,
                     " __attribute__((vector_size(" +
                     utostr(TD->getTypeAllocSize(VTy)) + " ))) " + NameSoFar);
#endif
  }

  default:
#ifndef NDEBUG
    llvm::errs() << "Unknown primitive type: " << *Ty << "\n";
#endif
    llvm_unreachable(0);
  }
}

// printType - Pass the Type* and the variable name and this prints out the
// variable declaration to Out; Out is returned.
//
// How each kind of type is handled:
//  - void / integer / floating point / MMX / vector: forwarded to
//    printSimpleType().
//  - function: the parameter list is rendered into a temporary string of the
//    form " (NameSoFar) (args)", which then becomes the declarator of the
//    return type.  Parameters marked ByVal in PAL are printed as their
//    pointee type; SExt in PAL selects the signed integer spelling.
//  - struct: when IgnoreName is false only the name from getStructName() is
//    printed.  Otherwise the full definition is emitted: a static init()
//    factory taking one value per element, followed by members named
//    field0, field1, ...
//  - pointer: "*" is prepended to NameSoFar (parenthesized for pointers to
//    arrays and vectors) and the pointee is printed; a non-empty PAL is
//    passed along because it describes a function pointer.
//  - array: when IgnoreName is false only the name from getArrayName() is
//    printed.  Otherwise a wrapper struct with an init() factory and an
//    "array[N]" member is emitted; zero-length arrays are given one element,
//    and arrays of i8 additionally get an init(const char *) overload.
//
// isSigned only matters for the simple-type path.
//
// NOTE(review): the LLVM 3.3+ branches query
// PAL.getParamAttributes(Idx).hasAttribute(AttributeSet::FunctionIndex, ...)
// for per-parameter attributes; confirm against the LLVM AttributeSet API
// that this index really matches parameter attributes.
//
llvm::raw_ostream &CWriter::printType(llvm::raw_ostream &Out, llvm::Type *Ty,
                                bool isSigned, const std::string &NameSoFar,
                                bool IgnoreName,
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
                                const llvm::AttrListPtr &PAL
#else /* LLVM 3.3+ */
                                const llvm::AttributeSet &PAL
#endif
                                      ) {

  if (Ty->isFloatingPointTy() || Ty->isX86_MMXTy() || Ty->isIntegerTy() || Ty->isVectorTy() || Ty->isVoidTy()) {
    printSimpleType(Out, Ty, isSigned, NameSoFar);
    return Out;
  }

  switch (Ty->getTypeID()) {
  case llvm::Type::FunctionTyID: {
    llvm::FunctionType *FTy = llvm::cast<llvm::FunctionType>(Ty);
    std::string tstr;
    llvm::raw_string_ostream FunctionInnards(tstr);
    FunctionInnards << " (" << NameSoFar << ") (";
    // Attribute index 0 describes the return value, so parameters start at 1.
    unsigned Idx = 1;
    for (llvm::FunctionType::param_iterator I = FTy->param_begin(),
           E = FTy->param_end(); I != E; ++I) {
      llvm::Type *ArgTy = *I;
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
      if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) {
#else /* LLVM 3.3+ */
          if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) {
#endif
        // A byval parameter is a pointer in the IR; print the pointee type.
        assert(ArgTy->isPointerTy());
        ArgTy = llvm::cast<llvm::PointerType>(ArgTy)->getElementType();
      }
      if (I != FTy->param_begin())
        FunctionInnards << ", ";
      printType(FunctionInnards, ArgTy,
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
                PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt),
#else /* LLVM 3.3+ */
                PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt),
#endif
                "");
      ++Idx;
    }
    if (FTy->isVarArg()) {
      if (!FTy->getNumParams())
        FunctionInnards << " int"; //dummy argument for empty vaarg functs
      FunctionInnards << ", ...";
    } else if (!FTy->getNumParams()) {
      FunctionInnards << "void";
    }
    FunctionInnards << ')';
    // The return type is printed last, with the parameter list as its
    // declarator text.
    printType(Out, FTy->getReturnType(),
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
              PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt),
#else /* LLVM 3.3+ */
              PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt),
#endif
              FunctionInnards.str());
    return Out;
  }
  case llvm::Type::StructTyID: {
    llvm::StructType *STy = llvm::cast<llvm::StructType>(Ty);

    // Check to see if the type is named.
    if (!IgnoreName)
      return Out << getStructName(STy) << ' ' << NameSoFar;

    Out << "struct " << NameSoFar << " {\n";

    // print initialization func
    if (STy->getNumElements() > 0) {
        Out << "  static " << NameSoFar << " init(";
        unsigned Idx = 0;
        for (llvm::StructType::element_iterator I = STy->element_begin(),
                 E = STy->element_end(); I != E; ++I, ++Idx) {
            // One parameter "v<Idx>" per struct element.
            printType(Out, *I, false, "v" + llvm::utostr(Idx));
            if (Idx + 1 < STy->getNumElements())
                Out << ", ";
        }
        Out << ") {\n";
        Out << "    " << NameSoFar << " ret;\n";
        for (Idx = 0; Idx < STy->getNumElements(); ++Idx)
            Out << "    ret.field" << Idx << " = v" << Idx << ";\n";
        Out << "    return ret;\n";
        Out << "  }\n";
    }

    // Data members, named field0, field1, ...
    unsigned Idx = 0;
    for (llvm::StructType::element_iterator I = STy->element_begin(),
           E = STy->element_end(); I != E; ++I) {
      Out << "  ";
      printType(Out, *I, false, "field" + llvm::utostr(Idx++));
      Out << ";\n";
    }
    Out << '}';
    if (STy->isPacked())
      Out << " __attribute__ ((packed))";
    return Out;
  }

  case llvm::Type::PointerTyID: {
    llvm::PointerType *PTy = llvm::cast<llvm::PointerType>(Ty);
    std::string ptrName = "*" + NameSoFar;

    // Pointers to arrays/vectors need the declarator parenthesized.
    if (PTy->getElementType()->isArrayTy() ||
        PTy->getElementType()->isVectorTy())
      ptrName = "(" + ptrName + ")";

    if (!PAL.isEmpty())
      // Must be a function ptr cast!
      return printType(Out, PTy->getElementType(), false, ptrName, true, PAL);
    return printType(Out, PTy->getElementType(), false, ptrName);
  }

  case llvm::Type::ArrayTyID: {
    llvm::ArrayType *ATy = llvm::cast<llvm::ArrayType>(Ty);

    // Check to see if the type is named.
    if (!IgnoreName)
      return Out << getArrayName(ATy) << ' ' << NameSoFar;

    // Zero-length arrays are emitted with a single element.
    unsigned NumElements = (unsigned)ATy->getNumElements();
    if (NumElements == 0) NumElements = 1;
    // Arrays are wrapped in structs to allow them to have normal
    // value semantics (avoiding the array "decay").
    Out << "struct " << NameSoFar << " {\n";
    // init func
    Out << "  static " << NameSoFar << " init(";
    for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
        // One parameter "v<Idx>" per array element.
        printType(Out, ATy->getElementType(), false, "v" + llvm::utostr(Idx));
        if (Idx + 1 < NumElements)
            Out << ", ";
    }
    Out << ") {\n";
    Out << "    " << NameSoFar << " ret;\n";
    for (unsigned Idx = 0; Idx < NumElements; ++Idx)
        Out << "    ret.array[" << Idx << "] = v" << Idx << ";\n";
    Out << "    return ret;\n";
    Out << "  }\n  ";

    // if it's an array of i8s, also provide a version that takes a const
    // char *
    if (ATy->getElementType() == LLVMTypes::Int8Type) {
        Out << "  static " << NameSoFar << " init(const char *p) {\n";
        Out << "    " << NameSoFar << " ret;\n";
        Out << "    memcpy((uint8_t *)ret.array, (uint8_t *)p, " << NumElements << ");\n";
        Out << "    return ret;\n";
        Out << "  }\n";
    }

    printType(Out, ATy->getElementType(), false,
              "array[" + llvm::utostr(NumElements) + "]");
    return Out << ";\n} ";
  }

  default:
    llvm_unreachable("Unhandled case in printType!");
  }
}

/// Print the operands of a constant array as a comma-separated list, each
/// operand wrapped in parentheses.
///
/// Operands for which is_vec16_i64_ty() returns true are prefixed with a
/// "hilo2zmm" token guarded by "#if defined(KNC)" in the emitted text.
///
/// @param CPA     constant array whose operands are printed
/// @param Static  forwarded unchanged to printConstant()
void CWriter::printConstantArray(llvm::ConstantArray *CPA, bool Static) {
  // Leading operand: identical to the others except that no ", " separator
  // is written in front of it.
  llvm::Constant *First = llvm::cast<llvm::Constant>(CPA->getOperand(0));
  if (is_vec16_i64_ty(First->getType())) {
    // vec16_i64 should be handled separately
    Out << "/* vec16_i64 should be loaded carefully on knc */"
        << "\n#if defined(KNC)\n"
        << "hilo2zmm"
        << "\n#endif\n";
  }
  Out << "(";
  printConstant(First, Static);
  Out << ")";

  // Remaining operands.
  const unsigned Count = CPA->getNumOperands();
  for (unsigned Pos = 1; Pos != Count; ++Pos) {
    llvm::Constant *Elt = llvm::cast<llvm::Constant>(CPA->getOperand(Pos));
    Out << ", ";

    if (is_vec16_i64_ty(Elt->getType())) {
      Out << "/* vec16_i64 should be loaded carefully on knc */"
          << "\n#if defined(KNC) \n"
          << "hilo2zmm"
          << "\n#endif \n";
    }
    Out << "(";
    printConstant(Elt, Static);
    Out << ")";
  }
}

/// Print the operands of a constant vector separated by ", ".
///
/// @param CP      constant vector whose operands are printed
/// @param Static  forwarded unchanged to printConstant()
void CWriter::printConstantVector(llvm::ConstantVector *CP, bool Static) {
  unsigned Pos = 0;
  do {
    // Every operand after the first is preceded by a separator.
    if (Pos != 0)
      Out << ", ";
    printConstant(llvm::cast<llvm::Constant>(CP->getOperand(Pos)), Static);
  } while (++Pos != CP->getNumOperands());
}

/// Print a ConstantDataSequential.
///
/// Data for which isCString() holds is written as a double-quoted C string
/// literal with escapes; any other data is written as a comma-separated list
/// of its elements.
///
/// @param CDS     data to print
/// @param Static  forwarded unchanged to printConstant() for the element list
void CWriter::printConstantDataSequential(llvm::ConstantDataSequential *CDS,
                                          bool Static) {
  if (!CDS->isCString()) {
    // Generic case: element list.
    printConstant(CDS->getElementAsConstant(0), Static);
    const unsigned Count = CDS->getNumElements();
    for (unsigned Pos = 1; Pos != Count; ++Pos) {
      Out << ", ";
      printConstant(CDS->getElementAsConstant(Pos), Static);
    }
    return;
  }

  static const char HexDigits[] = "0123456789ABCDEF";

  // getAsCString() yields the bytes without the terminating null.
  const llvm::StringRef Text = CDS->getAsCString();

  // True while the most recently written item was a "\xNN" escape.
  bool PrevWasHexEscape = false;

  Out << '\"';
  for (unsigned Pos = 0, End = Text.size(); Pos != End; ++Pos) {
    const unsigned char Ch = Text[Pos];

    // A printable character can be written literally, unless it is a hex
    // digit directly following a hex escape: the C compiler would read it as
    // a continuation of that escape.
    const bool Literal = isprint(Ch) && !(PrevWasHexEscape && isxdigit(Ch));
    PrevWasHexEscape = false;

    if (Literal) {
      if (Ch == '"' || Ch == '\\')
        Out << "\\";
      Out << (char)Ch;
      continue;
    }

    switch (Ch) {
      case '\n': Out << "\\n"; break;
      case '\t': Out << "\\t"; break;
      case '\r': Out << "\\r"; break;
      case '\v': Out << "\\v"; break;
      case '\a': Out << "\\a"; break;
      case '\"': Out << "\\\""; break;
      case '\'': Out << "\\\'"; break;
      default:
        // Anything else becomes a two-digit upper-case hex escape.
        Out << "\\x" << HexDigits[Ch >> 4] << HexDigits[Ch & 15];
        PrevWasHexEscape = true;
        break;
    }
  }
  Out << '\"';
}

/// Convert an APFloat with IEEE single or double semantics to text by
/// streaming its value, as a double, through llvm::raw_string_ostream.
///
/// @param V  value to convert
/// @return the text form, or "<unknown format in ftostr>" for any other
///         floating-point semantics
static inline std::string ftostr(const llvm::APFloat& V) {
  // The semantics objects are plain statics up to LLVM 3.9 and accessor
  // functions from LLVM 4.0 on.
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
  const llvm::fltSemantics *const DoubleSem = &llvm::APFloat::IEEEdouble;
  const llvm::fltSemantics *const SingleSem = &llvm::APFloat::IEEEsingle;
#else // LLVM 4.0+
  const llvm::fltSemantics *const DoubleSem = &llvm::APFloat::IEEEdouble();
  const llvm::fltSemantics *const SingleSem = &llvm::APFloat::IEEEsingle();
#endif

  const llvm::fltSemantics *const Sem = &V.getSemantics();
  if (Sem != DoubleSem && Sem != SingleSem)
    return "<unknown format in ftostr>"; // error

  std::string Text;
  llvm::raw_string_ostream OS(Text);
  if (Sem == DoubleSem)
    OS << V.convertToDouble();
  else
    OS << (double)V.convertToFloat();
  return OS.str();
}

// isFPCSafeToPrint - Decide whether CFP can be emitted textually instead of
// being referenced through a stored constant.
//
// Only float and double constants are candidates; every other
// floating-point type yields false.  The value (widened to double for
// floats) is formatted to text, parsed back with atof(), and accepted only
// if the round trip reproduces the same bits.  This relies on us and the
// target C compiler agreeing on the conversion, which is likely since only
// IEEE formats are involved.
//
static bool isFPCSafeToPrint(const llvm::ConstantFP *CFP) {
  llvm::Type *const Ty = CFP->getType();
  const bool IsFloat = (Ty == llvm::Type::getFloatTy(CFP->getContext()));
  const bool IsDouble = (Ty == llvm::Type::getDoubleTy(CFP->getContext()));

  // Do long doubles in hex for now.
  if (!IsFloat && !IsDouble)
    return false;

  // Work on a copy so the widening below does not touch the constant.
  llvm::APFloat Value(CFP->getValueAPF());
  if (IsFloat) {
    bool LosesInfo;
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9 // <= 3.9
    Value.convert(llvm::APFloat::IEEEdouble, llvm::APFloat::rmNearestTiesToEven, &LosesInfo);
#else // LLVM 4.0+
    Value.convert(llvm::APFloat::IEEEdouble(), llvm::APFloat::rmNearestTiesToEven, &LosesInfo);
#endif
  }

#if HAVE_PRINTF_A && ENABLE_CBE_PRINTF_A
  // Hex-float path: accept only output that starts with an optionally
  // signed "0x" prefix.
  char Hex[100];
  sprintf(Hex, "%a", Value.convertToDouble());
  const bool HasHexPrefix = strncmp(Hex, "0x", 2) == 0 ||
                            strncmp(Hex, "-0x", 3) == 0 ||
                            strncmp(Hex, "+0x", 3) == 0;
  if (!HasHexPrefix)
    return false;
  return Value.bitwiseIsEqual(llvm::APFloat(atof(Hex)));
#else
  std::string Text = ftostr(Value);

  // Drop leading blanks.
  Text.erase(0, Text.find_first_not_of(' '));

  // Check to make sure that the stringized number is not some string like "Inf"
  // or NaN.  Check that the string matches the "[-+]?[0-9]" regex.
  const bool StartsWithDigit = (Text[0] >= '0' && Text[0] <= '9');
  const bool StartsWithSignedDigit =
      (Text[0] == '-' || Text[0] == '+') && (Text[1] >= '0' && Text[1] <= '9');
  if (!StartsWithDigit && !StartsWithSignedDigit)
    return false;

  // Reparse stringized version!
  return Value.bitwiseIsEqual(llvm::APFloat(atof(Text.c_str())));
#endif
}

/// Print the opening text of a cast from SrcTy to DstTy for opcode opc.
///
/// Vector destinations, and scalar bitcasts to a non-pointer type, are
/// written as the start of a "__cast_*(" helper call whose first argument
/// only conveys the destination type; the caller must then print the operand
/// and a closing paren.  All other scalar casts are written as a C cast to
/// the destination type, optionally followed by a cast that fixes the
/// signedness or width of the source.
///
/// @param opc    llvm::Instruction cast opcode
/// @param SrcTy  type of the value being cast
/// @param DstTy  type being cast to
/// @return true if the caller must emit a closing paren, false otherwise
/// @brief Print a cast
bool CWriter::printCast(unsigned opc, llvm::Type *SrcTy, llvm::Type *DstTy) {
  if (llvm::isa<const llvm::VectorType>(DstTy)) {
      assert(llvm::isa<const llvm::VectorType>(SrcTy));

      // Choose the helper that implements this opcode for vectors.
      const char *HelperCall = NULL;
      switch (opc) {
      case llvm::Instruction::Trunc:    HelperCall = "__cast_trunc(";   break;
      case llvm::Instruction::ZExt:     HelperCall = "__cast_zext(";    break;
      case llvm::Instruction::SExt:     HelperCall = "__cast_sext(";    break;
      case llvm::Instruction::FPTrunc:  HelperCall = "__cast_fptrunc("; break;
      case llvm::Instruction::FPExt:    HelperCall = "__cast_fpext(";   break;
      case llvm::Instruction::UIToFP:   HelperCall = "__cast_uitofp(";  break;
      case llvm::Instruction::SIToFP:   HelperCall = "__cast_sitofp(";  break;
      case llvm::Instruction::FPToUI:   HelperCall = "__cast_fptoui(";  break;
      case llvm::Instruction::FPToSI:   HelperCall = "__cast_fptosi(";  break;
      case llvm::Instruction::BitCast:  HelperCall = "__cast_bits(";    break;
      case llvm::Instruction::IntToPtr:
      case llvm::Instruction::PtrToInt:
          llvm_unreachable("Invalid vector cast");
      default:
          llvm_unreachable("Invalid cast opcode");
      }
      Out << HelperCall;

      // print a call to the constructor for the destination type for the
      // first arg; this bogus first parameter is only used to convey the
      // desired return type to the callee.
      printType(Out, DstTy);
      Out << "(), ";

      return true;
  }

  // A scalar bitcast to anything but a pointer also goes through a helper,
  // with a typed zero conveying the destination type.
  if (opc == llvm::Instruction::BitCast && !DstTy->isPointerTy()) {
      Out << "__cast_bits((";
      printType(Out, DstTy);
      Out << ")0, ";
      return true;
  }

  // Print the destination type cast
  enum DestSpelling { DS_AsDeclared, DS_ForceUnsigned, DS_ForceSigned };
  DestSpelling Dest = DS_AsDeclared;
  switch (opc) {
    case llvm::Instruction::BitCast:  // pointer destination only, see above
    case llvm::Instruction::UIToFP:
    case llvm::Instruction::SIToFP:
    case llvm::Instruction::IntToPtr:
    case llvm::Instruction::Trunc:
    case llvm::Instruction::FPExt:
    case llvm::Instruction::FPTrunc:  // For these the DstTy sign doesn't matter
      Dest = DS_AsDeclared;
      break;
    case llvm::Instruction::ZExt:
    case llvm::Instruction::PtrToInt:
    case llvm::Instruction::FPToUI:   // For these, make sure we get an unsigned dest
      Dest = DS_ForceUnsigned;
      break;
    case llvm::Instruction::SExt:
    case llvm::Instruction::FPToSI:   // For these, make sure we get a signed dest
      Dest = DS_ForceSigned;
      break;
    default:
      llvm_unreachable("Invalid cast opcode");
  }
  Out << '(';
  if (Dest == DS_AsDeclared)
    printType(Out, DstTy);
  else
    printSimpleType(Out, DstTy, Dest == DS_ForceSigned);
  Out << ')';

  // Print the source type cast
  switch (opc) {
    case llvm::Instruction::UIToFP:
    case llvm::Instruction::ZExt:
    case llvm::Instruction::SIToFP:
    case llvm::Instruction::SExt: {
      // Pin the signedness of the source so the conversion or extension
      // behaves as the opcode requires.
      const bool SrcSigned = (opc == llvm::Instruction::SIToFP ||
                              opc == llvm::Instruction::SExt);
      Out << '(';
      printSimpleType(Out, SrcTy, SrcSigned);
      Out << ')';
      break;
    }
    case llvm::Instruction::IntToPtr:
    case llvm::Instruction::PtrToInt:
      // Avoid "cast to pointer from integer of different size" warnings
      Out << "(unsigned long)";
      break;
    case llvm::Instruction::Trunc:
    case llvm::Instruction::BitCast:
    case llvm::Instruction::FPExt:
    case llvm::Instruction::FPTrunc:
    case llvm::Instruction::FPToSI:
    case llvm::Instruction::FPToUI:
      break; // These don't need a source cast.
    default:
      llvm_unreachable("Invalid cast opcode");
      break;
  }
  return false;
}


/** Build the name of a typed helper function that returns a vector of the
    given element type and width.  For example, if base is "foo", matchType
    is i32 and width is 16, the result is "__foo_i32<__vec16_i32>".

    Supported element types are float, double and integers of 1, 8, 16, 32
    or 64 bits; NULL is returned for anything else.

    The returned string is allocated with strdup(), so it is not released
    unless the caller frees it.
 */
static const char *
lGetTypedFunc(const char *base, llvm::Type *matchType, int width) {
    // fnTag appears in the function name, vecTag in the __vecN_* type name;
    // they differ only for float and double.
    const char *fnTag = NULL;
    const char *vecTag = NULL;

    if (matchType->isFloatTy()) {
        fnTag = "float";
        vecTag = "f";
    }
    else if (matchType->isDoubleTy()) {
        fnTag = "double";
        vecTag = "d";
    }
    else if (matchType->isIntegerTy()) {
        switch (llvm::cast<llvm::IntegerType>(matchType)->getBitWidth()) {
        case 1:  fnTag = vecTag = "i1";  break;
        case 8:  fnTag = vecTag = "i8";  break;
        case 16: fnTag = vecTag = "i16"; break;
        case 32: fnTag = vecTag = "i32"; break;
        case 64: fnTag = vecTag = "i64"; break;
        default: return NULL;
        }
    }
    else
        return NULL;

    char name[64];
    snprintf(name, sizeof(name), "__%s_%s<__vec%d_%s>", base, fnTag, width, vecTag);
    return strdup(name);
}


// printConstant - The LLVM Constant to C Constant converter.
void CWriter::printConstant(llvm::Constant *CPV, bool Static) {
  if (const llvm::ConstantExpr *CE = llvm::dyn_cast<llvm::ConstantExpr>(CPV)) {
    if (llvm::isa<llvm::VectorType>(CPV->getType())) {
        assert(CE->getOpcode() == llvm::Instruction::BitCast);
        llvm::ConstantExpr *Op = llvm::dyn_cast<llvm::ConstantExpr>(CE->getOperand(0));
        assert(Op && Op->getOpcode() == llvm::Instruction::BitCast);
        assert(llvm::isa<llvm::VectorType>(Op->getOperand(0)->getType()));

        Out << "(__cast_bits(";
        printType(Out, CE->getType());
        Out << "(), ";
        printConstant(Op->getOperand(0), Static);
        Out << "))";
        return;
    }
    switch (CE->getOpcode()) {
    case llvm::Instruction::Trunc:
    case llvm::Instruction::ZExt:
    case llvm::Instruction::SExt:
    case llvm::Instruction::FPTrunc:
    case llvm::Instruction::FPExt:
    case llvm::Instruction::UIToFP:
    case llvm::Instruction::SIToFP:
    case llvm::Instruction::FPToUI:
    case llvm::Instruction::FPToSI:
    case llvm::Instruction::PtrToInt:
    case llvm::Instruction::IntToPtr:
    case llvm::Instruction::BitCast: {
      if (CE->getOpcode() == llvm::Instruction::BitCast &&
          CE->getType()->isPointerTy() == false) {
          Out << "__cast_bits((";
          printType(Out, CE->getType());
          Out << ")0, ";
          printConstant(CE->getOperand(0), Static);
          Out << ")";
          return;
      }

      Out << "(";
      bool closeParen = printCast(CE->getOpcode(), CE->getOperand(0)->getType(),
                                  CE->getType());
      if (CE->getOpcode() == llvm::Instruction::SExt &&
          CE->getOperand(0)->getType() == llvm::Type::getInt1Ty(CPV->getContext())) {
        // Make sure we really sext from bool here by subtracting from 0
        Out << "0-";
      }
      printConstant(CE->getOperand(0), Static);
      if (CE->getType() == llvm::Type::getInt1Ty(CPV->getContext()) &&
          (CE->getOpcode() == llvm::Instruction::Trunc ||
           CE->getOpcode() == llvm::Instruction::FPToUI ||
           CE->getOpcode() == llvm::Instruction::FPToSI ||
           CE->getOpcode() == llvm::Instruction::PtrToInt)) {
        // Make sure we really truncate to bool here by anding with 1
        Out << "&1u";
      }
      Out << ')';
      if (closeParen)
          Out << ')';
      return;
    }
    case llvm::Instruction::GetElementPtr:
        assert(!llvm::isa<llvm::VectorType>(CPV->getType()));
        Out << "(";
        printGEPExpression(CE->getOperand(0), gep_type_begin(CPV),
                           gep_type_end(CPV), Static);
        Out << ")";
        return;
    case llvm::Instruction::Select:
        assert(!llvm::isa<llvm::VectorType>(CPV->getType()));
        Out << '(';
        printConstant(CE->getOperand(0), Static);
        Out << '?';
        printConstant(CE->getOperand(1), Static);
        Out << ':';
        printConstant(CE->getOperand(2), Static);
        Out << ')';
        return;
    case llvm::Instruction::Add:
    case llvm::Instruction::FAdd:
    case llvm::Instruction::Sub:
    case llvm::Instruction::FSub:
    case llvm::Instruction::Mul:
    case llvm::Instruction::FMul:
    case llvm::Instruction::SDiv:
    case llvm::Instruction::UDiv:
    case llvm::Instruction::FDiv:
    case llvm::Instruction::URem:
    case llvm::Instruction::SRem:
    case llvm::Instruction::FRem:
    case llvm::Instruction::And:
    case llvm::Instruction::Or:
    case llvm::Instruction::Xor:
    case llvm::Instruction::ICmp:
    case llvm::Instruction::Shl:
    case llvm::Instruction::LShr:
    case llvm::Instruction::AShr:
    {
      assert(!llvm::isa<llvm::VectorType>(CPV->getType()));
      Out << '(';
      bool NeedsClosingParens = printConstExprCast(CE, Static);
      printConstantWithCast(CE->getOperand(0), CE->getOpcode());
      switch (CE->getOpcode()) {
      case llvm::Instruction::Add:
      case llvm::Instruction::FAdd: Out << " + "; break;
      case llvm::Instruction::Sub:
      case llvm::Instruction::FSub: Out << " - "; break;
      case llvm::Instruction::Mul:
      case llvm::Instruction::FMul: Out << " * "; break;
      case llvm::Instruction::URem:
      case llvm::Instruction::SRem:
      case llvm::Instruction::FRem: Out << " % "; break;
      case llvm::Instruction::UDiv:
      case llvm::Instruction::SDiv:
      case llvm::Instruction::FDiv: Out << " / "; break;
      case llvm::Instruction::And: Out << " & "; break;
      case llvm::Instruction::Or:  Out << " | "; break;
      case llvm::Instruction::Xor: Out << " ^ "; break;
      case llvm::Instruction::Shl: Out << " << "; break;
      case llvm::Instruction::LShr:
      case llvm::Instruction::AShr: Out << " >> "; break;
      case llvm::Instruction::ICmp:
        switch (CE->getPredicate()) {
          case llvm::ICmpInst::ICMP_EQ: Out << " == "; break;
          case llvm::ICmpInst::ICMP_NE: Out << " != "; break;
          case llvm::ICmpInst::ICMP_SLT:
          case llvm::ICmpInst::ICMP_ULT: Out << " < "; break;
          case llvm::ICmpInst::ICMP_SLE:
          case llvm::ICmpInst::ICMP_ULE: Out << " <= "; break;
          case llvm::ICmpInst::ICMP_SGT:
          case llvm::ICmpInst::ICMP_UGT: Out << " > "; break;
          case llvm::ICmpInst::ICMP_SGE:
          case llvm::ICmpInst::ICMP_UGE: Out << " >= "; break;
          default: llvm_unreachable("Illegal ICmp predicate");
        }
        break;
      default: llvm_unreachable("Illegal opcode here!");
      }
      printConstantWithCast(CE->getOperand(1), CE->getOpcode());
      if (NeedsClosingParens)
        Out << "))";
      Out << ')';
      return;
    }
    case llvm::Instruction::FCmp: {
      assert(!llvm::isa<llvm::VectorType>(CPV->getType()));
      Out << '(';
      bool NeedsClosingParens = printConstExprCast(CE, Static);
      if (CE->getPredicate() == llvm::FCmpInst::FCMP_FALSE)
        Out << "0";
      else if (CE->getPredicate() == llvm::FCmpInst::FCMP_TRUE)
        Out << "1";
      else {
        const char* op = 0;
        switch (CE->getPredicate()) {
        default: llvm_unreachable("Illegal FCmp predicate");
        case llvm::FCmpInst::FCMP_ORD: op = "ord"; break;
        case llvm::FCmpInst::FCMP_UNO: op = "uno"; break;
        case llvm::FCmpInst::FCMP_UEQ: op = "ueq"; break;
        case llvm::FCmpInst::FCMP_UNE: op = "une"; break;
        case llvm::FCmpInst::FCMP_ULT: op = "ult"; break;
        case llvm::FCmpInst::FCMP_ULE: op = "ule"; break;
        case llvm::FCmpInst::FCMP_UGT: op = "ugt"; break;
        case llvm::FCmpInst::FCMP_UGE: op = "uge"; break;
        case llvm::FCmpInst::FCMP_OEQ: op = "oeq"; break;
        case llvm::FCmpInst::FCMP_ONE: op = "one"; break;
        case llvm::FCmpInst::FCMP_OLT: op = "olt"; break;
        case llvm::FCmpInst::FCMP_OLE: op = "ole"; break;
        case llvm::FCmpInst::FCMP_OGT: op = "ogt"; break;
        case llvm::FCmpInst::FCMP_OGE: op = "oge"; break;
        }
        Out << "llvm_fcmp_" << op << "(";
        printConstantWithCast(CE->getOperand(0), CE->getOpcode());
        Out << ", ";
        printConstantWithCast(CE->getOperand(1), CE->getOpcode());
        Out << ")";
      }
      if (NeedsClosingParens)
        Out << "))";
      Out << ')';
      return;
    }
    default:
#ifndef NDEBUG
      llvm::errs() << "CWriter Error: Unhandled constant expression: "
           << *CE << "\n";
#endif
      llvm_unreachable(0);
    }
  } else if (llvm::isa<llvm::UndefValue>(CPV) && CPV->getType()->isSingleValueType()) {
    if (CPV->getType()->isVectorTy()) {
      printType(Out, CPV->getType());
      Out << "( /* UNDEF */)";
      return;
    }

    Out << "((";
    printType(Out, CPV->getType()); // sign doesn't matter
    Out << ")/*UNDEF*/";
    Out << "0)";
    return;
  }

  if (llvm::ConstantInt *CI = llvm::dyn_cast<llvm::ConstantInt>(CPV)) {
    llvm::Type* Ty = CI->getType();
    if (Ty == llvm::Type::getInt1Ty(CPV->getContext()))
      Out << (CI->getZExtValue() ? '1' : '0');
    else if (Ty == llvm::Type::getInt32Ty(CPV->getContext()))
      Out << CI->getZExtValue() << 'u';
    else if (Ty == llvm::Type::getInt64Ty(CPV->getContext()))
      Out << CI->getZExtValue() << "ull";
    else if (Ty->getPrimitiveSizeInBits() > 64) {
      Out << "\"";
      //const uint64_t *Ptr64 = CPV->getUniqueInteger().getRawData();
      const uint64_t *Ptr64 = CI->getValue().getRawData();
      for (int i = 0; i < Ty->getPrimitiveSizeInBits(); i++) {
        Out << ((Ptr64[i / (sizeof (uint64_t) * 8)] >> (i % (sizeof (uint64_t) * 8))) & 1);
      }
      Out << "\"";
    }
    else {
      Out << "((";
      printSimpleType(Out, Ty, false) << ')';
      if (CI->isMinValue(true))
        Out << CI->getZExtValue() << 'u';
      else
        Out << CI->getSExtValue();
      Out << ')';
    }
    return;
  }

  switch (CPV->getType()->getTypeID()) {
  case llvm::Type::FloatTyID:
  case llvm::Type::DoubleTyID:
  case llvm::Type::X86_FP80TyID:
  case llvm::Type::PPC_FP128TyID:
  case llvm::Type::FP128TyID: {
    llvm::ConstantFP *FPC = llvm::cast<llvm::ConstantFP>(CPV);
    std::map<const llvm::ConstantFP*, unsigned>::iterator I = FPConstantMap.find(FPC);
    if (I != FPConstantMap.end()) {
      // Because of FP precision problems we must load from a stack allocated
      // value that holds the value in hex.
      Out << "(*(" << (FPC->getType() == llvm::Type::getFloatTy(CPV->getContext()) ?
                       "float" :
                       FPC->getType() == llvm::Type::getDoubleTy(CPV->getContext()) ?
                       "double" :
                       "long double")
          << "*)&FPConstant" << I->second << ')';
    } else {
      double V;
      if (FPC->getType() == llvm::Type::getFloatTy(CPV->getContext()))
        V = FPC->getValueAPF().convertToFloat();
      else if (FPC->getType() == llvm::Type::getDoubleTy(CPV->getContext()))
        V = FPC->getValueAPF().convertToDouble();
      else {
        // Long double.  Convert the number to double, discarding precision.
        // This is not awesome, but it at least makes the CBE output somewhat
        // useful.
        llvm::APFloat Tmp = FPC->getValueAPF();
        bool LosesInfo;
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9 // <= 3.9
        Tmp.convert(llvm::APFloat::IEEEdouble, llvm::APFloat::rmTowardZero, &LosesInfo);
#else // LLVM 4.0+
        Tmp.convert(llvm::APFloat::IEEEdouble(), llvm::APFloat::rmTowardZero, &LosesInfo);
#endif
        V = Tmp.convertToDouble();
      }

      if (std::isnan(V)) {
        // The value is NaN

        // FIXME the actual NaN bits should be emitted.
        // The prefix for a quiet NaN is 0x7FF8. For a signalling NaN,
        // it's 0x7ff4.
        const unsigned long QuietNaN = 0x7ff8UL;
        //const unsigned long SignalNaN = 0x7ff4UL;

        // We need to grab the first part of the FP #
        char Buffer[100];

        uint64_t ll = llvm::DoubleToBits(V);
        sprintf(Buffer, "0x%" PRIx64, ll);

        std::string Num(&Buffer[0], &Buffer[6]);
        unsigned long Val = strtoul(Num.c_str(), 0, 16);

        if (FPC->getType() == llvm::Type::getFloatTy(FPC->getContext()))
          Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "F(\""
              << Buffer << "\") /*nan*/ ";
        else
          Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "(\""
              << Buffer << "\") /*nan*/ ";
      } else if (std::isinf(V)) {
        // The value is Inf
        if (V < 0) Out << '-';
        Out << "LLVM_INF" <<
            (FPC->getType() == llvm::Type::getFloatTy(FPC->getContext()) ? "F" : "")
            << " /*inf*/ ";
      } else {
        std::string Num;
#if HAVE_PRINTF_A && ENABLE_CBE_PRINTF_A
        // Print out the constant as a floating point number.
        char Buffer[100];
        sprintf(Buffer, "%a", V);
        Num = Buffer;
#else
        Num = ftostr(FPC->getValueAPF());
#endif
       Out << Num;
      }
    }
    break;
  }

  case llvm::Type::ArrayTyID: {
    llvm::ArrayType *AT = llvm::cast<llvm::ArrayType>(CPV->getType());
    if (Static)
      // arrays are wrapped in structs...
      Out << "{ ";
    else {
      // call init func of the struct it's wrapped in...
      printType(Out, CPV->getType());
      Out << "::init (";
    }
    if (llvm::ConstantArray *CA = llvm::dyn_cast<llvm::ConstantArray>(CPV)) {
      printConstantArray(CA, Static);
    } else if (llvm::ConstantDataSequential *CDS =
               llvm::dyn_cast<llvm::ConstantDataSequential>(CPV)) {
      printConstantDataSequential(CDS, Static);
    } else {
      assert(llvm::isa<llvm::ConstantAggregateZero>(CPV) || llvm::isa<llvm::UndefValue>(CPV));
      if (AT->getNumElements()) {
        Out << ' ';
        llvm::Constant *CZ = llvm::Constant::getNullValue(AT->getElementType());
        printConstant(CZ, Static);
        for (unsigned i = 1, e = (unsigned)AT->getNumElements(); i != e; ++i) {
          Out << ", ";
          printConstant(CZ, Static);
        }
      }
    }
    if (Static)
        Out << " }";
    else
        Out << ")";
    break;
  }
  case llvm::Type::VectorTyID: {
    llvm::VectorType *VT = llvm::dyn_cast<llvm::VectorType>(CPV->getType());

    if (llvm::isa<llvm::ConstantAggregateZero>(CPV)) {
        // All zeros; call the __setzero_* function.
        const char *setZeroFunc = lGetTypedFunc("setzero", VT->getElementType(), vectorWidth);
        assert(setZeroFunc != NULL);
        Out << setZeroFunc << "()";
    }
    else if (llvm::isa<llvm::UndefValue>(CPV)) {
        // Undefined value; call __undef_* so that we can potentially pass
        // this information along..
        const char *undefFunc = lGetTypedFunc("undef", VT->getElementType(), vectorWidth);
        assert(undefFunc != NULL);
        Out << undefFunc << "()";
    }
    else {
        const char *smearFunc = lGetTypedFunc("smear", VT->getElementType(), vectorWidth);

        if (llvm::ConstantVector *CV = llvm::dyn_cast<llvm::ConstantVector>(CPV)) {
            llvm::Constant *splatValue = CV->getSplatValue();
            if (splatValue != NULL && smearFunc != NULL) {
                // If it's a basic type and has a __smear_* function, then
                // call that.
                Out << smearFunc << "(";
                printConstant(splatValue, Static);
                Out << ")";
            }
            else {
                // Otherwise call the constructor for the type
                printType(Out, CPV->getType());
                Out << "(";
                printConstantVector(CV, Static);
                Out << ")";
            }
        }
        else if (llvm::ConstantDataVector *CDV =
                 llvm::dyn_cast<llvm::ConstantDataVector>(CPV)) {
            llvm::Constant *splatValue = CDV->getSplatValue();
            if (splatValue != NULL && smearFunc != NULL) {
                Out << smearFunc << "(";
                printConstant(splatValue, Static);
                Out << ")";
            }
            else if (VectorConstantMap.find(CDV) != VectorConstantMap.end()) {
                // If we have emitted an static const array with the
                // vector's values, just load from it.
                unsigned index = VectorConstantMap[CDV];
                int alignment = 4 * std::min(vectorWidth, 16);

                Out << "__load<" << alignment << ">(";

                // Cast the pointer to the array of element values to a
                // pointer to the vector type.
                Out << "(const ";
                printSimpleType(Out, CDV->getType(), true, "");
                Out << " *)";

                Out << "(VectorConstant" << index << "))";
            }
            else {
                printType(Out, CPV->getType());
                Out << "(";
                printConstantDataSequential(CDV, Static);
                Out << ")";
            }
        }
        else {
            llvm::report_fatal_error("Unexpected vector type");
        }
    }

    break;
  }
  case llvm::Type::StructTyID:
    if (!Static) {
      // call init func...
      printType(Out, CPV->getType());
      Out << "::init";
    }
    if (llvm::isa<llvm::ConstantAggregateZero>(CPV) || llvm::isa<llvm::UndefValue>(CPV)) {
      llvm::StructType *ST = llvm::cast<llvm::StructType>(CPV->getType());
      Out << '(';
      if (ST->getNumElements()) {
        Out << ' ';
        printConstant(llvm::Constant::getNullValue(ST->getElementType(0)), Static);
        for (unsigned i = 1, e = ST->getNumElements(); i != e; ++i) {
          Out << ", ";
          printConstant(llvm::Constant::getNullValue(ST->getElementType(i)), Static);
        }
      }
      Out << ')';
    } else {
      Out << '(';
      if (CPV->getNumOperands()) {
        // It is a kludge. It is needed because we cannot support short vectors
        // when generating code for knl-generic in multitarget mode.
        // Short vectors are mapped to "native" vectors and cause AVX-512 code
        // generation in static block initialization (__vec16_* in ::init function).
        bool isGenericKNL = g->target->getISA() == Target::GENERIC &&
                                                   !g->target->getTreatGenericAsSmth().empty() &&
                                                   g->mangleFunctionsWithTarget;
        if (isGenericKNL && CPV->getOperand(0)->getType()->isVectorTy())
          llvm::report_fatal_error("knl-generic-* target doesn's support short vectors");
        Out << ' ';
        printConstant(llvm::cast<llvm::Constant>(CPV->getOperand(0)), Static);
        for (unsigned i = 1, e = CPV->getNumOperands(); i != e; ++i) {
          Out << ", ";
          if (isGenericKNL && CPV->getOperand(i)->getType()->isVectorTy())
            llvm::report_fatal_error("knl-generic-* target doesn's support short vectors");
          printConstant(llvm::cast<llvm::Constant>(CPV->getOperand(i)), Static);
        }
      }
      Out << ')';
    }
    break;

  case llvm::Type::PointerTyID:
    if (llvm::isa<llvm::ConstantPointerNull>(CPV)) {
      Out << "((";
      printType(Out, CPV->getType()); // sign doesn't matter
      Out << ")/*NULL*/0)";
      break;
    } else if (llvm::GlobalValue *GV = llvm::dyn_cast<llvm::GlobalValue>(CPV)) {
      writeOperand(GV, Static);
      break;
    }
    // FALL THROUGH
  default:
#ifndef NDEBUG
    llvm::errs() << "Unknown constant type: " << *CPV << "\n";
#endif
    llvm_unreachable(0);
  }
}

// printConstExprCast - Emit the opening half of an explicit result cast for a
// constant expression whose operands are printed with their own casts.
//
// For the opcodes handled below this writes "((<type>)(" to Out and returns
// true; both parentheses are left open.  For every other opcode nothing is
// written and false is returned.  The Static argument is not used here.
bool CWriter::printConstExprCast(const llvm::ConstantExpr* CE, bool Static) {
  // By default the cast targets the type of the first operand; the conversion
  // opcodes below switch to the result type of the expression instead.
  llvm::Type *castTy = CE->getOperand(0)->getType();
  bool asSigned = false;

  switch (CE->getOpcode()) {
  // Integer arithmetic is always performed as unsigned, to avoid undefined
  // behavior on overflow.
  case llvm::Instruction::Add:
  case llvm::Instruction::Sub:
  case llvm::Instruction::Mul:
  case llvm::Instruction::LShr:
  case llvm::Instruction::URem:
  case llvm::Instruction::UDiv:
    break;

  case llvm::Instruction::AShr:
  case llvm::Instruction::SRem:
  case llvm::Instruction::SDiv:
    asSigned = true;
    break;

  case llvm::Instruction::SExt:
    castTy = CE->getType();
    asSigned = true;
    break;

  case llvm::Instruction::ZExt:
  case llvm::Instruction::Trunc:
  case llvm::Instruction::FPTrunc:
  case llvm::Instruction::FPExt:
  case llvm::Instruction::UIToFP:
  case llvm::Instruction::SIToFP:
  case llvm::Instruction::FPToUI:
  case llvm::Instruction::FPToSI:
  case llvm::Instruction::PtrToInt:
  case llvm::Instruction::IntToPtr:
  case llvm::Instruction::BitCast:
    castTy = CE->getType();
    break;

  default:
    // No explicit cast is needed for anything else.
    return false;
  }

  // Integer types other than i1 are printed with the selected signedness;
  // for everything else the sign doesn't matter.
  const bool isSignSensitiveInt =
      castTy->isIntegerTy() &&
      castTy != llvm::Type::getInt1Ty(castTy->getContext());

  Out << "((";
  if (isSignSensitiveInt)
    printSimpleType(Out, castTy, asSigned);
  else
    printType(Out, castTy);
  Out << ")(";
  return true;
}

// printConstantWithCast - Print a constant that is an operand of an
// instruction with the given Opcode.  Opcodes that care about the sign of
// their operands get the constant wrapped as "((<type>)<constant>)", using
// the constant's own type printed with the signedness the opcode expects.
// All other opcodes get the bare constant.
void CWriter::printConstantWithCast(llvm::Constant* CPV, unsigned Opcode) {
  bool wantSigned = false;

  switch (Opcode) {
    // Integer arithmetic is always performed as unsigned, to avoid undefined
    // behavior on overflow.
    case llvm::Instruction::Add:
    case llvm::Instruction::Sub:
    case llvm::Instruction::Mul:
    case llvm::Instruction::LShr:
    case llvm::Instruction::UDiv:
    case llvm::Instruction::URem:
      wantSigned = false;
      break;

    case llvm::Instruction::AShr:
    case llvm::Instruction::SDiv:
    case llvm::Instruction::SRem:
      wantSigned = true;
      break;

    default:
      // For most instructions it doesn't matter: print the constant as is.
      printConstant(CPV, false);
      return;
  }

  Out << "((";
  printSimpleType(Out, CPV->getType(), wantSigned);
  Out << ")";
  printConstant(CPV, false);
  Out << ")";
}

// GetValueName - Return the identifier used in the generated code for the
// given value.
//
//  * A global alias is first replaced by the value it resolves to.
//  * Global values are named by passing their LLVM name through CBEMangle.
//  * Any other value uses its LLVM name, or "tmp__<N>" (N unique per value)
//    when it has no name.  Every character outside [A-Za-z0-9_] is replaced
//    by "_<hex>_", and the result gets a "_label" suffix for basic blocks and
//    a "_" suffix otherwise.
std::string CWriter::GetValueName(const llvm::Value *Operand) {

  // Resolve potential alias.
  if (const llvm::GlobalAlias *GA = llvm::dyn_cast<llvm::GlobalAlias>(Operand)) {
#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_5 /* LLVM 3.5+ */
    if (const llvm::Value *V = GA->getAliasee())
#else /* <= LLVM 3.4 */
    if (const llvm::Value *V = GA->resolveAliasedGlobal(false))
#endif
      Operand = V;
  }

  // Globals are mangled directly from their LLVM name.
  if (llvm::isa<llvm::GlobalValue>(Operand))
    return CBEMangle(Operand->getName().str().c_str());

  std::string Name = Operand->getName().str();

  if (Name.empty()) { // Assign unique names to local temporaries.
    unsigned &No = AnonValueNumbers[Operand];
    if (No == 0)
      No = ++NextAnonValueNumber;
    Name = "tmp__" + llvm::utostr(No);
  }

  std::string VarName;
  VarName.reserve(Name.capacity());

  for (std::string::iterator I = Name.begin(), E = Name.end();
       I != E; ++I) {
    char ch = *I;

    if (!((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
          (ch >= '0' && ch <= '9') || ch == '_')) {
      // Convert through unsigned char before formatting: where plain char is
      // signed, a byte >= 0x80 would otherwise be promoted to a negative int
      // and "%x" would print eight hex digits, overrunning the buffer.  With
      // the conversion the output is at most "_ff_" plus the terminating NUL,
      // which is exactly five bytes.
      char buffer[5];
      sprintf(buffer, "_%x_",
              static_cast<unsigned>(static_cast<unsigned char>(ch)));
      VarName += buffer;
    } else
      VarName += ch;
  }

  if (llvm::isa<llvm::BasicBlock>(Operand))
    VarName += "_label";
  else
    VarName += "_";

  return VarName;
}

/// writeInstComputationInline - Emit the computation for the specified
/// instruction inline, with no destination provided.
///
/// An i1 result that does not come from an integer or floating-point compare
/// is wrapped as "((<computation>)&1)" so that it is truncated to a single
/// bit; e.g. "add i1 x, y" must yield "0" when x and y are both true, not "2".
void CWriter::writeInstComputationInline(llvm::Instruction &I) {
  const bool producesBool =
      I.getType() == llvm::Type::getInt1Ty(I.getContext());
  const bool isCompare =
      llvm::isa<llvm::ICmpInst>(I) || llvm::isa<llvm::FCmpInst>(I);

  if (!producesBool || isCompare) {
    // No truncation required; just emit the computation.
    visit(I);
    return;
  }

  Out << "((";
  visit(I);
  Out << ")&1)";
}


// writeOperandInternal - Print an operand without any address-of wrapping.
//
//  * An instruction that is inlinable and is not a direct alloca is expanded
//    in place as a parenthesized expression.
//  * A constant that is not a global value is printed with printConstant.
//  * Everything else is referred to by the name GetValueName gives it.
void CWriter::writeOperandInternal(llvm::Value *Operand, bool Static) {
  // Should we inline this instruction to build a tree?
  llvm::Instruction *inst = llvm::dyn_cast<llvm::Instruction>(Operand);
  if (inst != NULL && isInlinableInst(*inst) && !isDirectAlloca(inst)) {
    Out << '(';
    writeInstComputationInline(*inst);
    Out << ')';
    return;
  }

  llvm::Constant *constant = llvm::dyn_cast<llvm::Constant>(Operand);
  const bool isPlainConstant =
      constant != NULL && !llvm::isa<llvm::GlobalValue>(constant);

  if (!isPlainConstant) {
    Out << GetValueName(Operand);
    return;
  }

  printConstant(constant, Static);
}

// writeOperand - Print an operand.  When isAddressExposed() reports true for
// it, the operand is wrapped as "(&<operand>)", since global variables are
// referenced as their addresses by llvm; otherwise it is printed unchanged.
void CWriter::writeOperand(llvm::Value *Operand, bool Static) {
  if (!isAddressExposed(Operand)) {
    writeOperandInternal(Operand, Static);
    return;
  }

  Out << "(&";
  writeOperandInternal(Operand, Static);
  Out << ')';
}

// writeInstructionCast - Emit the opening half of a cast that converts an
// instruction's result back to the type of its first operand, for those
// instructions whose operands are printed with a sign-specific cast.
//
// For the opcodes handled below this writes "((<type>)(" to Out and returns
// true; both parentheses are left open.  For every other opcode nothing is
// written and false is returned.
bool CWriter::writeInstructionCast(const llvm::Instruction &I) {
  llvm::Type *resultTy = I.getOperand(0)->getType();
  bool resultIsSigned = false;

  switch (I.getOpcode()) {
  // Integer arithmetic is always performed as unsigned, to avoid undefined
  // behavior on overflow.
  case llvm::Instruction::Add:
  case llvm::Instruction::Sub:
  case llvm::Instruction::Mul:
  case llvm::Instruction::LShr:
  case llvm::Instruction::URem:
  case llvm::Instruction::UDiv:
    resultIsSigned = false;
    break;

  case llvm::Instruction::AShr:
  case llvm::Instruction::SRem:
  case llvm::Instruction::SDiv:
    resultIsSigned = true;
    break;

  default:
    return false;
  }

  Out << "((";
  printSimpleType(Out, resultTy, resultIsSigned);
  Out << ")(";
  return true;
}

// writeOperandWithCast - Print an operand of an instruction with the given
// Opcode.  This is used where an instruction has specific requirements
// (usually signedness) for its operands: such operands are wrapped as
// "((<type>)<operand>)", using the operand's own type printed with the
// required signedness.  All other opcodes get the bare operand.
void CWriter::writeOperandWithCast(llvm::Value* Operand, unsigned Opcode) {
  bool castIsSigned = false;

  switch (Opcode) {
    // Integer arithmetic is always performed as unsigned, to avoid undefined
    // behavior on overflow.
    case llvm::Instruction::Add:
    case llvm::Instruction::Sub:
    case llvm::Instruction::Mul:
    case llvm::Instruction::LShr:
    case llvm::Instruction::UDiv:
    case llvm::Instruction::URem:
      // Cast to unsigned first.
      castIsSigned = false;
      break;

    case llvm::Instruction::GetElementPtr:
    case llvm::Instruction::AShr:
    case llvm::Instruction::SDiv:
    case llvm::Instruction::SRem:
      // Cast to signed first.
      castIsSigned = true;
      break;

    default:
      // For most instructions it doesn't matter: print the operand as is.
      writeOperand(Operand);
      return;
  }

  Out << "((";
  printSimpleType(Out, Operand->getType(), castIsSigned);
  Out << ")";
  writeOperand(Operand);
  Out << ")";
}

// writeOperandWithCast - Print an operand of the given icmp instruction.
//
// Operands of relational comparisons are wrapped as "((<type>)<operand>)" so
// that they have the right signedness.  Pointer operands are cast to the
// pointer-sized integer type, both for signedness and so that the C compiler
// doesn't optimize things like "p < NULL" to false (p may contain an integer
// value f.e.).  Operands of non-relational comparisons are printed unchanged.
void CWriter::writeOperandWithCast(llvm::Value* Operand, const llvm::ICmpInst &Cmp) {
  if (!Cmp.isRelational()) {
    writeOperand(Operand);
    return;
  }

  // Should this be a signed comparison?  If so, convert to signed.
  const bool compareAsSigned = Cmp.isSigned();

  // If the operand is a pointer, compare it as a large integer instead.
  llvm::Type *compareTy = Operand->getType();
  if (compareTy->isPointerTy())
    compareTy = TD->getIntPtrType(Operand->getContext());

  Out << "((";
  printSimpleType(Out, compareTy, compareAsSigned);
  Out << ")";
  writeOperand(Operand);
  Out << ")";
}

// generateCompilerSpecificCode - This is where we add conditional compilation
// directives to cater to specific compilers as need be.
//
// Writes a fixed preamble of preprocessor definitions to Out: attribute
// helper macros (__EXTERNAL_WEAK__, __ATTRIBUTE_WEAK__, __HIDDEN__), the
// LLVM_NAN* / LLVM_INF* constant macros, LLVM_ASM, stand-ins for
// __builtin_stack_save/__builtin_stack_restore, and CODE_FOR_MAIN.
// The TD argument is not referenced by this function.
//
static void generateCompilerSpecificCode(llvm::formatted_raw_ostream& Out,
                                         const llvm::DataLayout *TD) {
  // We output GCC specific attributes to preserve 'linkonce'ness on globals.
  // If we aren't being compiled with GCC, just drop these attributes.
  Out << "#ifndef __GNUC__  /* Can only support \"linkonce\" vars with GCC */\n"
      << "#define __attribute__(X)\n"
      << "#endif\n\n";

  // On Mac OS X, "external weak" is spelled "__attribute__((weak_import))".
  Out << "#if defined(__GNUC__) && defined(__APPLE_CC__)\n"
      << "#define __EXTERNAL_WEAK__ __attribute__((weak_import))\n"
      << "#elif defined(__GNUC__)\n"
      << "#define __EXTERNAL_WEAK__ __attribute__((weak))\n"
      << "#else\n"
      << "#define __EXTERNAL_WEAK__\n"
      << "#endif\n\n";

  // For now, turn off the weak linkage attribute on Mac OS X. (See above.)
  Out << "#if defined(__GNUC__) && defined(__APPLE_CC__)\n"
      << "#define __ATTRIBUTE_WEAK__\n"
      << "#elif defined(__GNUC__)\n"
      << "#define __ATTRIBUTE_WEAK__ __attribute__((weak))\n"
      << "#else\n"
      << "#define __ATTRIBUTE_WEAK__\n"
      << "#endif\n\n";

  // Add hidden visibility support. FIXME: APPLE_CC?
  // Like the weak macros above, __HIDDEN__ must expand to nothing on
  // compilers that are not GCC-compatible: declarations of globals with
  // hidden visibility are emitted with a trailing " __HIDDEN__" regardless
  // of the compiler, so the macro has to be defined everywhere.
  Out << "#if defined(__GNUC__)\n"
      << "#define __HIDDEN__ __attribute__((visibility(\"hidden\")))\n"
      << "#else\n"
      << "#define __HIDDEN__\n"
      << "#endif\n\n";

  // Define NaN and Inf as GCC builtins if using GCC or clang (but not the
  // Intel compiler), and in terms of std::numeric_limits for MSVC and the
  // Intel compiler.  Any other compiler hits the #error below.
  // From the GCC documentation:
  //
  //   double __builtin_nan (const char *str)
  //
  // This is an implementation of the ISO C99 function nan.
  //
  // Since ISO C99 defines this function in terms of strtod, which we do
  // not implement, a description of the parsing is in order. The string is
  // parsed as by strtol; that is, the base is recognized by leading 0 or
  // 0x prefixes. The number parsed is placed in the significand such that
  // the least significant bit of the number is at the least significant
  // bit of the significand. The number is truncated to fit the significand
  // field provided. The significand is forced to be a quiet NaN.
  //
  // This function, if given a string literal, is evaluated early enough
  // that it is considered a compile-time constant.
  //
  //   float __builtin_nanf (const char *str)
  //
  // Similar to __builtin_nan, except the return type is float.
  //
  //   double __builtin_inf (void)
  //
  // Similar to __builtin_huge_val, except a warning is generated if the
  // target floating-point format does not support infinities. This
  // function is suitable for implementing the ISO C99 macro INFINITY.
  //
  //   float __builtin_inff (void)
  //
  // Similar to __builtin_inf, except the return type is float.
  Out << "#if (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER)\n"
      << "#define LLVM_NAN(NanStr)   __builtin_nan(NanStr)   /* Double */\n"
      << "#define LLVM_NANF(NanStr)  __builtin_nanf(NanStr)  /* Float */\n"
      << "#define LLVM_NANS(NanStr)  __builtin_nans(NanStr)  /* Double */\n"
      << "#define LLVM_NANSF(NanStr) __builtin_nansf(NanStr) /* Float */\n"
      << "#define LLVM_INF           __builtin_inf()         /* Double */\n"
      << "#define LLVM_INFF          __builtin_inff()        /* Float */\n"
      << "//#define LLVM_PREFETCH(addr,rw,locality) "
                              "__builtin_prefetch(addr,rw,locality)\n"
      << "//#define __ATTRIBUTE_CTOR__ __attribute__((constructor))\n"
      << "//#define __ATTRIBUTE_DTOR__ __attribute__((destructor))\n"
      << "#elif defined(_MSC_VER) || defined(__INTEL_COMPILER)\n"
      << "#include <limits>\n"
      << "#define LLVM_NAN(NanStr)   std::numeric_limits<double>::quiet_NaN()\n"
      << "#define LLVM_NANF(NanStr)  std::numeric_limits<float>::quiet_NaN()\n"
      << "#define LLVM_NANS(NanStr)  std::numeric_limits<double>::signaling_NaN()\n"
      << "#define LLVM_NANSF(NanStr) std::numeric_limits<float>::signaling_NaN()\n"
      << "#define LLVM_INF           std::numeric_limits<double>::infinity()\n"
      << "#define LLVM_INFF          std::numeric_limits<float>::infinity()\n"
      << "//#define LLVM_PREFETCH(addr,rw,locality)            /* PREFETCH */\n"
      << "//#define __ATTRIBUTE_CTOR__\n"
      << "//#define __ATTRIBUTE_DTOR__\n"
      << "#else\n"
      << "#error \"Not MSVC, clang, or g++?\"\n"
      << "#endif\n\n";

  // LLVM_ASM() is used to define mapping of the symbol to a different name,
  // this is expected to be MacOS-only feature. So defining it only for
  // gcc and clang (Intel Compiler on Linux/MacOS is also ok).
  // For example, this feature is required to translate symbols described in
  // "Symbol Variants Release Notes" document (on Apple website).
  Out << "#if (defined(__GNUC__) || defined(__clang__))\n"
      << "#define LLVM_ASM(X) __asm(X)\n"
      << "#endif\n\n";

  Out << "#if defined(__clang__) || defined(__INTEL_COMPILER) || "
             "(__GNUC__ < 4) /* Old GCCs, or compilers not GCC */ \n"
      << "#define __builtin_stack_save() 0   /* not implemented */\n"
      << "#define __builtin_stack_restore(X) /* noop */\n"
      << "#endif\n\n";

#if 0
  // Output typedefs for 128-bit integers. If these are needed with a
  // 32-bit target or with a C compiler that doesn't support mode(TI),
  // more drastic measures will be needed.
  Out << "#if __GNUC__ && __LP64__ /* 128-bit integer types */\n"
      << "typedef int __attribute__((mode(TI))) llvmInt128;\n"
      << "typedef unsigned __attribute__((mode(TI))) llvmUInt128;\n"
      << "#endif\n\n";
#endif

  // Output target-specific code that should be inserted into main.
  Out << "#define CODE_FOR_MAIN() /* Any target-specific code for main()*/\n";
}

/// FindStaticTors - Given a static ctor/dtor list, unpack its contents into
/// the StaticTors set.
///
/// Nothing is collected unless GV's initializer is a ConstantArray.  Elements
/// that are not ConstantStructs are skipped.  Scanning stops at the first
/// struct that does not have exactly two operands, or whose second operand is
/// null.  Otherwise the second operand, looked through one cast constant
/// expression if present, is inserted into StaticTors when it is a Function.
static void FindStaticTors(llvm::GlobalVariable *GV, std::set<llvm::Function*> &StaticTors){
  llvm::ConstantArray *torList =
      llvm::dyn_cast<llvm::ConstantArray>(GV->getInitializer());
  if (torList == NULL)
    return;

  const unsigned numEntries = torList->getNumOperands();
  for (unsigned idx = 0; idx != numEntries; ++idx) {
    llvm::ConstantStruct *entry =
        llvm::dyn_cast<llvm::ConstantStruct>(torList->getOperand(idx));
    if (entry == NULL)
      continue;

    // Not array of 2-element structs.
    if (entry->getNumOperands() != 2)
      return;

    // Found a null terminator, stop scanning.
    llvm::Constant *callee = entry->getOperand(1);
    if (callee->isNullValue())
      return;

    // Look through a cast wrapped around the function pointer.
    llvm::ConstantExpr *castExpr = llvm::dyn_cast<llvm::ConstantExpr>(callee);
    if (castExpr != NULL && castExpr->isCast())
      callee = castExpr->getOperand(0);

    llvm::Function *func = llvm::dyn_cast<llvm::Function>(callee);
    if (func != NULL)
      StaticTors.insert(func);
  }
}

// SpecialGlobalClass - Classification of global variables as returned by
// getGlobalVariableClass().  NotSpecial must remain zero: callers test the
// returned value directly as a boolean.
enum SpecialGlobalClass {
  NotSpecial  = 0,  // Ordinary global, no special handling.
  GlobalCtors = 1,  // The "llvm.global_ctors" list.
  GlobalDtors = 2,  // The "llvm.global_dtors" list.
  NotPrinted  = 3   // Global in the "llvm.metadata" section.
};

/// getGlobalVariableClass - If this is a global that is specially recognized
/// by LLVM, return a code that indicates how we should handle it.
///
/// Returns GlobalCtors / GlobalDtors for an unused, appending-linkage global
/// named "llvm.global_ctors" / "llvm.global_dtors", NotPrinted for a global
/// placed in the "llvm.metadata" section, and NotSpecial otherwise.
static SpecialGlobalClass getGlobalVariableClass(const llvm::GlobalVariable *GV) {
  // If this is a global ctors/dtors list, handle it now.
  const bool mayBeTorList = GV->hasAppendingLinkage() && GV->use_empty();
  if (mayBeTorList) {
    if (GV->getName() == "llvm.global_ctors")
      return GlobalCtors;
    if (GV->getName() == "llvm.global_dtors")
      return GlobalDtors;
  }

  // Otherwise, if it is other metadata, don't print it.  This catches things
  // like debug information.
#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_5 && ISPC_LLVM_VERSION <= ISPC_LLVM_3_8 /* LLVM 3.5-3.8 */
  // Here we compare char *
  const bool inMetadataSection = (strcmp(GV->getSection(), "llvm.metadata") == 0);
#else
  // Here we compare strings
  const bool inMetadataSection = (GV->getSection() == "llvm.metadata");
#endif

  return inMetadataSection ? NotPrinted : NotSpecial;
}

// PrintEscapedString - Print each of the Length characters of Str to Out in a
// form that is safe inside a C string literal.
//
// Printable characters other than backslash and double quote are written
// unchanged; backslash, double quote and tab use their symbolic escapes; every
// other byte is written as a three-digit octal escape.
//
// Octal is used rather than "\x" on purpose: a hexadecimal escape has no
// length limit, so "\x01" followed by a literal 'a' would be read by the C
// compiler as the single escape "\x01a".  An octal escape takes at most three
// digits, so always emitting exactly three keeps each escape unambiguous no
// matter which character follows it.
static void PrintEscapedString(const char *Str, unsigned Length,
                               llvm::raw_ostream &Out) {
  for (unsigned i = 0; i != Length; ++i) {
    unsigned char C = Str[i];
    if (isprint(C) && C != '\\' && C != '"')
      Out << C;
    else if (C == '\\')
      Out << "\\\\";
    else if (C == '\"')
      Out << "\\\"";
    else if (C == '\t')
      Out << "\\t";
    else
      Out << '\\'
          << static_cast<char>('0' + ((C >> 6) & 0x7))
          << static_cast<char>('0' + ((C >> 3) & 0x7))
          << static_cast<char>('0' + (C & 0x7));
  }
}

// PrintEscapedString - Convenience overload for std::string: forwards the
// string's characters and its length to the pointer/length overload, so all
// of the string's characters are escaped and written to Out.
static void PrintEscapedString(const std::string &Str, llvm::raw_ostream &Out) {
  const char *chars = Str.c_str();
  const unsigned numChars = static_cast<unsigned>(Str.size());
  PrintEscapedString(chars, numChars, Out);
}

bool CWriter::doInitialization(llvm::Module &M) {
  llvm::FunctionPass::doInitialization(M);

  // Initialize
  TheModule = &M;

  TD = new llvm::DataLayout(&M);
  IL = new llvm::IntrinsicLowering(*TD);
  IL->AddPrototypes(M);

#if 0
  std::string Triple = TheModule->getTargetTriple();
  if (Triple.empty())
    Triple = llvm::sys::getDefaultTargetTriple();

  std::string E;
  if (const llvm::Target *Match = llvm::TargetRegistry::lookupTarget(Triple, E))
    TAsm = Match->createMCAsmInfo(Triple);
#endif
  TAsm = new CBEMCAsmInfo();
  MRI  = new llvm::MCRegisterInfo();
#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_4 // LLVM 3.4+
  TCtx = new llvm::MCContext(TAsm, MRI, NULL);
#else
  TCtx = new llvm::MCContext(*TAsm, *MRI, NULL);
#endif
  //Mang = new llvm::Mangler(*TCtx, *TD);

  // Keep track of which functions are static ctors/dtors so they can have
  // an attribute added to their prototypes.
  std::set<llvm::Function*> StaticCtors, StaticDtors;
  for (llvm::Module::global_iterator I = M.global_begin(), E = M.global_end();
       I != E; ++I) {
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_7 /* 3.2, 3.3, 3.4, 3.5, 3.6, 3.7 */
    switch (getGlobalVariableClass(I)) {
#else /* LLVM 3.8+ */
    switch (getGlobalVariableClass(&*I)) {
#endif
    default: break;
    case GlobalCtors:
      FindStaticTors(&*I, StaticCtors);
      break;
    case GlobalDtors:
      FindStaticTors(&*I, StaticDtors);
      break;
    }
  }

  Out << "/*******************************************************************\n";
  Out << "  This file has been automatically generated by ispc\n";
  Out << "  DO NOT EDIT THIS FILE DIRECTLY\n";
  Out << " *******************************************************************/\n\n";

  Out << "/* Provide Declarations */\n";
  Out << "#include <stdarg.h>\n";      // Varargs support
  Out << "#include <setjmp.h>\n";      // Unwind support
  Out << "#include <limits.h>\n";      // With overflow intrinsics support.
  Out << "#include <stdlib.h>\n";
  Out << "#ifdef _MSC_VER\n";
  Out << "  #define NOMINMAX\n";
  Out << "  #include <windows.h>\n";
  Out << "#endif // _MSC_VER\n";
  Out << "#include <stdlib.h>\n";
  Out << "#include <stdint.h>\n";
  Out << "/* get a declaration for alloca */\n";
  Out << "#ifdef _MSC_VER\n";
  Out << "  #include <malloc.h>\n";
  Out << "  #define alloca _alloca\n";
  Out << "#else\n";
  Out << "  #include <alloca.h>\n";
  Out << "#endif\n\n";

  if (g->opt.fastMath) {
      Out << "#define ISPC_FAST_MATH 1\n";
  } else {
      Out << "#undef ISPC_FAST_MATH\n";
  }

  if (g->opt.forceAlignedMemory) {
      Out << "#define ISPC_FORCE_ALIGNED_MEMORY\n";
  }

  Out << "#include \"" << includeName << "\"\n";

  Out << "\n/* Basic Library Function Declarations */\n";
  Out << "extern \"C\" {\n";
  Out << "int puts(unsigned char *);\n";
  Out << "unsigned int putchar(unsigned int);\n";
  Out << "int fflush(void *);\n";
  Out << "int printf(const unsigned char *, ...);\n";
  Out << "uint8_t *memcpy(uint8_t *, uint8_t *, uint64_t );\n";
  Out << "uint8_t *memset(uint8_t *, uint8_t, uint64_t );\n";
  Out << "void memset_pattern16(void *, const void *, uint64_t );\n";
  Out << "}\n\n";

  generateCompilerSpecificCode(Out, TD);

  // Provide a definition for `bool' if not compiling with a C++ compiler.
  Out << "\n"
      << "#ifndef __cplusplus\ntypedef unsigned char bool;\n#endif\n"

      << "\n\n/* Support for floating point constants */\n"
      << "typedef uint64_t ConstantDoubleTy;\n"
      << "typedef uint32_t ConstantFloatTy;\n"
      << "typedef struct { unsigned long long f1; unsigned short f2; "
         "unsigned short pad[3]; } ConstantFP80Ty;\n"
      // This is used for both kinds of 128-bit long double; meaning differs.
      << "typedef struct { uint64_t f1, f2; } ConstantFP128Ty;\n"
      << "\n\n/* Global Declarations */\n\n";

  // First output all the declarations for the program, because C requires
  // Functions & globals to be declared before they are used.
  //
  if (!M.getModuleInlineAsm().empty()) {
    Out << "/* Module asm statements */\n"
        << "asm(";

    // Split the string into lines, to make it easier to read the .ll file.
    std::string Asm = M.getModuleInlineAsm();
    size_t CurPos = 0;
    size_t NewLine = Asm.find_first_of('\n', CurPos);
    while (NewLine != std::string::npos) {
      // We found a newline, print the portion of the asm string from the
      // last newline up to this newline.
      Out << "\"";
      PrintEscapedString(std::string(Asm.begin()+CurPos, Asm.begin()+NewLine),
                         Out);
      Out << "\\n\"\n";
      CurPos = NewLine+1;
      NewLine = Asm.find_first_of('\n', CurPos);
    }
    Out << "\"";
    PrintEscapedString(std::string(Asm.begin()+CurPos, Asm.end()), Out);
    Out << "\");\n"
        << "/* End Module asm statements */\n";
  }

  // Loop over the symbol table, emitting all named constants.
  printModuleTypes();

  // Global variable declarations...
  if (!M.global_empty()) {
    Out << "\n/* External Global Variable Declarations */\n";
    for (llvm::Module::global_iterator I = M.global_begin(), E = M.global_end();
         I != E; ++I) {

      if (I->hasExternalLinkage() || I->hasExternalWeakLinkage() ||
          I->hasCommonLinkage())
        Out << "extern ";
#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_5 // LLVM 3.5+
      else if (I->hasDLLImportStorageClass())
#else
      else if (I->hasDLLImportLinkage())
#endif
        Out << "__declspec(dllimport) ";
      else
        continue; // Internal Global

      // Thread Local Storage
      if (I->isThreadLocal())
        Out << "__thread ";

      printType(Out, I->getType()->getElementType(), false, GetValueName(&*I));

      if (I->hasExternalWeakLinkage())
         Out << " __EXTERNAL_WEAK__";
      Out << ";\n";
    }
  }

  // Output the global variable declarations
  if (!M.global_empty()) {
    Out << "\n\n/* Global Variable Declarations */\n";
    for (llvm::Module::global_iterator I = M.global_begin(), E = M.global_end();
         I != E; ++I)
      if (!I->isDeclaration()) {
        // Ignore special globals, such as debug info.
        if (getGlobalVariableClass(&*I))
          continue;

        if (I->hasLocalLinkage())
          continue;
        else
          Out << "extern ";

        // Thread Local Storage
        if (I->isThreadLocal())
          Out << "__thread ";

        printType(Out, I->getType()->getElementType(), false,
                  GetValueName(&*I));

        if (I->hasLinkOnceLinkage())
          Out << " __attribute__((common))";
        else if (I->hasCommonLinkage())     // FIXME is this right?
          Out << " __ATTRIBUTE_WEAK__";
        else if (I->hasWeakLinkage())
          Out << " __ATTRIBUTE_WEAK__";
        else if (I->hasExternalWeakLinkage())
          Out << " __EXTERNAL_WEAK__";
        if (I->hasHiddenVisibility())
          Out << " __HIDDEN__";
        Out << ";\n";
      }
  }

  // Function declarations
  Out << "\n/* Function Declarations */\n";
  Out << "extern \"C\" {\n";

  // Store the intrinsics which will be declared/defined below.
  llvm::SmallVector<const llvm::Function*, 8> intrinsicsToDefine;

  for (llvm::Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
    // Don't print declarations for intrinsic functions.
    // Store the used intrinsics, which need to be explicitly defined.
    if (I->isIntrinsic()) {
      switch (I->getIntrinsicID()) {
        default:
          break;
        case llvm::Intrinsic::uadd_with_overflow:
        case llvm::Intrinsic::sadd_with_overflow:
        case llvm::Intrinsic::umul_with_overflow:
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_7 /* 3.2, 3.3, 3.4, 3.5, 3.6, 3.7 */
          intrinsicsToDefine.push_back(I);
#else /* LLVM 3.8+ */
          intrinsicsToDefine.push_back(&*I);
#endif
          break;
      }
      continue;
    }

    if (I->getName() == "setjmp" || I->getName() == "abort" ||
        I->getName() == "longjmp" || I->getName() == "_setjmp" ||
        I->getName() == "memset" || I->getName() == "memset_pattern16" ||
        I->getName() == "puts" ||
        I->getName() == "printf" || I->getName() == "putchar" ||
        I->getName() == "fflush" ||
        // Memory allocation
        I->getName() == "malloc" ||
        I->getName() == "posix_memalign" ||
        I->getName() == "free" ||
        I->getName() == "_aligned_malloc" ||
        I->getName() == "_aligned_free"
        )
      continue;

    // Don't redeclare ispc's own intrinsics
    std::string name = I->getName();
    if (name.size() > 2 && name[0] == '_' && name[1] == '_')
        continue;

    if (I->hasExternalWeakLinkage())
      Out << "extern ";
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_7 /* 3.2, 3.3, 3.4, 3.5, 3.6, 3.7 */
    printFunctionSignature(I, true);
#else /* LLVM 3.8+ */
    printFunctionSignature(&*I, true);
#endif
    if (I->hasWeakLinkage() || I->hasLinkOnceLinkage())
      Out << " __ATTRIBUTE_WEAK__";
    if (I->hasExternalWeakLinkage())
      Out << " __EXTERNAL_WEAK__";
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_7 /* 3.2, 3.3, 3.4, 3.5, 3.6, 3.7 */
    if (StaticCtors.count(I))
      Out << " __ATTRIBUTE_CTOR__";
    if (StaticDtors.count(I))
#else /* LLVM 3.8+ */
    if (StaticCtors.count(&*I))
      Out << " __ATTRIBUTE_CTOR__";
    if (StaticDtors.count(&*I))
#endif
      Out << " __ATTRIBUTE_DTOR__";
    if (I->hasHiddenVisibility())
      Out << " __HIDDEN__";

    // This is MacOS specific feature, this should not appear on other platforms.
    if (I->hasName() && I->getName()[0] == 1)
      Out << " LLVM_ASM(\"" << I->getName().substr(1) << "\")";

    Out << ";\n";
  }
  Out << "}\n\n";

  if (!M.empty())
    Out << "\n\n/* Function Bodies */\n";

  // Emit some helper functions for dealing with FCMP instruction's
  // predicates
  Out << "template <typename A, typename B> static inline int llvm_fcmp_ord(A X, B Y) { ";
  Out << "return X == X && Y == Y; }\n";
  Out << "template <typename A, typename B> static inline int llvm_fcmp_uno(A X, B Y) { ";
  Out << "return X != X || Y != Y; }\n";
  Out << "template <typename A, typename B> static inline int llvm_fcmp_ueq(A X, B Y) { ";
  Out << "return X == Y || llvm_fcmp_uno(X, Y); }\n";
  Out << "template <typename A, typename B> static inline int llvm_fcmp_une(A X, B Y) { ";
  Out << "return X != Y; }\n";
  Out << "template <typename A, typename B> static inline int llvm_fcmp_ult(A X, B Y) { ";
  Out << "return X <  Y || llvm_fcmp_uno(X, Y); }\n";
  Out << "template <typename A, typename B> static inline int llvm_fcmp_ugt(A X, B Y) { ";
  Out << "return X >  Y || llvm_fcmp_uno(X, Y); }\n";
  Out << "template <typename A, typename B> static inline int llvm_fcmp_ule(A X, B Y) { ";
  Out << "return X <= Y || llvm_fcmp_uno(X, Y); }\n";
  Out << "template <typename A, typename B> static inline int llvm_fcmp_uge(A X, B Y) { ";
  Out << "return X >= Y || llvm_fcmp_uno(X, Y); }\n";
  Out << "template <typename A, typename B> static inline int llvm_fcmp_oeq(A X, B Y) { ";
  Out << "return X == Y ; }\n";
  Out << "template <typename A, typename B> static inline int llvm_fcmp_one(A X, B Y) { ";
  Out << "return X != Y && llvm_fcmp_ord(X, Y); }\n";
  Out << "template <typename A, typename B> static inline int llvm_fcmp_olt(A X, B Y) { ";
  Out << "return X <  Y ; }\n";
  Out << "template <typename A, typename B> static inline int llvm_fcmp_ogt(A X, B Y) { ";
  Out << "return X >  Y ; }\n";
  Out << "template <typename A, typename B> static inline int llvm_fcmp_ole(A X, B Y) { ";
  Out << "return X <= Y ; }\n";
  Out << "template <typename A, typename B> static inline int llvm_fcmp_oge(A X, B Y) { ";
  Out << "return X >= Y ; }\n";
  Out << "template <typename A> A *Memset(A *ptr, int count, size_t len) { ";
  Out << "return (A *)memset(ptr, count, len); }\n";

  // Emit definitions of the intrinsics.
  for (llvm::SmallVector<const llvm::Function*, 8>::const_iterator
       I = intrinsicsToDefine.begin(),
       E = intrinsicsToDefine.end(); I != E; ++I) {
    printIntrinsicDefinition(**I, Out);
  }

  // Output the global variable definitions and contents...
  if (!M.global_empty()) {
    Out << "\n\n/* Global Variable Definitions and Initialization */\n";
    for (llvm::Module::global_iterator I = M.global_begin(), E = M.global_end();
         I != E; ++I)
      if (!I->isDeclaration()) {
        // Ignore special globals, such as debug info.
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_7 /* 3.2, 3.3, 3.4, 3.5, 3.6, 3.7 */
        if (getGlobalVariableClass(I))
#else /* LLVM 3.8+ */
        if (getGlobalVariableClass(&*I))
#endif
          continue;

        if (I->hasLocalLinkage())
          Out << "static ";
#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_5 // LLVM 3.5+
        else if (I->hasDLLImportStorageClass()) Out << "__declspec(dllimport) ";
        else if (I->hasDLLExportStorageClass()) Out << "__declspec(dllexport) ";
#else
        else if (I->hasDLLImportLinkage()) Out << "__declspec(dllimport) ";
        else if (I->hasDLLExportLinkage()) Out << "__declspec(dllexport) ";
#endif
        // Thread Local Storage
        if (I->isThreadLocal())
          Out << "__thread ";

        printType(Out, I->getType()->getElementType(), false,
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_7 /* 3.2, 3.3, 3.4, 3.5, 3.6, 3.7 */
                  GetValueName(I));
#else /* LLVM 3.8+ */
                  GetValueName(&*I));
#endif

        if (I->hasLinkOnceLinkage())
          Out << " __attribute__((common))";
        else if (I->hasWeakLinkage())
          Out << " __ATTRIBUTE_WEAK__";
        else if (I->hasCommonLinkage())
          Out << " __ATTRIBUTE_WEAK__";

        if (I->hasHiddenVisibility())
          Out << " __HIDDEN__";

        // If the initializer is not null, emit the initializer.  If it is null,
        // we try to avoid emitting large amounts of zeros.  The problem with
        // this, however, occurs when the variable has weak linkage.  In this
        // case, the assembler will complain about the variable being both weak
        // and common, so we disable this optimization.
        // FIXME common linkage should avoid this problem.
        if (!I->getInitializer()->isNullValue()) {
          Out << " = " ;

          // vec16_i64 should be handled separately
          if (is_vec16_i64_ty(I->getType()->getElementType())) {
            Out << "/* vec16_i64 should be loaded carefully on knc */\n";
            Out << "\n#if defined(KNC) \n";
            Out << "hilo2zmm";
            Out << "\n#endif \n";
          }

          Out << "(";
          writeOperand(I->getInitializer(), false);
          Out << ")";
        } else if (I->hasWeakLinkage()) {
          // We have to specify an initializer, but it doesn't have to be
          // complete.  If the value is an aggregate, print out { 0 }, and let
          // the compiler figure out the rest of the zeros.
          Out << " = " ;
          if (I->getInitializer()->getType()->isStructTy() ||
              I->getInitializer()->getType()->isVectorTy()) {
            Out << "{ 0 }";
          } else if (I->getInitializer()->getType()->isArrayTy()) {
            // As with structs and vectors, but with an extra set of braces
            // because arrays are wrapped in structs.
            Out << "{ { 0 } }";
          } else {
            // Just print it out normally.
            writeOperand(I->getInitializer(), false);
          }
        }
        Out << ";\n";
      }
  }

  return false;
}


/// Emit definitions for the floating point constants used by \p F that cannot
/// be printed accurately in textual form.
void CWriter::printFloatingPointConstants(llvm::Function &F) {
  // Visit every constant the scanner reports for this function and hand it to
  // the per-constant overload, which decides whether a definition is needed.
  // Referring to such constants by name keeps the output independent of the
  // precision of a printed literal.
  constant_scanner::constant_iterator Cur = constant_scanner::constant_begin(&F);
  constant_scanner::constant_iterator Stop = constant_scanner::constant_end(&F);
  while (Cur != Stop) {
    printFloatingPointConstants(*Cur);
    ++Cur;
  }

  Out << '\n';
}

/// Emit a "static const" bit-pattern definition named FPConstant<N> for \p C
/// when it is a floating point constant that is not safe to print directly
/// and has not been emitted yet.  Constant expressions are searched operand
/// by operand.
void CWriter::printFloatingPointConstants(const llvm::Constant *C) {
  // A constant expression is not an FP constant itself; look at its operands.
  if (const llvm::ConstantExpr *Expr = llvm::dyn_cast<llvm::ConstantExpr>(C)) {
    const unsigned NumOps = Expr->getNumOperands();
    for (unsigned Op = 0; Op < NumOps; ++Op)
      printFloatingPointConstants(Expr->getOperand(Op));
    return;
  }

  const llvm::ConstantFP *FPC = llvm::dyn_cast<llvm::ConstantFP>(C);

  // Not a floating point constant at all.
  if (FPC == 0)
    return;
  // Safe to print in place, so it does not go into FPConstantMap.
  if (isFPCSafeToPrint(FPC))
    return;
  // Already has a definition.
  if (FPConstantMap.count(FPC))
    return;

  // Number the FP constant; the counter itself advances when the name is
  // written below.
  FPConstantMap[FPC] = FPCounter;

  llvm::Type *Ty = FPC->getType();
  const llvm::APFloat &Value = FPC->getValueAPF();

  if (Ty->isDoubleTy()) {
    const double AsDouble = Value.convertToDouble();
    const uint64_t Bits = Value.bitcastToAPInt().getZExtValue();
    Out << "static const ConstantDoubleTy FPConstant" << FPCounter++;
    Out << " = 0x" << llvm::utohexstr(Bits);
    Out << "ULL;    /* " << AsDouble << " */\n";
  } else if (Ty->isFloatTy()) {
    const float AsFloat = Value.convertToFloat();
    const uint32_t Bits = (uint32_t)Value.bitcastToAPInt().getZExtValue();
    Out << "static const ConstantFloatTy FPConstant" << FPCounter++;
    Out << " = 0x" << llvm::utohexstr(Bits);
    Out << "U;    /* " << AsFloat << " */\n";
  } else if (Ty->isX86_FP80Ty()) {
    // The APInt must be a named local so the raw word pointer stays valid
    // while the words are printed.
    llvm::APInt RawBits = Value.bitcastToAPInt();
    const uint64_t *Words = RawBits.getRawData();
    Out << "static const ConstantFP80Ty FPConstant" << FPCounter++;
    Out << " = { 0x" << llvm::utohexstr(Words[0]);
    Out << "ULL, 0x" << llvm::utohexstr((uint16_t)Words[1]) << ",{0,0,0}";
    Out << "}; /* Long double constant */\n";
  } else if (Ty->isPPC_FP128Ty() || Ty->isFP128Ty()) {
    // Both 128-bit formats are written as two 64-bit words.
    llvm::APInt RawBits = Value.bitcastToAPInt();
    const uint64_t *Words = RawBits.getRawData();
    Out << "static const ConstantFP128Ty FPConstant" << FPCounter++;
    Out << " = { 0x";
    Out << llvm::utohexstr(Words[0]) << ", 0x" << llvm::utohexstr(Words[1]);
    Out << "}; /* Long double constant */\n";

  } else {
    llvm_unreachable("Unknown float type!");
  }
}


// Declare a static const array holding the element values of each non-splat
// vector constant used by F.  Having the values in an aligned array lets the
// generated code fetch the vector with an aligned load instead of inserting
// the elements one at a time.
void CWriter::printVectorConstants(llvm::Function &F) {
    constant_scanner::constant_iterator It = constant_scanner::constant_begin(&F);
    constant_scanner::constant_iterator Last = constant_scanner::constant_end(&F);

    for (; It != Last; ++It) {
        const llvm::ConstantDataVector *Vec =
            llvm::dyn_cast<llvm::ConstantDataVector>(*It);

        // Skip anything that is not a data vector, and skip splats: a
        // (more efficient?) __splat_* call will be generated for those.
        if (Vec == NULL || Vec->getSplatValue() != NULL)
            continue;

        // Four bytes per lane, never more than 64 bytes in total.
        const int byteAlign = std::min(vectorWidth, 16) * 4;

        Out << "static const ";
        printSimpleType(Out, Vec->getElementType(), true, "");
        Out << "__attribute__ ((aligned(" << byteAlign << "))) "
            << "VectorConstant" << VectorConstantIndex << "[] = { ";

        // Every element is followed by ", ", including the last one.
        const int numElts = (int)Vec->getNumElements();
        for (int elt = 0; elt < numElts; ++elt) {
            printConstant(Vec->getElementAsConstant(elt), false);
            Out << ", ";
        }
        Out << " };\n";

        // Remember which array belongs to this constant, then move on to the
        // next free index.
        VectorConstantMap[Vec] = VectorConstantIndex;
        ++VectorConstantIndex;
    }
    Out << "\n";
}

/// printModuleTypes - Emit the type-related preamble of the generated file:
/// the llvmBitCastUnion helper, the iN<num_bits> helper template, and then
/// forward declarations and full definitions for the struct and array types
/// found in the module.
///
void CWriter::printModuleTypes() {
  // Union used by the generated code to reinterpret values between integer
  // and floating point representations.
  Out << "\n/* Helper union for bitcasts */\n";
  Out << "typedef union {\n";
  Out << "  unsigned int Int32;\n";
  Out << "  unsigned long long Int64;\n";
  Out << "  float Float;\n";
  Out << "  double Double;\n";
  Out << "} llvmBitCastUnion;\n";
  // Emitted verbatim: a template struct iN<num_bits> storing the value as an
  // array of int, with a constructor from a string of digits, operator>>,
  // operator&, a conversion to uint32_t, and friend __cast_bits / __store
  // helpers.
  // NOTE(review): the emitted string constructor shifts into num[] without
  // zeroing it first - confirm whether that is intended.
  Out << "\n/* This is special class, designed for operations with long int.*/                       \n";
  Out << "namespace {                                                                                \n";
  Out << "template <int num_bits>                                                                    \n";
  Out << "struct iN {                                                                                \n";
  Out << "  int num[num_bits / (sizeof (int) * 8)];                                                  \n";
  Out << "                                                                                           \n";
  Out << "  iN () {}                                                                                 \n";
  Out << "                                                                                           \n";
  Out << "  iN (const char *val) {                                                                   \n";
  Out << "    if (val == NULL)                                                                       \n";
  Out << "      return;                                                                              \n";
  Out << "    int length = num_bits / (sizeof (int) * 8);                                            \n";
  Out << "    int val_len = 0;                                                                       \n";
  Out << "    for (val_len = 0; val[val_len]; (val_len)++);                                          \n";
  Out << "    for (int i = 0; (i < val_len && i < num_bits); i++)                                    \n";
  Out << "      num[i / (sizeof (int) * 8)] = (num[i / (sizeof (int) * 8)] << 1) | (val[i] - '0');   \n";
  Out << "  }                                                                                        \n";
  Out << "                                                                                           \n";
  Out << "  ~iN () {}                                                                                \n";
  Out << "                                                                                           \n";
  Out << "  iN operator >> (const iN rhs) {                                                          \n";
  Out << "    iN res;                                                                                \n";
  Out << "    int length = num_bits / (sizeof (int) * 8);                                            \n";
  Out << "    int cells_shift = rhs.num[0] / (sizeof(int) * 8);                                      \n";
  Out << "    int small_shift = rhs.num[0] % (sizeof(int) * 8);                                      \n";
  Out << "    for (int i = 0; i < (length - cells_shift); i++)                                       \n";
  Out << "      res.num[i] = this->num[cells_shift + i];                                             \n";
  Out << "    for (int i = 0; i < length - 1; i++) {                                                 \n";
  Out << "      res.num[i] = this->num[i] >> small_shift;                                            \n";
  Out << "      res.num[i]  = ((this->num[i + 1] << ((sizeof(int) * 8) - small_shift))) | res.num[i];\n";
  Out << "    }                                                                                      \n";
  Out << "    res.num[length - 1] = res.num[length - 1] >> small_shift;                              \n";
  Out << "    return res;                                                                            \n";
  Out << "  }                                                                                        \n";
  Out << "                                                                                           \n";
  Out << "  iN operator & (iN rhs) {                                                                 \n";
  Out << "    iN res;                                                                                \n";
  Out << "    int length = num_bits / (sizeof (int) * 8);                                            \n";
  Out << "    for (int i = 0; i < length; i++)                                                       \n";
  Out << "      res.num[i] = (this->num[i]) & (rhs.num[i]);                                          \n";
  Out << "    return res;                                                                            \n";
  Out << "  }                                                                                        \n";
  Out << "                                                                                           \n";
  Out << "  operator uint32_t() { return this->num[0]; }                                             \n";
  Out << "                                                                                           \n";
  // The three friend helpers below copy exactly vectorWidth ints; the bound
  // is baked into the emitted text from the writer's vectorWidth member.
  Out << "  template <class T>                                                                       \n";
  Out << "  friend iN<num_bits> __cast_bits(iN<num_bits> to, T from) {                               \n";
  Out << "    for (int i = 0; i <" << vectorWidth << "; i++)                                         \n";
  Out << "      to.num[i] = ((int*)(&from))[i];                                                      \n";
  Out << "    return to;                                                                             \n";
  Out << "  }                                                                                        \n";
  Out << "                                                                                           \n";
  Out << "  template <class T>                                                                       \n";
  Out << "  friend T __cast_bits(T to, iN<num_bits> from) {                                          \n";
  Out << "    for (int i = 0; i <" << vectorWidth << "; i++)                                         \n";
  Out << "      ((int*)(&to))[i] = from.num[i];                                                      \n";
  Out << "    return to;                                                                             \n";
  Out << "  }                                                                                        \n";
  Out << "                                                                                           \n";
  Out << "  template <int ALIGN, class T>                                                            \n";
  Out << "  friend void __store(T *p, iN<num_bits> val) {                                            \n";
  Out << "    for (int i = 0; i <" << vectorWidth << "; i++)                                         \n";
  Out << "      ((int*)p)[i] = val.num[i];                                                           \n";
  Out << "  }                                                                                        \n";
  Out << "};                                                                                         \n";
  Out << "};\n";
  Out << "\n";

  // Get all of the struct types used in the module.
  std::vector<llvm::StructType*> StructTypes;
  llvm::TypeFinder typeFinder;
  typeFinder.run(*TheModule, false);
  for (llvm::TypeFinder::iterator iter = typeFinder.begin();
       iter != typeFinder.end(); ++iter)
      StructTypes.push_back(*iter);

  // Get all of the array types used in the module, together with the wide
  // integer types.  IsVolatile and Alignment are indexed in parallel with
  // IntegerTypes (see their use in the typedef loop below).
  std::vector<llvm::ArrayType*> ArrayTypes;
  std::vector<llvm::IntegerType*> IntegerTypes;
  std::vector<bool> IsVolatile;
  std::vector<int>  Alignment;

  findUsedArrayAndLongIntTypes(TheModule, ArrayTypes, IntegerTypes, IsVolatile, Alignment);

  // Nothing more to emit when the module has neither struct nor array types.
  // NOTE(review): this also skips the aligned wide-integer typedefs below
  // even if IntegerTypes is non-empty - confirm that is intended.
  if (StructTypes.empty() && ArrayTypes.empty())
      return;

  Out << "/* Structure and array forward declarations */\n";

  // Single counter shared by unnamed structs and arrays, so the IDs handed
  // out to UnnamedStructIDs and ArrayIDs never collide.
  unsigned NextTypeID = 0;

  // If any of them are missing names, add a unique ID to UnnamedStructIDs.
  // Print out forward declarations for structure types.
  for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) {
    llvm::StructType *ST = StructTypes[i];

    if (ST->isLiteral() || ST->getName().empty())
      UnnamedStructIDs[ST] = NextTypeID++;

    std::string Name = getStructName(ST);

    Out << "struct " << Name << ";\n";
  }

  // Array wrapper structs are forward-declared inside an anonymous namespace.
  Out << "namespace {\n";
  for (unsigned i = 0, e = ArrayTypes.size(); i != e; ++i) {
      llvm::ArrayType *AT = ArrayTypes[i];
      ArrayIDs[AT] = NextTypeID++;
      std::string Name = getArrayName(AT);
      Out << "  struct " << Name << ";\n";
  }
  Out << "};\n";

  // For each integer type wider than 64 bits that was recorded with a
  // non-zero alignment, emit a packed, aligned wrapper struct named
  // iN_<bits>_align_<alignment> with a single (optionally volatile) member
  // called "data".
  for (unsigned i = 0, e = IntegerTypes.size(); i != e; ++i) {
     llvm::IntegerType *IT = IntegerTypes[i];
      if (IT->getIntegerBitWidth() <= 64 || Alignment[i] == 0)
        continue;

      Out << "typedef struct __attribute__ ((packed, aligned(" << Alignment[i] << "))) {\n  ";
      IsVolatile[i] ? Out << "  volatile " : Out << "  ";
      printType(Out, IT, false, "data");
      Out << ";\n";
      Out << "} iN_" << IT->getIntegerBitWidth() << "_align_" << Alignment[i] << ";\n";
  }

  Out << '\n';

  // Keep track of which types have been printed so far.
  llvm::SmallPtrSet<llvm::Type *, 16> StructArrayPrinted;

  // Loop over all structures and print each one after the types it contains,
  // so they are printed in the correct order.
  //
  Out << "/* Structure and array contents */\n";
  for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) {
    if (StructTypes[i]->isStructTy())
      // Only print out used types!
      printContainedStructs(StructTypes[i], StructArrayPrinted);
  }

  // Arrays not already printed as part of a struct are defined here, again
  // inside an anonymous namespace.
  Out << "namespace {\n";
  for (unsigned i = 0, e = ArrayTypes.size(); i != e; ++i)
    printContainedArrays(ArrayTypes[i], StructArrayPrinted);

  Out << "};\n";
  Out << '\n';
}

// Print the definition of Ty (a struct or an array type) after first printing
// the definitions of the struct/array types it contains.  A type that is
// already recorded in Printed is not printed again.
//
// TODO:  Make this work properly with vector types
//
void CWriter::printContainedStructs(llvm::Type *Ty,
                                    llvm::SmallPtrSet<llvm::Type *, 16> &Printed) {
  // Only structs and arrays are of interest; in particular, pointers are not
  // walked through.
  const bool isAggregate = Ty->isStructTy() || Ty->isArrayTy();
  if (!isAggregate)
    return;

  // Contained types come first.
  llvm::Type::subtype_iterator Sub = Ty->subtype_begin();
  llvm::Type::subtype_iterator SubEnd = Ty->subtype_end();
  while (Sub != SubEnd) {
    printContainedStructs(*Sub, Printed);
    ++Sub;
  }

  // Record this type; stop here if it had been printed before.
#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_6 // LLVM 3.6+
  const bool newlyRecorded = Printed.insert(Ty).second;
#else
  const bool newlyRecorded = Printed.insert(Ty);
#endif
  if (!newlyRecorded)
    return;

  if (llvm::StructType *ST = llvm::dyn_cast<llvm::StructType>(Ty)) {
    // Print structure type out.
    printType(Out, ST, false, getStructName(ST), true);
    Out << ";\n\n";
  } else if (llvm::ArrayType *AT = llvm::dyn_cast<llvm::ArrayType>(Ty)) {
    // Array definitions are wrapped in their own anonymous namespace.
    Out << "namespace {\n";
    printType(Out, AT, false, getArrayName(AT), true);
    Out << ";\n}\n\n";
  }
}

// Print the definition of ATy, preceded by the definition of its element type
// when that element type is itself an array.  Array types already recorded in
// Printed are skipped.
void CWriter::printContainedArrays(llvm::ArrayType *ATy,
                                   llvm::SmallPtrSet<llvm::Type *, 16> &Printed) {
#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_6 // LLVM 3.6+
  const bool firstTime = Printed.insert(ATy).second;
#else
  const bool firstTime = Printed.insert(ATy);
#endif

  if (firstTime) {
    // A nested array must be defined before the array that contains it.
    if (llvm::ArrayType *Inner =
            llvm::dyn_cast<llvm::ArrayType>(ATy->getElementType()))
      printContainedArrays(Inner, Printed);

    printType(Out, ATy, false, getArrayName(ATy), true);
    Out << ";\n\n";
  }
}

/// Print the C signature of \p F to Out: storage class / declspec /
/// calling-convention prefixes, the return type, the function name and the
/// parameter list.  No trailing ';' or '{' is written.
///
/// \param F          Function whose signature is printed.
/// \param Prototype  When true, arguments that have no name are printed
///                   without one; when false every argument is given a name.
///                   Only matters for functions that have a body.
void CWriter::printFunctionSignature(const llvm::Function *F, bool Prototype) {
  /// isStructReturn - Should this function actually return a struct by-value?
  bool isStructReturn = F->hasStructRetAttr();

  if (F->hasLocalLinkage()) Out << "static ";
#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_5 // LLVM 3.5+
  if (F->hasDLLImportStorageClass()) Out << "__declspec(dllimport) ";
  if (F->hasDLLExportStorageClass()) Out << "__declspec(dllexport) ";
#else
  if (F->hasDLLImportLinkage()) Out << "__declspec(dllimport) ";
  if (F->hasDLLExportLinkage()) Out << "__declspec(dllexport) ";
#endif
  // Only the three x86 conventions below get an explicit attribute; every
  // other calling convention is printed without one.
  switch (F->getCallingConv()) {
   case llvm::CallingConv::X86_StdCall:
    Out << "__attribute__((stdcall)) ";
    break;
   case llvm::CallingConv::X86_FastCall:
    Out << "__attribute__((fastcall)) ";
    break;
   case llvm::CallingConv::X86_ThisCall:
    Out << "__attribute__((thiscall)) ";
    break;
   default:
    break;
  }

  // Loop over the arguments, printing them...
  llvm::FunctionType *FT = llvm::cast<llvm::FunctionType>(F->getFunctionType());
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
  const llvm::AttrListPtr &PAL = F->getAttributes();
#else /* LLVM 3.3+ */
  const llvm::AttributeSet &PAL = F->getAttributes();
#endif

  // The "name(args)" part is built in a side buffer first, because it is
  // passed to printType() below as the declarator for the return type.
  std::string tstr;
  llvm::raw_string_ostream FunctionInnards(tstr);

  // Print out the name...
  FunctionInnards << GetValueName(F) << '(';

  bool PrintedArg = false;
  if (!F->isDeclaration()) {
    // The function has a body: walk its Argument objects so that argument
    // names are available.
    if (!F->arg_empty()) {
      llvm::Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
      // Attribute index of the current parameter; parameter indices start
      // at 1.
      unsigned Idx = 1;

      // If this is a struct-return function, don't print the hidden
      // struct-return argument.
      if (isStructReturn) {
        assert(I != E && "Invalid struct return function!");
        ++I;
        ++Idx;
      }

      std::string ArgName;
      for (; I != E; ++I) {
        if (PrintedArg) FunctionInnards << ", ";
        if (I->hasName() || !Prototype)
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_7 /* 3.2, 3.3, 3.4, 3.5, 3.6, 3.7 */
          ArgName = GetValueName(I);
#else /* LLVM 3.8+ */
          ArgName = GetValueName(&*I);
#endif
        else
          ArgName = "";
        llvm::Type *ArgTy = I->getType();
        // A byval pointer argument is printed as the pointee type and the
        // argument is recorded in ByValParams.
        // NOTE(review): the 3.3+ form calls hasAttribute(FunctionIndex, ...)
        // on the set returned by getParamAttributes(Idx); confirm that this
        // really tests the attribute of parameter Idx.
        // The opening brace of this 'if' lives inside both preprocessor
        // branches; the single closing brace follows the #endif blocks.
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
        if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) {
#else /* LLVM 3.3+ */
            if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) {
#endif
          ArgTy = llvm::cast<llvm::PointerType>(ArgTy)->getElementType();
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_7 /* 3.2, 3.3, 3.4, 3.5, 3.6, 3.7 */
          ByValParams.insert(I);
#else /* LLVM 3.8+ */
          ByValParams.insert(&*I);
#endif
        }
        // The second printType() argument is the result of the SExt
        // attribute query for this parameter.
        printType(FunctionInnards, ArgTy,
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
                  PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt),
#else /* LLVM 3.3+ */
                  PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt),
#endif
                  ArgName);
        PrintedArg = true;
        ++Idx;
      }
    }
  } else {
    // Declaration only: there are no named arguments to use, so print the
    // parameter types taken from the function type.
    // Loop over the arguments, printing them.
    llvm::FunctionType::param_iterator I = FT->param_begin(), E = FT->param_end();
    unsigned Idx = 1;

    // If this is a struct-return function, don't print the hidden
    // struct-return argument.
    if (isStructReturn) {
      assert(I != E && "Invalid struct return function!");
      ++I;
      ++Idx;
    }

    for (; I != E; ++I) {
      if (PrintedArg) FunctionInnards << ", ";
      llvm::Type *ArgTy = *I;
      // Same byval handling as above, except that nothing is recorded in
      // ByValParams on this path.
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
      if (PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::ByVal)) {
#else /* LLVM 3.3+ */
          if (PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::ByVal)) {
#endif
        assert(ArgTy->isPointerTy());
        ArgTy = llvm::cast<llvm::PointerType>(ArgTy)->getElementType();
      }
      printType(FunctionInnards, ArgTy,
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
                PAL.getParamAttributes(Idx).hasAttribute(llvm::Attributes::SExt)
#else /* LLVM 3.3+ */
                PAL.getParamAttributes(Idx).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt)
#endif
                );
      PrintedArg = true;
      ++Idx;
    }
  }

  // A vararg function with no printed parameter gets a dummy first
  // parameter, so the ",..." emitted below always follows something.
  if (!PrintedArg && FT->isVarArg()) {
    FunctionInnards << "int vararg_dummy_arg";
    PrintedArg = true;
  }

  // Finish printing arguments... if this is a vararg function, print the ...,
  // unless there are no known types, in which case, we just emit ().
  //
  if (FT->isVarArg() && PrintedArg) {
    FunctionInnards << ",...";  // Output varargs portion of signature!
  } else if (!FT->isVarArg() && !PrintedArg) {
    FunctionInnards << "void"; // ret() -> ret(void) in C.
  }
  FunctionInnards << ')';

  // Get the return type for the function.
  llvm::Type *RetTy;
  if (!isStructReturn)
    RetTy = F->getReturnType();
  else {
    // If this is a struct-return function, print the struct-return type.
    RetTy = llvm::cast<llvm::PointerType>(FT->getParamType(0))->getElementType();
  }

  // Print out the return type and the signature built above.
  printType(Out, RetTy,
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
            PAL.getParamAttributes(0).hasAttribute(llvm::Attributes::SExt),
#else /* LLVM 3.3+ */
            PAL.getParamAttributes(0).hasAttribute(llvm::AttributeSet::ReturnIndex, llvm::Attribute::SExt),
#endif
            FunctionInnards.str());
}

/// Return true when \p I is a bitcast whose source is a floating point type
/// and whose result is an integer type, or the other way round.
static inline bool isFPIntBitCast(const llvm::Instruction &I) {
  if (llvm::isa<llvm::BitCastInst>(I)) {
    llvm::Type *From = I.getOperand(0)->getType();
    llvm::Type *To = I.getType();

    // FP -> int
    if (From->isFloatingPointTy())
      return To->isIntegerTy();
    // int -> FP
    if (To->isFloatingPointTy())
      return From->isIntegerTy();
  }
  return false;
}

/// Writes the complete C definition of \p F: its signature, the struct-return
/// temporary (if any), declarations for every local that needs one, and then
/// the body, basic block by basic block.
void CWriter::printFunction(llvm::Function &F) {
  // Does the function carry the struct-return attribute?
  const bool returnsStruct = F.hasStructRetAttr();

  printFunctionSignature(&F, false);
  Out << " {\n";

  if (returnsStruct) {
    // Declare a local struct plus a pointer with the name of the first
    // argument that refers to it.
    llvm::Type *SRetPtrTy = F.arg_begin()->getType();
    llvm::Type *SRetTy =
      llvm::cast<llvm::PointerType>(SRetPtrTy)->getElementType();

    Out << "  ";
    printType(Out, SRetTy, false, "StructReturn");
    Out << ";  /* Struct return temporary */\n";

    Out << "  ";
    printType(Out, F.arg_begin()->getType(), false,
              GetValueName(&*(F.arg_begin())));
    Out << " = &StructReturn;\n";
  }

  // Declare the locals: direct allocas, every non-void value that is not
  // folded into its user, PHI temporaries and bitcast unions.
  llvm::Type *const VoidTy = llvm::Type::getVoidTy(F.getContext());
  bool declaredAny = false;

  for (llvm::inst_iterator II = inst_begin(&F), IE = inst_end(&F); II != IE; ++II) {
    const llvm::AllocaInst *Slot = isDirectAlloca(&*II);
    if (Slot != NULL) {
      Out << "  ";
      printType(Out, Slot->getAllocatedType(), false, GetValueName(Slot));
      Out << ";    /* Address-exposed local */\n";
      declaredAny = true;
    } else if (II->getType() != VoidTy && !isInlinableInst(*II)) {
      Out << "  ";
      printType(Out, II->getType(), false, GetValueName(&*II));
      Out << ";\n";

      // PHI nodes additionally get a "__PHI" staging variable.
      if (llvm::isa<llvm::PHINode>(*II)) {
        Out << "  ";
        printType(Out, II->getType(), false,
                  GetValueName(&*II)+"__PHI");
        Out << ";\n";
      }
      declaredAny = true;
    }

    // An FP<->int bitcast goes through a union temporary; that temporary is
    // distinct from the variable holding the cast's result.
    if (isFPIntBitCast(*II)) {
      Out << "  llvmBitCastUnion " << GetValueName(&*II)
          << "__BITCAST_TEMPORARY;\n";
      declaredAny = true;
    }
  }

  if (declaredAny)
    Out << '\n';

  if (F.hasExternalLinkage() && F.getName() == "main")
    Out << "  CODE_FOR_MAIN();\n";

  // Emit the body.  Blocks outside any loop are printed directly; a block
  // inside a loop triggers output only when it is the header of an outermost
  // loop, in which case the whole loop is printed.
  for (llvm::Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
    llvm::Loop *Enclosing = LI->getLoopFor(&*BI);
    if (Enclosing == NULL) {
      printBasicBlock(&*BI);
      continue;
    }
#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_9 // LLVM 3.9+
    if (Enclosing->getHeader()->getIterator() == BI && Enclosing->getParentLoop() == 0)
#else
    if (Enclosing->getHeader() == BI && Enclosing->getParentLoop() == 0)
#endif
      printLoop(Enclosing);
  }

  Out << "}\n\n";
}

/// Prints loop \p L wrapped in a "do { ... } while (1);" construct.  Blocks
/// whose innermost loop is \p L are printed directly; directly nested loops
/// are printed recursively when their header is reached.
void CWriter::printLoop(llvm::Loop *L) {
  llvm::BasicBlock *Header = L->getHeader();

  Out << "  do {     /* Syntactic loop '" << Header->getName()
      << "' to make GCC happy */\n";

  const unsigned NumBlocks = L->getBlocks().size();
  for (unsigned Idx = 0; Idx < NumBlocks; ++Idx) {
    llvm::BasicBlock *Block = L->getBlocks()[Idx];
    llvm::Loop *Innermost = LI->getLoopFor(Block);

    if (Innermost == L) {
      printBasicBlock(Block);
      continue;
    }
    // The block belongs to a nested loop: only the header of an immediate
    // child loop starts a recursive print.
    if (Block == Innermost->getHeader() && Innermost->getParentLoop() == L)
      printLoop(Innermost);
  }

  Out << "  } while (1); /* end of syntactic loop '"
      << Header->getName() << "' */\n";
}

/// Prints one basic block: an optional label, every non-terminator
/// instruction that is neither inlined into its user nor a direct alloca,
/// and finally the terminator.
void CWriter::printBasicBlock(llvm::BasicBlock *BB) {

  // A label is only required if some predecessor reaches this block through
  // a goto.  Predecessors are scanned (rather than uses) because PHI nodes
  // reference blocks too without needing a label.
  bool NeedsLabel = false;
  for (llvm::pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
       !NeedsLabel && PI != PE; ++PI)
    NeedsLabel = isGotoCodeNecessary(*PI, BB);

  if (NeedsLabel)
    Out << GetValueName(BB) << ": {\n";

  // Everything up to, but excluding, the terminator.
  llvm::Type *const VoidTy = llvm::Type::getVoidTy(BB->getContext());
  const llvm::BasicBlock::iterator Terminator = --BB->end();
  for (llvm::BasicBlock::iterator Cur = BB->begin(); Cur != Terminator; ++Cur) {
    if (isInlinableInst(*Cur) || isDirectAlloca(&*Cur))
      continue;

    // Value-producing instructions (other than inline asm) are written as an
    // assignment; everything else is just indented.
    if (Cur->getType() != VoidTy && !isInlineAsm(*Cur))
      outputLValue(&*Cur);
    else
      Out << "  ";

    writeInstComputationInline(*Cur);
    Out << ";\n";
  }

  // The terminator is emitted without any prefix or suffix.
  visit(*BB->getTerminator());

  if (NeedsLabel)
    Out << "}\n"; // workaround g++ bug
}


// Specific Instruction type classes... note that all of the casts are
// necessary because we use the instruction classes as opaque types...
//
/// Emits the C "return" statement for \p I.
///
/// Struct-return functions return the "StructReturn" temporary; all other
/// functions return the instruction's operand, or nothing for a void return.
void CWriter::visitReturnInst(llvm::ReturnInst &I) {
  // If this is a struct return function, return the temporary struct.
  if (I.getParent()->getParent()->hasStructRetAttr()) {
    Out << "  return StructReturn;\n";
    return;
  }

  // NOTE: this function used to contain a branch meant to suppress the
  // trailing "return;" of a void function when the return sits in the last
  // basic block.  Its guard ended in "(!I.getParent()->size()) == 1", which
  // compares a negated size against 1 and therefore can never hold for the
  // (non-empty) block that contains I.  The branch was unreachable, so the
  // return statement has always been emitted; the dead test has been removed
  // and the emitted code is unchanged.
  Out << "  return";
  if (I.getNumOperands()) {
    Out << ' ';
    writeOperand(I.getOperand(0));
  }
  Out << ";\n";
}

/// Emits a C "switch" for \p SI.  The default destination is written first,
/// followed by one "case" per switch case; each arm performs the PHI copies
/// for its successor and then branches to it.
void CWriter::visitSwitchInst(llvm::SwitchInst &SI) {
  llvm::BasicBlock *SwitchBB = SI.getParent();
  llvm::BasicBlock *DefaultBB = SI.getDefaultDest();

  Out << "  switch (";
  writeOperand(SI.getCondition());
  Out << ") {\n  default:\n";
  printPHICopiesForSuccessor (SwitchBB, DefaultBB, 2);
  printBranchToBlock(SwitchBB, DefaultBB, 2);
  Out << ";\n";

  for (llvm::SwitchInst::CaseIt Case = SI.case_begin(), End = SI.case_end();
       Case != End; ++Case) {
    llvm::ConstantInt *Label = Case.getCaseValue();
    llvm::BasicBlock *Target = Case.getCaseSuccessor();

    Out << "  case ";
    writeOperand(Label);
    Out << ":\n";
    printPHICopiesForSuccessor (SwitchBB, Target, 2);
    printBranchToBlock(SwitchBB, Target, 2);

    // A "break" is added when the target is the block laid out immediately
    // after the one holding the switch.
#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_5 // LLVM 3.5+
    if (llvm::Function::iterator(Target) == std::next(llvm::Function::iterator(SwitchBB)))
#else
    if (llvm::Function::iterator(Target) == llvm::next(llvm::Function::iterator(SwitchBB)))
#endif
      Out << "    break;\n";
  }

  Out << "  }\n";
}

/// Emits an indirect branch as a computed goto ("goto *(void*)(addr);"),
/// where the address is operand 0 of \p IBI.
void CWriter::visitIndirectBrInst(llvm::IndirectBrInst &IBI) {
  llvm::Value *Address = IBI.getOperand(0);

  Out << "  goto *(void*)(";
  writeOperand(Address);
  Out << ");\n";
}

// An unreachable terminator generates no control flow: only a marker comment
// followed by an empty statement is written.  The instruction itself is not
// inspected.
void CWriter::visitUnreachableInst(llvm::UnreachableInst &I) {
  Out << "  /*UNREACHABLE*/;\n";
}

// Decides whether a branch from block From to block To has to be written as
// an explicit goto.
//
// The function currently answers "yes" unconditionally.  The statements after
// the first return are unreachable; they are the disabled fall-through
// analysis (see the FIXME), which would skip the goto when To is laid out
// directly after From and both blocks belong to the same loop.
bool CWriter::isGotoCodeNecessary(llvm::BasicBlock *From, llvm::BasicBlock *To) {
  /// FIXME: This should be reenabled, but loop reordering safe!!
  return true;

  // ---- disabled code below this line ----
#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_5 // LLVM 3.5+
  if (std::next(llvm::Function::iterator(From)) != llvm::Function::iterator(To))
#else
  if (llvm::next(llvm::Function::iterator(From)) != llvm::Function::iterator(To))
#endif
    return true;  // Not the direct successor, we need a goto.

  //llvm::isa<llvm::SwitchInst>(From->getTerminator())

  // Blocks that live in different loops always need the goto.
  if (LI->getLoopFor(From) != LI->getLoopFor(To))
    return true;
  return false;
}

/// For every PHI node at the top of \p Successor, writes the assignment
/// "<phi>__PHI = <incoming value>;" for the edge coming from \p CurBlock.
/// Undef incoming values are skipped.  Each line is prefixed with \p Indent
/// spaces in addition to the fixed two-space indent.
void CWriter::printPHICopiesForSuccessor (llvm::BasicBlock *CurBlock,
                                          llvm::BasicBlock *Successor,
                                          unsigned Indent) {
  const std::string Pad(Indent, ' ');

  // PHI nodes are grouped at the start of the block; stop at the first
  // instruction that is not one.
  for (llvm::BasicBlock::iterator It = Successor->begin();
       llvm::isa<llvm::PHINode>(It); ++It) {
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_7 /* 3.2, 3.3, 3.4, 3.5, 3.6, 3.7 */
    llvm::PHINode *Phi = llvm::cast<llvm::PHINode>(It);
#else /* LLVM 3.8+ */
    llvm::PHINode *Phi = llvm::cast<llvm::PHINode>(&*It);
#endif
    llvm::Value *Incoming = Phi->getIncomingValueForBlock(CurBlock);
    if (llvm::isa<llvm::UndefValue>(Incoming))
      continue;

    Out << Pad;
    Out << "  " << GetValueName(Phi) << "__PHI = ";
    writeOperand(Incoming);
    Out << ";   /* for PHI node */\n";
  }
}

/// Writes "goto <Succ>;" (indented by \p Indent spaces plus two) when
/// isGotoCodeNecessary() reports that the edge from \p CurBB to \p Succ needs
/// an explicit jump; otherwise writes nothing.
void CWriter::printBranchToBlock(llvm::BasicBlock *CurBB, llvm::BasicBlock *Succ,
                                 unsigned Indent) {
  if (!isGotoCodeNecessary(CurBB, Succ))
    return;

  const std::string Pad(Indent, ' ');
  Out << Pad << "  goto ";
  writeOperand(Succ);
  Out << ";\n";
}

// Branch instruction printing.  Every taken edge first performs the PHI
// copies for its destination and then jumps there; a jump is only written
// when isGotoCodeNecessary() asks for one, so that a branch to the block that
// immediately follows can be left out.
//
void CWriter::visitBranchInst(llvm::BranchInst &I) {
  llvm::BasicBlock *From = I.getParent();

  // Unconditional branch: a single edge, no extra indentation.
  if (!I.isConditional()) {
    llvm::BasicBlock *Dest = I.getSuccessor(0);
    printPHICopiesForSuccessor (From, Dest, 0);
    printBranchToBlock(From, Dest, 0);
    Out << "\n";
    return;
  }

  llvm::BasicBlock *IfTrue = I.getSuccessor(0);
  llvm::BasicBlock *IfFalse = I.getSuccessor(1);

  if (isGotoCodeNecessary(From, IfTrue)) {
    Out << "  if (";
    writeOperand(I.getCondition());
    Out << ") {\n";

    printPHICopiesForSuccessor (From, IfTrue, 2);
    printBranchToBlock(From, IfTrue, 2);

    // The else arm is only needed if the false edge requires a jump too.
    if (isGotoCodeNecessary(From, IfFalse)) {
      Out << "  } else {\n";
      printPHICopiesForSuccessor (From, IfFalse, 2);
      printBranchToBlock(From, IfFalse, 2);
    }
  } else {
    // The true edge falls through; test the negated condition and assume
    // the false edge needs the jump.
    Out << "  if (!";
    writeOperand(I.getCondition());
    Out << ") {\n";

    printPHICopiesForSuccessor (From, IfFalse, 2);
    printBranchToBlock(From, IfFalse, 2);
  }

  Out << "  }\n";
  Out << "\n";
}

// PHI nodes get copied into temporary values at the end of predecessor basic
// blocks (see printPHICopiesForSuccessor, which assigns "<name>__PHI").  We
// now need to copy these temporary values into the REAL value for the PHI.
//
// This visitor only writes the right-hand side of that copy: the PHI's own
// operand text immediately followed by the "__PHI" suffix.
void CWriter::visitPHINode(llvm::PHINode &I) {
  writeOperand(&I);
  Out << "__PHI";
}


// Writes the C expression for a binary operator.
//
// Vector operands are lowered to a call "__op(lhs, rhs)" on a named helper
// (__add, __sub, ...).  Scalar operands are written as an infix C expression,
// with special forms for negation and for FRem (fmod family).
void CWriter::visitBinaryOperator(llvm::Instruction &I) {
  // binary instructions, shift instructions, setCond instructions.
  assert(!I.getType()->isPointerTy());

  // ---- Vector path: emit a helper call and return early. ----
  if (llvm::isa<const llvm::VectorType>(I.getOperand(0)->getType())) {
      const char *intrinsic = NULL;
      // Integer and floating-point add/sub/mul share a helper name; the
      // remaining opcodes each have their own.
      switch (I.getOpcode()) {
      case llvm::Instruction::Add:  intrinsic = "__add";  break;
      case llvm::Instruction::FAdd: intrinsic = "__add";  break;
      case llvm::Instruction::Sub:  intrinsic = "__sub";  break;
      case llvm::Instruction::FSub: intrinsic = "__sub";  break;
      case llvm::Instruction::Mul:  intrinsic = "__mul";  break;
      case llvm::Instruction::FMul: intrinsic = "__mul";  break;
      case llvm::Instruction::URem: intrinsic = "__urem"; break;
      case llvm::Instruction::SRem: intrinsic = "__srem"; break;
      case llvm::Instruction::FRem: intrinsic = "__frem"; break;
      case llvm::Instruction::UDiv: intrinsic = "__udiv"; break;
      case llvm::Instruction::SDiv: intrinsic = "__sdiv"; break;
      case llvm::Instruction::FDiv: intrinsic = "__div";  break;
      case llvm::Instruction::And:  intrinsic = "__and";  break;
      case llvm::Instruction::Or:   intrinsic = "__or";   break;
      case llvm::Instruction::Xor:  intrinsic = "__xor";  break;
      case llvm::Instruction::Shl : intrinsic = "__shl";  break;
      case llvm::Instruction::LShr: intrinsic = "__lshr"; break;
      case llvm::Instruction::AShr: intrinsic = "__ashr"; break;
      default:
#ifndef NDEBUG
          llvm::errs() << "Invalid operator type!" << I;
#endif
          llvm_unreachable(0);
      }
      Out << intrinsic;
      Out << "(";
      writeOperand(I.getOperand(0));
      Out << ", ";
      // For shifts whose shift-amount vector has all-equal elements, pass a
      // single scalar amount instead of the vector.
      if ((I.getOpcode() == llvm::Instruction::Shl ||
           I.getOpcode() == llvm::Instruction::LShr ||
           I.getOpcode() == llvm::Instruction::AShr)) {
          llvm::Value *splat = NULL;
          if (LLVMVectorValuesAllEqual(I.getOperand(1), &splat)) {
              if (splat) {
                  // Avoid __extract_element(splat(value), 0), if possible.
                  writeOperand(splat);
              } else {
                  // No scalar was handed back: take element 0 of the vector.
                  Out << "__extract_element(";
                  writeOperand(I.getOperand(1));
                  Out << ", 0) ";
              }
          }
          else
              writeOperand(I.getOperand(1));
      }
      else
          writeOperand(I.getOperand(1));
      Out << ")";
      return;
  }

  // ---- Scalar path. ----

  // We must cast the results of binary operations which might be promoted.
  // This applies to i8, i16 and float results; the matching "))" is written
  // at the very end of the function.
  bool needsCast = false;
  if ((I.getType() == llvm::Type::getInt8Ty(I.getContext())) ||
      (I.getType() == llvm::Type::getInt16Ty(I.getContext()))
      || (I.getType() == llvm::Type::getFloatTy(I.getContext()))) {
    needsCast = true;
    Out << "((";
    printType(Out, I.getType(), false);
    Out << ")(";
  }

  // If this is a negation operation, print it out as such.  For FP, we don't
  // want to print "-0.0 - X".
  if (llvm::BinaryOperator::isNeg(&I)) {
    Out << "-(";
    writeOperand(llvm::BinaryOperator::getNegArgument(llvm::cast<llvm::BinaryOperator>(&I)));
    Out << ")";
  } else if (llvm::BinaryOperator::isFNeg(&I)) {
    Out << "-(";
    writeOperand(llvm::BinaryOperator::getFNegArgument(llvm::cast<llvm::BinaryOperator>(&I)));
    Out << ")";
  } else if (I.getOpcode() == llvm::Instruction::FRem) {
    // Output a call to fmod/fmodf instead of emitting a%b
    if (I.getType() == llvm::Type::getFloatTy(I.getContext()))
      Out << "fmodf(";
    else if (I.getType() == llvm::Type::getDoubleTy(I.getContext()))
      Out << "fmod(";
    else  // all 3 flavors of long double
      Out << "fmodl(";
    writeOperand(I.getOperand(0));
    Out << ", ";
    writeOperand(I.getOperand(1));
    Out << ")";
  } else {

    // Write out the cast of the instruction's value back to the proper type
    // if necessary.
    bool NeedsClosingParens = writeInstructionCast(I);

    // Certain instructions require the operand to be forced to a specific type
    // so we use writeOperandWithCast here instead of writeOperand. Similarly
    // below for operand 1
    writeOperandWithCast(I.getOperand(0), I.getOpcode());

    // Infix operator.  Signedness is not visible in the operator text.  The
    // FRem label cannot be reached here because FRem is handled by the
    // branch above.
    switch (I.getOpcode()) {
    case llvm::Instruction::Add:
    case llvm::Instruction::FAdd: Out << " + "; break;
    case llvm::Instruction::Sub:
    case llvm::Instruction::FSub: Out << " - "; break;
    case llvm::Instruction::Mul:
    case llvm::Instruction::FMul: Out << " * "; break;
    case llvm::Instruction::URem:
    case llvm::Instruction::SRem:
    case llvm::Instruction::FRem: Out << " % "; break;
    case llvm::Instruction::UDiv:
    case llvm::Instruction::SDiv:
    case llvm::Instruction::FDiv: Out << " / "; break;
    case llvm::Instruction::And:  Out << " & "; break;
    case llvm::Instruction::Or:   Out << " | "; break;
    case llvm::Instruction::Xor:  Out << " ^ "; break;
    case llvm::Instruction::Shl : Out << " << "; break;
    case llvm::Instruction::LShr:
    case llvm::Instruction::AShr: Out << " >> "; break;
    default:
#ifndef NDEBUG
       llvm::errs() << "Invalid operator type!" << I;
#endif
       llvm_unreachable(0);
    }

    writeOperandWithCast(I.getOperand(1), I.getOpcode());
    if (NeedsClosingParens)
      Out << "))";
  }

  // Close the promotion cast opened above.
  if (needsCast) {
    Out << "))";
  }
}


/// Maps a comparison predicate to the name prefix of the vector comparison
/// helper.  Integer predicates keep their signedness in the name; for
/// floating point, the ordered and unordered flavor of a comparison map to
/// the same name.
static const char *
lPredicateToString(llvm::CmpInst::Predicate p) {
    switch (p) {
    // Equality (integer and floating point).
    case llvm::ICmpInst::ICMP_EQ:
    case llvm::FCmpInst::FCMP_UEQ:
    case llvm::FCmpInst::FCMP_OEQ:
        return "__equal";
    case llvm::ICmpInst::ICMP_NE:
    case llvm::FCmpInst::FCMP_UNE:
    case llvm::FCmpInst::FCMP_ONE:
        return "__not_equal";

    // Integer orderings.
    case llvm::ICmpInst::ICMP_ULE:
        return "__unsigned_less_equal";
    case llvm::ICmpInst::ICMP_SLE:
        return "__signed_less_equal";
    case llvm::ICmpInst::ICMP_UGE:
        return "__unsigned_greater_equal";
    case llvm::ICmpInst::ICMP_SGE:
        return "__signed_greater_equal";
    case llvm::ICmpInst::ICMP_ULT:
        return "__unsigned_less_than";
    case llvm::ICmpInst::ICMP_SLT:
        return "__signed_less_than";
    case llvm::ICmpInst::ICMP_UGT:
        return "__unsigned_greater_than";
    case llvm::ICmpInst::ICMP_SGT:
        return "__signed_greater_than";

    // Floating-point orderings.
    case llvm::FCmpInst::FCMP_ORD:
        return "__ordered";
    case llvm::FCmpInst::FCMP_UNO:
        return "__unordered";
    case llvm::FCmpInst::FCMP_ULT:
    case llvm::FCmpInst::FCMP_OLT:
        return "__less_than";
    case llvm::FCmpInst::FCMP_ULE:
    case llvm::FCmpInst::FCMP_OLE:
        return "__less_equal";
    case llvm::FCmpInst::FCMP_UGT:
    case llvm::FCmpInst::FCMP_OGT:
        return "__greater_than";
    case llvm::FCmpInst::FCMP_UGE:
    case llvm::FCmpInst::FCMP_OGE:
        return "__greater_equal";

    default:
        llvm_unreachable(0);
        return NULL;
    }
}


/// Returns the helper-name suffix for the element type of vector type \p t
/// ("float", "double", "i1", "i8", "i16", "i32" or "i64").  \p t must be a
/// vector type; any other element type is treated as unreachable.
static const char *
lTypeToSuffix(llvm::Type *t) {
    llvm::VectorType *vecType = llvm::dyn_cast<llvm::VectorType>(t);
    Assert(vecType != NULL);
    llvm::Type *elemType = vecType->getElementType();

    const llvm::Type::TypeID id = elemType->getTypeID();
    if (id == llvm::Type::FloatTyID)
        return "float";
    if (id == llvm::Type::DoubleTyID)
        return "double";
    if (id == llvm::Type::IntegerTyID) {
        switch (llvm::cast<llvm::IntegerType>(elemType)->getBitWidth()) {
        case 1:  return "i1";
        case 8:  return "i8";
        case 16: return "i16";
        case 32: return "i32";
        case 64: return "i64";
        default: break;   // unsupported width: handled below
        }
    }

    llvm_unreachable(0);
    return NULL;
}


/// Writes an integer comparison.  Vector comparisons become a call to
/// "<predicate>_<elemtype>(lhs, rhs)"; scalar comparisons become an infix C
/// expression whose operands are cast as the predicate requires.
void CWriter::visitICmpInst(llvm::ICmpInst &I) {
  llvm::Type *OperandTy = I.getOperand(0)->getType();

  if (llvm::isa<llvm::VectorType>(OperandTy)) {
      Out << lPredicateToString(I.getPredicate()) << "_";
      Out << lTypeToSuffix(OperandTy) << "(";
      writeOperand(I.getOperand(0));
      Out << ", ";
      writeOperand(I.getOperand(1));
      Out << ")";
      return;
  }

  // Cast the result back to the instruction's type if that is required.
  const bool NeedsClosingParens = writeInstructionCast(I);

  // Some predicates need their operands forced to a particular type, hence
  // writeOperandWithCast rather than writeOperand for both sides.
  writeOperandWithCast(I.getOperand(0), I);

  // Signed and unsigned flavors share the same C operator.
  const char *OpText = NULL;
  switch (I.getPredicate()) {
  case llvm::ICmpInst::ICMP_EQ:
    OpText = " == ";
    break;
  case llvm::ICmpInst::ICMP_NE:
    OpText = " != ";
    break;
  case llvm::ICmpInst::ICMP_ULE:
  case llvm::ICmpInst::ICMP_SLE:
    OpText = " <= ";
    break;
  case llvm::ICmpInst::ICMP_UGE:
  case llvm::ICmpInst::ICMP_SGE:
    OpText = " >= ";
    break;
  case llvm::ICmpInst::ICMP_ULT:
  case llvm::ICmpInst::ICMP_SLT:
    OpText = " < ";
    break;
  case llvm::ICmpInst::ICMP_UGT:
  case llvm::ICmpInst::ICMP_SGT:
    OpText = " > ";
    break;
  default:
#ifndef NDEBUG
    llvm::errs() << "Invalid icmp predicate!" << I;
#endif
    llvm_unreachable(0);
  }
  Out << OpText;

  writeOperandWithCast(I.getOperand(1), I);
  if (NeedsClosingParens)
    Out << "))";
}

/// Writes a floating-point comparison.  The constant predicates become "0" or
/// "1" (not supported for vectors); vector comparisons call
/// "<predicate>_<elemtype>(lhs, rhs)" and scalar ones call
/// "llvm_fcmp_<pred>(lhs, rhs)".
void CWriter::visitFCmpInst(llvm::FCmpInst &I) {
  const llvm::CmpInst::Predicate Pred = I.getPredicate();
  llvm::Type *OperandTy = I.getOperand(0)->getType();
  const bool isVector = llvm::isa<llvm::VectorType>(OperandTy);

  // Predicates that ignore their operands.
  if (Pred == llvm::FCmpInst::FCMP_FALSE || Pred == llvm::FCmpInst::FCMP_TRUE) {
    const bool alwaysTrue = (Pred == llvm::FCmpInst::FCMP_TRUE);
    if (isVector)
      llvm::report_fatal_error(alwaysTrue ? "FIXME: vector FCMP_TRUE"
                                          : "FIXME: vector FCMP_FALSE");
    else
      Out << (alwaysTrue ? "1" : "0");
    return;
  }

  // Callee name and opening parenthesis.
  if (isVector) {
    Out << lPredicateToString(Pred) << "_";
    Out << lTypeToSuffix(OperandTy) << "(";
  } else {
    const char *Suffix = 0;
    switch (Pred) {
    case llvm::FCmpInst::FCMP_ORD: Suffix = "ord"; break;
    case llvm::FCmpInst::FCMP_UNO: Suffix = "uno"; break;

    case llvm::FCmpInst::FCMP_OEQ: Suffix = "oeq"; break;
    case llvm::FCmpInst::FCMP_ONE: Suffix = "one"; break;
    case llvm::FCmpInst::FCMP_OLT: Suffix = "olt"; break;
    case llvm::FCmpInst::FCMP_OLE: Suffix = "ole"; break;
    case llvm::FCmpInst::FCMP_OGT: Suffix = "ogt"; break;
    case llvm::FCmpInst::FCMP_OGE: Suffix = "oge"; break;

    case llvm::FCmpInst::FCMP_UEQ: Suffix = "ueq"; break;
    case llvm::FCmpInst::FCMP_UNE: Suffix = "une"; break;
    case llvm::FCmpInst::FCMP_ULT: Suffix = "ult"; break;
    case llvm::FCmpInst::FCMP_ULE: Suffix = "ule"; break;
    case llvm::FCmpInst::FCMP_UGT: Suffix = "ugt"; break;
    case llvm::FCmpInst::FCMP_UGE: Suffix = "uge"; break;

    default: llvm_unreachable("Illegal FCmp predicate");
    }
    Out << "llvm_fcmp_" << Suffix << "(";
  }

  // Both forms take the two operands as arguments.
  writeOperand(I.getOperand(0));
  Out << ", ";
  writeOperand(I.getOperand(1));
  Out << ")";
}

/// Returns the name of the bitcast-union field used for a value of type
/// \p Ty: "Float", "Double", or "Int32"/"Int64" for integers of up to 32 bits
/// and wider ones respectively.  Any other type is invalid.
static const char * getFloatBitCastField(llvm::Type *Ty) {
  const llvm::Type::TypeID Kind = Ty->getTypeID();

  if (Kind == llvm::Type::FloatTyID)
    return "Float";
  if (Kind == llvm::Type::DoubleTyID)
    return "Double";
  if (Kind == llvm::Type::IntegerTyID) {
    const unsigned Width = llvm::cast<llvm::IntegerType>(Ty)->getBitWidth();
    return (Width <= 32) ? "Int32" : "Int64";
  }

  llvm_unreachable("Invalid Type");
}

/// Writes the C expression for a cast instruction.
///
///  - FP<->integer bitcasts store into, and read back from, the instruction's
///    "__BITCAST_TEMPORARY" union.
///  - Casts from a non-vector to a vector type just write the operand.
///  - Everything else is written as "(<cast> operand)", with fix-ups for
///    sign-extension from i1 and for conversions that produce an i1.
void CWriter::visitCastInst(llvm::CastInst &I) {
  llvm::Value *Src = I.getOperand(0);
  llvm::Type *SrcTy = Src->getType();
  llvm::Type *DstTy = I.getType();

  if (isFPIntBitCast(I)) {
    // These int<->float and long<->double casts need to be handled specially:
    // "(tmp.<srcfield> = operand, tmp.<dstfield>)".
    Out << '(';
    Out << GetValueName(&I) << "__BITCAST_TEMPORARY."
        << getFloatBitCastField(SrcTy) << " = ";
    writeOperand(Src);
    Out << ", ";
    Out << GetValueName(&I) << "__BITCAST_TEMPORARY."
        << getFloatBitCastField(DstTy);
    Out << ')';
    return;
  }

  const bool dstIsVector = llvm::isa<llvm::VectorType>(DstTy);
  const bool srcIsVector = llvm::isa<llvm::VectorType>(SrcTy);
  if (dstIsVector && !srcIsVector) {
    writeOperand(Src);
    return;
  }

  llvm::Type *const BoolTy = llvm::Type::getInt1Ty(I.getContext());

  Out << '(';
  const bool closeParen = printCast(I.getOpcode(), SrcTy, DstTy);

  // Make a sext from i1 work by subtracting the i1 from 0 (an int).
  if (I.getOpcode() == llvm::Instruction::SExt && SrcTy == BoolTy)
    Out << "0-";

  writeOperand(Src);

  // Make sure we really get a trunc to bool by anding the operand with 1.
  if (DstTy == BoolTy) {
    switch (I.getOpcode()) {
    case llvm::Instruction::Trunc:
    case llvm::Instruction::FPToUI:
    case llvm::Instruction::FPToSI:
    case llvm::Instruction::PtrToInt:
      Out << "&1u";
      break;
    default:
      break;
    }
  }

  Out << ')';
  if (closeParen)
      Out << ')';
}

/// Writes a select.  A vector-typed select becomes
/// "__select(cond, a, b)"; any other select becomes the C conditional
/// expression "((cond) ? (a) : (b))".
void CWriter::visitSelectInst(llvm::SelectInst &I) {
  const bool isVector = llvm::isa<llvm::VectorType>(I.getType());

  // The two forms differ only in the text surrounding the three operands.
  Out << (isVector ? "__select(" : "((");
  writeOperand(I.getCondition());
  Out << (isVector ? ", " : ") ? (");
  writeOperand(I.getTrueValue());
  Out << (isVector ? ", " : ") : (");
  writeOperand(I.getFalseValue());
  Out << (isVector ? ")" : "))");
}

// Prints to Out the maximum or minimum value of an integer type: either the
// name of the corresponding limits.h macro (e.g. SCHAR_MIN, INT_MAX,
// ULLONG_MAX) or, for the minimum of an unsigned type, the literal 0.
//
//   Ty       - integer type; widths are rounded up to 8, 16, 32 or 64 bits,
//              anything wider is not implemented.
//   isSigned - select the signed or the unsigned limit.
//   isMax    - select the maximum (true) or the minimum (false).
//   Out      - stream that receives the text.
static void printLimitValue(llvm::IntegerType &Ty, bool isSigned, bool isMax,
                            llvm::raw_ostream &Out) {
  const char* type = "";
  // Only the char family needs an explicit "S" to name the signed macro
  // (SCHAR_MAX vs. SHRT_MAX / INT_MAX / LLONG_MAX).
  const char* sprefix = "";

  unsigned NumBits = Ty.getBitWidth();
  if (NumBits <= 8) {
    type = "CHAR";
    sprefix = "S";
  } else if (NumBits <= 16) {
    type = "SHRT";
  } else if (NumBits <= 32) {
    type = "INT";
  } else if (NumBits <= 64) {
    type = "LLONG";
  } else {
    llvm_unreachable("Bit widths > 64 not implemented yet");
  }

  if (isSigned)
    Out << sprefix << type << (isMax ? "_MAX" : "_MIN");
  else if (isMax)
    Out << "U" << type << "_MAX";
  else
    // limits.h has no U*_MIN macros; the minimum of every unsigned type is 0.
    // (This used to print e.g. "UINT0", which is not a valid C token.)
    Out << "0";
}

#ifndef NDEBUG
/// Returns true if \p T is exactly 8, 16, 32 or 64 bits wide.  Only compiled
/// when assertions are enabled.
static bool isSupportedIntegerSize(llvm::IntegerType &T) {
  switch (T.getBitWidth()) {
  case 8:
  case 16:
  case 32:
  case 64:
    return true;
  default:
    return false;
  }
}
#endif

// Writes to Out the C definition of a helper function that implements one of
// the arithmetic-with-overflow intrinsics (uadd, sadd, umul).
//
// F must take two parameters of the same integer type and return a struct
// whose first element has that type (checked by the asserts below).  The
// emitted helper stores the arithmetic result in r.field0 and the overflow
// flag in r.field1.  Any other intrinsic ID is unsupported.
void CWriter::printIntrinsicDefinition(const llvm::Function &F, llvm::raw_ostream &Out) {
  llvm::FunctionType *funT = F.getFunctionType();
  llvm::Type *retT = F.getReturnType();
  llvm::IntegerType *elemT = llvm::cast<llvm::IntegerType>(funT->getParamType(1));

  assert(isSupportedIntegerSize(*elemT) &&
         "CBackend does not support arbitrary size integers.");
  assert(llvm::cast<llvm::StructType>(retT)->getElementType(0) == elemT &&
         elemT == funT->getParamType(0) && funT->getNumParams() == 2);

  switch (F.getIntrinsicID()) {
  default:
    llvm_unreachable("Unsupported Intrinsic.");
  case llvm::Intrinsic::uadd_with_overflow:
    // static inline Rty uadd_ixx(unsigned ixx a, unsigned ixx b) {
    //   Rty r;
    //   r.field0 = a + b;
    //   r.field1 = (r.field0 < a);
    //   return r;
    // }
    Out << "static inline ";
    printType(Out, retT);
    Out << GetValueName(&F);
    Out << "(";
    printSimpleType(Out, elemT, false);
    Out << "a,";
    printSimpleType(Out, elemT, false);
    Out << "b) {\n  ";
    printType(Out, retT);
    Out << "r;\n";
    Out << "  r.field0 = a + b;\n";
    Out << "  r.field1 = (r.field0 < a);\n";
    Out << "  return r;\n}\n";
    break;

  case llvm::Intrinsic::sadd_with_overflow:
    // static inline Rty sadd_ixx(ixx a, ixx b) {
    //   Rty r;
    //   r.field1 = (b > 0 && a > XX_MAX - b) ||
    //              (b < 0 && a < XX_MIN - b);
    //   r.field0 = r.field1 ? 0 : a + b;
    //   return r;
    // }
    // Note: unlike the other two helpers, the text written here is "static "
    // without "inline", and field0 is set to 0 when overflow is detected.
    Out << "static ";
    printType(Out, retT);
    Out << GetValueName(&F);
    Out << "(";
    printSimpleType(Out, elemT, true);
    Out << "a,";
    printSimpleType(Out, elemT, true);
    Out << "b) {\n  ";
    printType(Out, retT);
    Out << "r;\n";
    Out << "  r.field1 = (b > 0 && a > ";
    printLimitValue(*elemT, true, true, Out);
    Out << " - b) || (b < 0 && a < ";
    printLimitValue(*elemT, true, false, Out);
    Out << " - b);\n";
    Out << "  r.field0 = r.field1 ? 0 : a + b;\n";
    Out << "  return r;\n}\n";
    break;

  case llvm::Intrinsic::umul_with_overflow:
    // static inline Rty umul_ixx(unsigned ixx a, unsigned ixx b) {
    //   Rty r;
    //   <wide> result = (<wide>) a * (<wide>) b;
    //   r.field0 = result;
    //   r.field1 = result >> <bits>;
    //   return r;
    // }
    Out << "static inline ";
    printType(Out, retT);
    Out << GetValueName(&F);
    Out << "(";
    printSimpleType(Out, elemT, false);
    Out << "a,";
    printSimpleType(Out, elemT, false);
    Out << "b) {\n  ";

    printType(Out, retT);
    Out << "r;\n";

    // The product is computed in a type twice as wide as the operands:
    // uint<2N>_t for N <= 32, llvmUInt128 for wider operands up to 64 bits.
    // (These locals are declared inside the case without braces, which is
    // only valid because this is the last case of the switch.)
    unsigned NumBits = llvm::cast<llvm::IntegerType>(elemT)->getBitWidth();
    std::stringstream  str_type;
    if (NumBits <= 32)
      str_type << "uint" << 2 * NumBits << "_t";
    else {
      assert(NumBits <= 64 && "Bit widths > 128 not implemented yet");
      str_type << "llvmUInt128";
    }

    Out << "  " << str_type.str() << " result = (" << str_type.str() << ") a * (" << str_type.str() << ") b;\n";
    Out << "  r.field0 = result;\n";
    // The upper half of the wide product is stored as the overflow field.
    Out << "  r.field1 = result >> " << NumBits << ";\n";
    Out << "  return r;\n}\n";
    break;
  }
}

// Lowers the intrinsic calls in F that the writer cannot emit itself.
//
// Every call instruction whose callee is a known function is examined.
// Intrinsics in the explicit list below, and intrinsics for which the
// included generated table yields a builtin name, are left untouched.  All
// other intrinsic calls are replaced through IL->LowerIntrinsicCall().  When
// the instruction found at the resumed position is a call to a function that
// has a definition, a prototype for that function is written to Out (once per
// function) so that it precedes its use.
void CWriter::lowerIntrinsics(llvm::Function &F) {
  // This is used to keep track of intrinsics that get generated to a lowered
  // function. We must generate the prototypes before the function body which
  // will only be expanded on first use (by the loop below).
  std::vector<llvm::Function*> prototypesToGen;

  // Examine all the instructions in this function to find the intrinsics that
  // need to be lowered.
  // The instruction iterator is advanced inside the dyn_cast (I++), i.e.
  // before the call may be replaced; it is repositioned explicitly after a
  // lowering.  Note that the callee variable F below shadows the parameter F.
  for (llvm::Function::iterator BB = F.begin(), EE = F.end(); BB != EE; ++BB)
    for (llvm::BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; )
      if (llvm::CallInst *CI = llvm::dyn_cast<llvm::CallInst>(I++))
        if (llvm::Function *F = CI->getCalledFunction())
          switch (F->getIntrinsicID()) {
          // We directly implement these intrinsics
          case llvm::Intrinsic::not_intrinsic:
          case llvm::Intrinsic::vastart:
          case llvm::Intrinsic::vacopy:
          case llvm::Intrinsic::vaend:
          case llvm::Intrinsic::returnaddress:
          case llvm::Intrinsic::frameaddress:
          case llvm::Intrinsic::setjmp:
          case llvm::Intrinsic::longjmp:
          case llvm::Intrinsic::memset:
          case llvm::Intrinsic::prefetch:
          case llvm::Intrinsic::powi:
          case llvm::Intrinsic::fabs:
          case llvm::Intrinsic::x86_sse_cmp_ss:
          case llvm::Intrinsic::x86_sse_cmp_ps:
          case llvm::Intrinsic::x86_sse2_cmp_sd:
          case llvm::Intrinsic::x86_sse2_cmp_pd:
          case llvm::Intrinsic::ppc_altivec_lvsl:
          case llvm::Intrinsic::uadd_with_overflow:
          case llvm::Intrinsic::sadd_with_overflow:
          case llvm::Intrinsic::trap:
          case llvm::Intrinsic::objectsize:
          case llvm::Intrinsic::readcyclecounter:
          case llvm::Intrinsic::umul_with_overflow:
          // Or we just ignore them because of their uselessness in C++ source
          case llvm::Intrinsic::dbg_value:
          case llvm::Intrinsic::dbg_declare:
            break;
          default:
            // If this is an intrinsic that directly corresponds to a GCC
            // builtin, we handle it.
            // NOTE(review): the included generated fragment presumably
            // assigns BuiltinName for such intrinsics -- confirm against the
            // generated Intrinsics.gen.
            const char *BuiltinName = "";
#define GET_GCC_BUILTIN_NAME
#define Intrinsic llvm::Intrinsic
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
  #include "llvm/Intrinsics.gen"
#else /* LLVM 3.3+ */
  #include "llvm/IR/Intrinsics.gen"
#endif
#undef Intrinsic
#undef GET_GCC_BUILTIN_NAME
            // If we handle it, don't lower it.
            if (BuiltinName[0]) break;

            // All other intrinsic calls we must lower.
            // Remember the instruction preceding the call (if any) so the
            // iterator can be re-established after the call is replaced.
            llvm::Instruction *Before = 0;
            if (CI != &BB->front())
#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_5 // LLVM 3.5+
              Before = &*std::prev(llvm::BasicBlock::iterator(CI));
#else
              Before = prior(llvm::BasicBlock::iterator(CI));
#endif

            IL->LowerIntrinsicCall(CI);
            if (Before) {        // Move iterator to instruction after call
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_7 /* 3.2, 3.3, 3.4, 3.5, 3.6, 3.7 */
              I = Before; ++I;
#else /* LLVM 3.8+ */
              I = Before->getIterator(); ++I;
#endif
            } else {
              // The call was the first instruction: restart at block begin.
              I = BB->begin();
            }
            // If the intrinsic got lowered to another call, and that call has
            // a definition then we need to make sure its prototype is emitted
            // before any calls to it.
            if (llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(I))
              if (llvm::Function *NewF = Call->getCalledFunction())
                if (!NewF->isDeclaration())
                  prototypesToGen.push_back(NewF);

            break;
          }

  // We may have collected some prototypes to emit in the loop above.
  // Emit them now, before the function that uses them is emitted. But,
  // be careful not to emit them twice.
  std::vector<llvm::Function*>::iterator I = prototypesToGen.begin();
  std::vector<llvm::Function*>::iterator E = prototypesToGen.end();
  for ( ; I != E; ++I) {
    // insert() reports whether the function was newly added to the set.
    if (intrinsicPrototypesAlreadyGenerated.insert(*I).second) {
      Out << '\n';
      printFunctionSignature(*I, true);
      Out << ";\n";
    }
  }
}

/// visitCallInst - Emit the C text for a call instruction.
///
/// Inline-asm callees are forwarded to visitInlineAsm() and intrinsic callees
/// are first offered to visitBuiltinCall().  For everything else this prints
/// the callee (with a cast when required) followed by a parenthesized,
/// comma-separated argument list.
void CWriter::visitCallInst(llvm::CallInst &I) {
  if (llvm::isa<llvm::InlineAsm>(I.getCalledValue()))
    return visitInlineAsm(I);

  // Set by visitBuiltinCall() when it has already printed the callee name and
  // only the argument list remains to be emitted here.
  bool WroteCallee = false;

  // Handle intrinsic function calls first...
  if (llvm::Function *F = I.getCalledFunction())
    if (llvm::Intrinsic::ID ID = (llvm::Intrinsic::ID)F->getIntrinsicID())
      if (visitBuiltinCall(I, ID, WroteCallee))
        return;

  llvm::Value *Callee = I.getCalledValue();

  // The called value is a pointer to a function; recover the function type so
  // the declared parameters can be compared with the actual arguments below.
  llvm::PointerType  *PTy   = llvm::cast<llvm::PointerType>(Callee->getType());
  llvm::FunctionType *FTy   = llvm::cast<llvm::FunctionType>(PTy->getElementType());

  // If this is a call to a struct-return function, assign to the first
  // parameter instead of passing it to the call.
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
  const llvm::AttrListPtr &PAL = I.getAttributes();
#else /* LLVM 3.3+ */
  const llvm::AttributeSet &PAL = I.getAttributes();
#endif

  bool hasByVal = I.hasByValArgument();
  bool isStructRet = (I.getNumArgOperands() > 0) && I.hasStructRetAttr();
  if (isStructRet) {
    writeOperandDeref(I.getArgOperand(0));
    Out << " = ";
  }

  if (I.isTailCall()) Out << " /*tail*/ ";

  if (!WroteCallee) {
    // If this is an indirect call to a struct return function, we need to cast
    // the pointer. Ditto for indirect calls with byval arguments.
    bool NeedsCast = (hasByVal || isStructRet) && !llvm::isa<llvm::Function>(Callee);

    // GCC is a real PITA.  It does not permit codegening casts of functions to
    // function pointers if they are in a call (it generates a trap instruction
    // instead!).  We work around this by inserting a cast to void* in between
    // the function and the function pointer cast.  Unfortunately, we can't just
    // form the constant expression here, because the folder will immediately
    // nuke it.
    //
    // Note finally, that this is completely unsafe.  ANSI C does not guarantee
    // that void* and function pointers have the same size. :( To deal with this
    // in the common case, we handle casts where the number of arguments passed
    // match exactly.
    //
    if (llvm::ConstantExpr *CE = llvm::dyn_cast<llvm::ConstantExpr>(Callee))
      if (CE->isCast())
        if (llvm::Function *RF = llvm::dyn_cast<llvm::Function>(CE->getOperand(0))) {
          NeedsCast = true;
          Callee = RF;
        }

    // The results of these allocation functions are explicitly cast to
    // uint8_t* in the generated code.
    if (Callee->getName() == "malloc" ||
        Callee->getName() == "_aligned_malloc")
        Out << "(uint8_t *)";

    // This 'if' will fix 'soa-18.ispc' test (fails with optimizations off)
    // Yet the way the case is fixed is quite dirty and leads to many other fails

    //if (Callee->getName() == "__masked_store_i64") {
    //    llvm::CallSite CS(&I);
    //    llvm::CallSite::arg_iterator AI = CS.arg_begin();
    //    if (is_vec16_i64_ty(llvm::cast<llvm::PointerType>((*AI)->getType())->getElementType())) {
    //        Out << "/* Replacing store of vec16_i64 val into &vec16_i64 pointer with a simple copy */\n";
    //        // If we are trying to get a pointer to from a vec16_i64 var
    //        // It would be better to replace this instruction with a masked copy
    //        if (llvm::isa<llvm::GetElementPtrInst>(*AI)) {
    //            writeOperandDeref(*AI);
    //            Out << " = __select(";
    //            writeOperand(*(AI+2));
    //            Out << ", ";
    //            writeOperand(*(AI+1));
    //            Out << ", ";
    //            writeOperandDeref(*AI);
    //            Out << ")";
    //            return;
    //        }
    //    }
    //}

    if (NeedsCast) {
      // Ok, just cast the pointer type.
      Out << "((";
      if (isStructRet)
        printStructReturnPointerFunctionType(Out, PAL,
                             llvm::cast<llvm::PointerType>(I.getCalledValue()->getType()));
      else if (hasByVal)
        printType(Out, I.getCalledValue()->getType(), false, "", true, PAL);
      else
        printType(Out, I.getCalledValue()->getType());
      Out << ")(void*)";
    }
    writeOperand(Callee);
    if (NeedsCast) Out << ')';
  }

  Out << '(';

  // A vararg function type with no declared parameters gets a placeholder
  // first argument in the emitted call.
  bool PrintedArg = false;
  if(FTy->isVarArg() && !FTy->getNumParams()) {
    Out << "0 /*dummy arg*/";
    PrintedArg = true;
  }

  unsigned NumDeclaredParams = FTy->getNumParams();
  llvm::CallSite CS(&I);
  llvm::CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end();
  unsigned ArgNo = 0;
  if (isStructRet) {   // Skip struct return argument.
    ++AI;
    ++ArgNo;
  }


  for (; AI != AE; ++AI, ++ArgNo) {
    if (PrintedArg) Out << ", ";
    if (ArgNo == 0 &&
        Callee->getName() == "posix_memalign") {
        // uint8_t** is incompatible with void** without explicit cast.
        // Should we do this for any other functions?
        Out << "(void **)";
    }
    else if (ArgNo < NumDeclaredParams &&
        (*AI)->getType() != FTy->getParamType(ArgNo)) {
      // The actual argument type differs from the declared parameter type:
      // cast to the declared type.  Attribute queries use ArgNo+1 as the
      // index for this argument.
      // NOTE(review): the 3.3+ branch asks for FunctionIndex on the set
      // returned by getParamAttributes(ArgNo+1) -- confirm that this really
      // observes the parameter's SExt attribute.
      Out << '(';
      printType(Out, FTy->getParamType(ArgNo),
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
                PAL.getParamAttributes(ArgNo+1).hasAttribute(llvm::Attributes::SExt)
#else /* LLVM 3.3+ */
                PAL.getParamAttributes(ArgNo+1).hasAttribute(llvm::AttributeSet::FunctionIndex, llvm::Attribute::SExt)
#endif
                );
      Out << ')';
    }
    // Check if the argument is expected to be passed by value.
    if (I.paramHasAttr(ArgNo+1,
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
                       llvm::Attributes::ByVal
#else /* LLVM 3.3+ */
                       llvm::Attribute::ByVal
#endif
                       )) {
      writeOperandDeref(*AI);
    }
    else {
      writeOperand(*AI);
    }
    PrintedArg = true;
  }
  Out << ')';
}

/// visitBuiltinCall - Handle the call to the specified builtin.  Returns true
/// if the entire call is handled, return false if it wasn't handled, and
/// optionally set 'WroteCallee' if the callee has already been printed out.
///
/// \param I            The call instruction being emitted.
/// \param ID           The intrinsic ID of the called function.
/// \param WroteCallee  Set to true by the default case, which prints only the
///                     builtin's name and leaves the argument list to the
///                     caller.
bool CWriter::visitBuiltinCall(llvm::CallInst &I, llvm::Intrinsic::ID ID,
                               bool &WroteCallee) {
  switch (ID) {
  default: {
    // If this is an intrinsic that directly corresponds to a GCC
    // builtin, we emit it here.
    // The generated include below is expected to assign BuiltinName; the
    // assert that follows fires if it is still empty afterwards.
    const char *BuiltinName = "";
#define GET_GCC_BUILTIN_NAME
#define Intrinsic llvm::Intrinsic
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
  #include "llvm/Intrinsics.gen"
#else /* LLVM 3.3+ */
  #include "llvm/IR/Intrinsics.gen"
#endif
#undef Intrinsic
#undef GET_GCC_BUILTIN_NAME
    assert(BuiltinName[0] && "Unknown LLVM intrinsic!");

    Out << BuiltinName;
    WroteCallee = true;
    return false;
  }
  // Ignoring debug intrinsics
  case llvm::Intrinsic::dbg_value:
  case llvm::Intrinsic::dbg_declare:
    return true;
  case llvm::Intrinsic::vastart:
    // NOTE(review): the leading "0; " presumably terminates an expression the
    // caller has already started emitting -- confirm against the caller.
    Out << "0; ";

    Out << "va_start(*(va_list*)";
    writeOperand(I.getArgOperand(0));
    Out << ", ";
    // Output the last argument to the enclosing function.
    if (I.getParent()->getParent()->arg_empty())
      Out << "vararg_dummy_arg";
    else
      writeOperand(&*(--I.getParent()->getParent()->arg_end()));
    Out << ')';
    return true;
  case llvm::Intrinsic::vaend:
    // A constant-null operand is emitted as a literal 0 va_list pointer and
    // without the leading "0; ".
    if (!llvm::isa<llvm::ConstantPointerNull>(I.getArgOperand(0))) {
      Out << "0; va_end(*(va_list*)";
      writeOperand(I.getArgOperand(0));
      Out << ')';
    } else {
      Out << "va_end(*(va_list*)0)";
    }
    return true;
  case llvm::Intrinsic::vacopy:
    Out << "0; ";
    Out << "va_copy(*(va_list*)";
    writeOperand(I.getArgOperand(0));
    Out << ", *(va_list*)";
    writeOperand(I.getArgOperand(1));
    Out << ')';
    return true;
  case llvm::Intrinsic::returnaddress:
    Out << "__builtin_return_address(";
    writeOperand(I.getArgOperand(0));
    Out << ')';
    return true;
  case llvm::Intrinsic::frameaddress:
    Out << "__builtin_frame_address(";
    writeOperand(I.getArgOperand(0));
    Out << ')';
    return true;
  case llvm::Intrinsic::powi:
    Out << "__builtin_powi(";
    writeOperand(I.getArgOperand(0));
    Out << ", ";
    writeOperand(I.getArgOperand(1));
    Out << ')';
    return true;
  case llvm::Intrinsic::fabs:
    Out << "__builtin_fabs(";
    writeOperand(I.getArgOperand(0));
    Out << ')';
    return true;
  case llvm::Intrinsic::setjmp:
    Out << "setjmp(*(jmp_buf*)";
    writeOperand(I.getArgOperand(0));
    Out << ')';
    return true;
  case llvm::Intrinsic::longjmp:
    Out << "longjmp(*(jmp_buf*)";
    writeOperand(I.getArgOperand(0));
    Out << ", ";
    writeOperand(I.getArgOperand(1));
    Out << ')';
    return true;
  case llvm::Intrinsic::memset:
    // Only the first three call operands are forwarded to Memset().
    Out << "Memset(";
    writeOperand(I.getArgOperand(0));
    Out << ", ";
    writeOperand(I.getArgOperand(1));
    Out << ", ";
    writeOperand(I.getArgOperand(2));
    Out << ')';
    return true;
  case llvm::Intrinsic::prefetch:
    // Only the first three call operands are forwarded to LLVM_PREFETCH().
    Out << "LLVM_PREFETCH((const void *)";
    writeOperand(I.getArgOperand(0));
    Out << ", ";
    writeOperand(I.getArgOperand(1));
    Out << ", ";
    writeOperand(I.getArgOperand(2));
    Out << ")";
    return true;
  case llvm::Intrinsic::stacksave:
    // Emit this as: Val = 0; *((void**)&Val) = __builtin_stack_save()
    // to work around GCC bugs (see PR1809).
    Out << "0; *((void**)&" << GetValueName(&I)
        << ") = __builtin_stack_save()";
    return true;
  case llvm::Intrinsic::x86_sse_cmp_ss:
  case llvm::Intrinsic::x86_sse_cmp_ps:
  case llvm::Intrinsic::x86_sse2_cmp_sd:
  case llvm::Intrinsic::x86_sse2_cmp_pd:
    // Cast the builtin's result to the type of the call.
    Out << '(';
    printType(Out, I.getType());
    Out << ')';
    // Multiple GCC builtins multiplex onto this intrinsic.
    // The third operand must be a constant integer selecting the predicate.
    switch (llvm::cast<llvm::ConstantInt>(I.getArgOperand(2))->getZExtValue()) {
    default: llvm_unreachable("Invalid llvm.x86.sse.cmp!");
    case 0: Out << "__builtin_ia32_cmpeq"; break;
    case 1: Out << "__builtin_ia32_cmplt"; break;
    case 2: Out << "__builtin_ia32_cmple"; break;
    case 3: Out << "__builtin_ia32_cmpunord"; break;
    case 4: Out << "__builtin_ia32_cmpneq"; break;
    case 5: Out << "__builtin_ia32_cmpnlt"; break;
    case 6: Out << "__builtin_ia32_cmpnle"; break;
    case 7: Out << "__builtin_ia32_cmpord"; break;
    }
    // Append the two-letter suffix: 'p' or 's' for the *_ps/*_pd versus
    // *_ss/*_sd forms, then 's' for the sse intrinsics or 'd' for the sse2
    // ones.
    if (ID == llvm::Intrinsic::x86_sse_cmp_ps || ID == llvm::Intrinsic::x86_sse2_cmp_pd)
      Out << 'p';
    else
      Out << 's';
    if (ID == llvm::Intrinsic::x86_sse_cmp_ss || ID == llvm::Intrinsic::x86_sse_cmp_ps)
      Out << 's';
    else
      Out << 'd';

    Out << "(";
    writeOperand(I.getArgOperand(0));
    Out << ", ";
    writeOperand(I.getArgOperand(1));
    Out << ")";
    return true;
  case llvm::Intrinsic::ppc_altivec_lvsl:
    Out << '(';
    printType(Out, I.getType());
    Out << ')';
    Out << "__builtin_altivec_lvsl(0, (void*)";
    writeOperand(I.getArgOperand(0));
    Out << ")";
    return true;
  case llvm::Intrinsic::uadd_with_overflow:
  case llvm::Intrinsic::sadd_with_overflow:
  case llvm::Intrinsic::umul_with_overflow:
    // These are emitted as calls to a function named after the intrinsic's
    // own value name, with the two operands as arguments.
    Out << GetValueName(I.getCalledFunction()) << "(";
    writeOperand(I.getArgOperand(0));
    Out << ", ";
    writeOperand(I.getArgOperand(1));
    Out << ")";
    return true;
  case llvm::Intrinsic::trap:
    Out << "abort()";
    return true;
  case llvm::Intrinsic::objectsize:
    // Nothing is emitted for this intrinsic; it is still reported as handled.
    return true;
  case llvm::Intrinsic::readcyclecounter:
    Out << "__clock()";
    return true;
  }
}


// TODO: assumptions about which arguments of the call get consumed are likely
//       wrong; handle commutativity.
/// visitInlineAsm - Inline assembly is not supported by this backend; the
/// assert below always fails when assertions are enabled.  The CI parameter
/// is unused.
void CWriter::visitInlineAsm(llvm::CallInst &CI) {
  assert(!"Inline assembly not supported");
}

/// visitAllocaInst - Emit an alloca instruction as a C alloca() call whose
/// result is cast to the instruction's own (pointer) type.
void CWriter::visitAllocaInst(llvm::AllocaInst &I) {
  // "(<ptr type>) alloca(sizeof(<element type>)"
  Out << '(';
  printType(Out, I.getType());
  Out << ") alloca(sizeof(";
  printType(Out, I.getType()->getElementType());
  Out << ')';

  // For array allocations the size is multiplied by the count operand.
  const bool allocatesArray = I.isArrayAllocation();
  if (allocatesArray) {
    Out << " * ";
    writeOperand(I.getOperand(0));
  }

  // Close the alloca( call.
  Out << ')';
}

/// printGEPExpression - Emit a C address expression for a getelementptr that
/// starts at Ptr and applies the indices in the range [I, E).
///
/// \param Ptr     The base pointer being indexed.
/// \param I, E    Begin/end of the GEP type iterator range.
/// \param Static  Forwarded to writeOperandInternal() when Ptr is printed as
///                an address-exposed value.
void CWriter::printGEPExpression(llvm::Value *Ptr, llvm::gep_type_iterator I,
                                 llvm::gep_type_iterator E, bool Static) {

  // If there are no indices, just print out the pointer.
  if (I == E) {
    writeOperand(Ptr);
    return;
  }

  // Find out if the last index is into a vector.  If so, we have to print this
  // specially.  Since vectors can't have elements of indexable type, only the
  // last index could possibly be of a vector element.
  // The loop overwrites the variable on every step, so only the final index
  // determines its value.
  // NOTE(review): the LLVM 4.0+ branches in this function use
  // getIndexedType()/isStruct() where older versions dereference the
  // iterator -- confirm both yield the same type for each index.
  llvm::VectorType *LastIndexIsVector = 0;
  {
    for (llvm::gep_type_iterator TmpI = I; TmpI != E; ++TmpI)
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
      LastIndexIsVector = llvm::dyn_cast<llvm::VectorType>(*TmpI);
#else // LLVM 4.0+
    LastIndexIsVector = llvm::dyn_cast<llvm::VectorType>(TmpI.getIndexedType());
#endif
  }

  Out << "(";

  // If the last index is into a vector, we can't print it as &a[i][j] because
  // we can't index into a vector with j in GCC.  Instead, emit this as
  // (((float*)&a[i])+j)
  if (LastIndexIsVector) {
    Out << "((";
    printType(Out, llvm::PointerType::getUnqual(LastIndexIsVector->getElementType()));
    Out << ")(";
  }

  Out << '&';

  // If the first index is 0 (very typical) we can do a number of
  // simplifications to clean up the code.
  llvm::Value *FirstOp = I.getOperand();
  if (!llvm::isa<llvm::Constant>(FirstOp) || !llvm::cast<llvm::Constant>(FirstOp)->isNullValue()) {
    // First index isn't simple, print it the hard way.
    // I is not advanced here, so the loop below emits the first index too.
    writeOperand(Ptr);
  } else {
    ++I;  // Skip the zero index.

    // Okay, emit the first operand. If Ptr is something that is already address
    // exposed, like a global, avoid emitting (&foo)[0], just emit foo instead.
    if (isAddressExposed(Ptr)) {
      writeOperandInternal(Ptr, Static);
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
    } else if (I != E && (*I)->isStructTy()) {
#else // LLVM 4.0+
    } else if (I != E && I.isStruct()) {
#endif
      // If we didn't already emit the first operand, see if we can print it as
      // P->f instead of "P[0].f"
      writeOperand(Ptr);
      Out << "->field" << llvm::cast<llvm::ConstantInt>(I.getOperand())->getZExtValue();
      ++I;  // eat the struct index as well.
    } else {
      // Instead of emitting P[0][1], emit (*P)[1], which is more idiomatic.
      Out << "(*";
      writeOperand(Ptr);
      Out << ")";
    }
  }

  // Emit the remaining indices: ".fieldN" for structs, ".array[i]" for arrays,
  // "[i]" for other non-vector types, and the pointer-arithmetic tail for a
  // vector.
  for (; I != E; ++I) {
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9
    llvm::Type *type = *I;
#else // LLVM 4.0+
    llvm::Type *type = I.getIndexedType();
#endif
    if (type->isStructTy()) {
      Out << ".field" << llvm::cast<llvm::ConstantInt>(I.getOperand())->getZExtValue();
    } else if (type->isArrayTy()) {
      Out << ".array[";
      writeOperandWithCast(I.getOperand(), llvm::Instruction::GetElementPtr);
      Out << ']';
    } else if (!type->isVectorTy()) {
      Out << '[';
      writeOperandWithCast(I.getOperand(), llvm::Instruction::GetElementPtr);
      Out << ']';
    } else {
      // If the last index is into a vector, then print it out as "+j)".  This
      // works with the 'LastIndexIsVector' code above.
      if (llvm::isa<llvm::Constant>(I.getOperand()) &&
          llvm::cast<llvm::Constant>(I.getOperand())->isNullValue()) {
        Out << "))";  // avoid "+0".
      } else {
        Out << ")+(";
        writeOperandWithCast(I.getOperand(), llvm::Instruction::GetElementPtr);
        Out << "))";
      }
    }
  }
  Out << ")";
}

/// writeMemoryAccess - Emit the lvalue expression used to load from or store
/// to a non-vector location.
///
/// \param Operand      The pointer being accessed.
/// \param OperandType  The (non-vector) type of the accessed value.
/// \param IsVolatile   Whether the access is volatile.
/// \param Alignment    Alignment of the access; 0 is never treated as
///                     under-aligned.
void CWriter::writeMemoryAccess(llvm::Value *Operand, llvm::Type *OperandType,
                                bool IsVolatile, unsigned Alignment) {
  assert(!llvm::isa<llvm::VectorType>(OperandType));

  // An access is under-aligned when an explicit alignment is given and it is
  // smaller than the ABI alignment of the type.
  const bool underAligned = Alignment != 0 &&
    Alignment < TD->getABITypeAlignment(OperandType);
  // Volatile and under-aligned accesses both go through a pointer cast.
  const bool needsCast = IsVolatile || underAligned;

  llvm::IntegerType *intTy = llvm::dyn_cast<llvm::IntegerType>(OperandType);

  // Under-aligned accesses are spelled "(...)->data" and so take no leading
  // dereference.
  if (!underAligned)
    Out << '*';

  if (needsCast) {
    Out << "((";
    const bool wideInteger =
      underAligned && intTy != NULL && intTy->getBitWidth() > 64;
    if (wideInteger) {
      // Integers wider than 64 bits use a named iN_<bits>_align_<n> type.
      Out << "iN_" << intTy->getBitWidth() << "_align_" << Alignment << " *)";
    } else if (underAligned) {
      // Wrap the value in an anonymous packed struct with the requested
      // alignment and access its single "data" member.
      Out << "struct __attribute__ ((packed, aligned(" << Alignment << "))) {";
      printType(Out, OperandType, false, "data");
      Out << "; } ";
      if (IsVolatile)
        Out << "volatile ";
      Out << "*";
      Out << ")";
    } else {
      // Plain volatile access: cast to a volatile pointer of the value type.
      printType(Out, OperandType, false, "volatile*");
      Out << ")";
    }
  }

  writeOperand(Operand);

  if (needsCast) {
    Out << ')';
    if (underAligned)
      Out << "->data";
  }
}

/// visitLoadInst - Emit a load.  Vector loads become a call to the templated
/// __load<alignment>() helper; everything else goes through
/// writeMemoryAccess().
void CWriter::visitLoadInst(llvm::LoadInst &I) {
  const bool loadsVector = llvm::isa<llvm::VectorType>(I.getType());
  if (!loadsVector) {
    writeMemoryAccess(I.getOperand(0), I.getType(), I.isVolatile(),
                      I.getAlignment());
    return;
  }

  // __load<ALIGN>(ptr)
  Out << "__load<" << I.getAlignment() << ">(";
  writeOperand(I.getOperand(0));
  Out << ")";
}

/// visitStoreInst - Emit a store.  Vector stores become a call to the
/// templated __store<alignment>() helper; scalar stores are emitted as an
/// assignment through writeMemoryAccess(), masking integer values whose
/// width is not a power-of-two number of bytes.
void CWriter::visitStoreInst(llvm::StoreInst &I) {
  llvm::Value *storedValue = I.getOperand(0);
  llvm::Type *storedType = storedValue->getType();

  if (llvm::isa<llvm::VectorType>(storedType)) {
      // __store<ALIGN>(ptr, value)
      Out << "__store<" << I.getAlignment() << ">(";
      writeOperand(I.getOperand(1));
      Out << ", ";
      writeOperand(storedValue);
      Out << ")";
      return;
  }

  writeMemoryAccess(I.getPointerOperand(), storedType,
                    I.isVolatile(), I.getAlignment());
  Out << " = ";

  // We have a bit width that doesn't match an even power-of-2 byte size?
  // Then the value must be and-ed with the type's bit mask.
  llvm::Constant *mask = NULL;
  llvm::IntegerType *intTy = llvm::dyn_cast<llvm::IntegerType>(storedType);
  if (intTy != NULL && !intTy->isPowerOf2ByteWidth())
    mask = llvm::ConstantInt::get(intTy, intTy->getBitMask());

  if (mask == NULL) {
    writeOperand(storedValue);
    return;
  }

  // ((value) & mask)
  Out << "((";
  writeOperand(storedValue);
  Out << ") & ";
  printConstant(mask, false);
  Out << ")";
}

/// visitGetElementPtrInst - Emit a getelementptr instruction by delegating to
/// printGEPExpression() over the instruction's full index range (non-static).
void CWriter::visitGetElementPtrInst(llvm::GetElementPtrInst &I) {
  llvm::Value *basePointer = I.getPointerOperand();
  printGEPExpression(basePointer, gep_type_begin(I), gep_type_end(I), false);
}

/// visitVAArgInst - Emit a va_arg instruction as a C va_arg() on the operand
/// reinterpreted as a va_list, requesting the instruction's result type.
void CWriter::visitVAArgInst(llvm::VAArgInst &I) {
  llvm::Value *vaListPtr = I.getOperand(0);

  Out << "va_arg(*(va_list*)";
  writeOperand(vaListPtr);
  Out << ", ";
  printType(Out, I.getType());
  Out << ");\n ";
}

/// visitInsertElementInst - Emit an insertelement instruction: first the
/// source vector is written out (followed by a statement break), then
/// __insert_element() is called on the address of the result with the index
/// and the new element.
void CWriter::visitInsertElementInst(llvm::InsertElementInst &I) {
  llvm::Value *sourceVector = I.getOperand(0);
  llvm::Value *newElement = I.getOperand(1);
  llvm::Value *position = I.getOperand(2);

  writeOperand(sourceVector);
  Out << ";\n  ";

  // __insert_element(&result, index, value)
  Out << "__insert_element(&" << GetValueName(&I) << ", ";
  writeOperand(position);
  Out << ", ";
  writeOperand(newElement);
  Out << ")";
}

/// visitExtractElementInst - Emit an extractelement instruction as a
/// parenthesized call to __extract_element(vector, index).
void CWriter::visitExtractElementInst(llvm::ExtractElementInst &I) {
  // We know that our operand is not inlined.
  llvm::Value *sourceVector = I.getOperand(0);
  llvm::Value *position = I.getOperand(1);

  Out << "(__extract_element(";
  writeOperand(sourceVector);
  Out << ", ";
  writeOperand(position);
  Out << "))";
}

/// visitShuffleVectorInst - Emit a shufflevector instruction as a constructor
/// style expression "<result type>(e0, e1, ...)" with one entry per result
/// element.
///
/// Each entry is selected by the shuffle mask: mask values in [0, OpElts)
/// pick a lane of the first operand, values in [OpElts, 2*OpElts) pick a lane
/// of the second operand, and anything else (including an undef mask entry)
/// is emitted as a literal 0.
void CWriter::visitShuffleVectorInst(llvm::ShuffleVectorInst &SVI) {
  printType(Out, SVI.getType());
  Out << "(";
  llvm::VectorType *VT = SVI.getType();
  unsigned NumElts = VT->getNumElements();
  llvm::Type *EltTy = VT->getElementType();
  // The operand type is dereferenced right away, so use cast<> (which asserts
  // on a mismatch) rather than an unchecked dyn_cast<>.
  llvm::VectorType *OpTy = llvm::cast<llvm::VectorType>(SVI.getOperand(0)->getType());
  unsigned OpElts = OpTy->getNumElements();

  for (unsigned i = 0; i != NumElts; ++i) {
    if (i) Out << ", ";
    int SrcVal = SVI.getMaskValue(i);
    if ((unsigned)SrcVal >= 2*OpElts) {
      Out << " 0/*undef*/ ";
      continue;
    }

    // Pick the input vector and rebase the mask value to a lane inside it.
    // Subtracting OpElts is correct for any element count; masking with
    // (OpElts - 1) only works when OpElts is a power of two.
    const bool fromSecondOperand = (unsigned)SrcVal >= OpElts;
    llvm::Value *Op = SVI.getOperand(fromSecondOperand ? 1 : 0);
    if (fromSecondOperand)
      SrcVal -= (int)OpElts;

    if (llvm::isa<llvm::ConstantVector>(Op)) {
      printConstant(llvm::cast<llvm::ConstantVector>(Op)->getOperand(SrcVal),
                    false);
    } else if (llvm::isa<llvm::ConstantAggregateZero>(Op) || llvm::isa<llvm::UndefValue>(Op)) {
      Out << "0";
    }
    else {
      // Do an extractelement of this value from the appropriate input.
      Out << " \n#if defined(KNC) \n";
      if (OpElts != 1) { // all __vec16_* have overloaded operator []
        Out << "(" << GetValueName(Op)
            << ")[" << SrcVal << "]";
      }
      else { // but __vec1_* don't have it
        Out << "((";
        printType(Out, llvm::PointerType::getUnqual(EltTy));
        Out << ")(&" << GetValueName(Op)
            << "))[" << SrcVal << "]";
      }
      Out << " \n#else \n";
      Out << "((";
      printType(Out, llvm::PointerType::getUnqual(EltTy));
      Out << ")(&" << GetValueName(Op)
          << "))[" << SrcVal << "]";
      Out << " \n#endif \n";
    }
  }
  Out << ")";
}

/// visitInsertValueInst - Emit an insertvalue instruction: copy the whole
/// aggregate into the result, then assign the new value to the addressed
/// member using ".array[N]" / ".fieldN" accessors.
void CWriter::visitInsertValueInst(llvm::InsertValueInst &IVI) {
  llvm::Value *aggregate = IVI.getOperand(0);
  llvm::Type *aggregateTy = aggregate->getType();

  // Start by copying the entire aggregate value into the result variable.
  writeOperand(aggregate);
  Out << ";\n  ";

  // Then do the insert to update the field.
  Out << GetValueName(&IVI);
  const unsigned *first = IVI.idx_begin();
  const unsigned *last = IVI.idx_end();
  for (const unsigned *cur = first; cur != last; ++cur) {
    // Type being indexed at this level: the aggregate itself for the first
    // index, otherwise the type reached through the preceding indices.
    llvm::Type *levelTy = aggregateTy;
    if (cur != first)
      levelTy = llvm::ExtractValueInst::getIndexedType(aggregateTy,
                                                       llvm::makeArrayRef(first, cur));

    if (levelTy->isArrayTy())
      Out << ".array[" << *cur << "]";
    else
      Out << ".field" << *cur;
  }
  Out << " = ";
  writeOperand(IVI.getOperand(1));
}

/// visitExtractValueInst - Emit an extractvalue instruction as a
/// parenthesized member access chain on the aggregate operand.  An undef
/// aggregate is emitted as a zero cast to the result type.
void CWriter::visitExtractValueInst(llvm::ExtractValueInst &EVI) {
  llvm::Value *aggregate = EVI.getOperand(0);

  Out << "(";
  if (!llvm::isa<llvm::UndefValue>(aggregate)) {
    llvm::Type *aggregateTy = aggregate->getType();
    Out << GetValueName(aggregate);

    const unsigned *first = EVI.idx_begin();
    const unsigned *last = EVI.idx_end();
    for (const unsigned *cur = first; cur != last; ++cur) {
      // Type being indexed at this level: the aggregate itself for the first
      // index, otherwise the type reached through the preceding indices.
      llvm::Type *levelTy = aggregateTy;
      if (cur != first)
        levelTy = llvm::ExtractValueInst::getIndexedType(aggregateTy,
                                                         llvm::makeArrayRef(first, cur));

      if (levelTy->isArrayTy())
        Out << ".array[" << *cur << "]";
      else
        Out << ".field" << *cur;
    }
  } else {
    // FIXME: need to handle these--a 0 initializer won't do...
    assert(!llvm::isa<llvm::VectorType>(EVI.getType()));
    Out << "(";
    printType(Out, EVI.getType());
    Out << ") 0/*UNDEF*/";
  }
  Out << ")";
}

/// visitAtomicRMWInst - Emit an atomicrmw instruction as a parenthesized call
/// to __atomic_<op>(pointer, value), where <op> names the RMW operation.
void CWriter::visitAtomicRMWInst(llvm::AtomicRMWInst &AI) {
    Out << "(";
    Out << "__atomic_";

    // Map the operation onto the suffix of the helper function name.
    const char *opSuffix = NULL;
    switch (AI.getOperation()) {
    case llvm::AtomicRMWInst::Add:   opSuffix = "add";  break;
    case llvm::AtomicRMWInst::Sub:   opSuffix = "sub";  break;
    case llvm::AtomicRMWInst::Xchg:  opSuffix = "xchg"; break;
    case llvm::AtomicRMWInst::And:   opSuffix = "and";  break;
    case llvm::AtomicRMWInst::Nand:  opSuffix = "nand"; break;
    case llvm::AtomicRMWInst::Or:    opSuffix = "or";   break;
    case llvm::AtomicRMWInst::Xor:   opSuffix = "xor";  break;
    case llvm::AtomicRMWInst::Min:   opSuffix = "min";  break;
    case llvm::AtomicRMWInst::Max:   opSuffix = "max";  break;
    case llvm::AtomicRMWInst::UMin:  opSuffix = "umin"; break;
    case llvm::AtomicRMWInst::UMax:  opSuffix = "umax"; break;
    default: llvm_unreachable("Unhandled case in visitAtomicRMWInst!");
    }
    Out << opSuffix;

    llvm::Value *pointer = AI.getOperand(0);
    llvm::Value *value = AI.getOperand(1);
    Out << "(";
    writeOperand(pointer);
    Out << ", ";
    writeOperand(value);
    Out << "))";
}

/// visitAtomicCmpXchgInst - Emit a cmpxchg instruction as a call to
/// __atomic_cmpxchg(pointer, compare, new value).  With LLVM 3.5+ the call is
/// wrapped in "<result type>::init(..., true)".
void CWriter::visitAtomicCmpXchgInst(llvm::AtomicCmpXchgInst &ACXI) {
    llvm::Value *pointer = ACXI.getPointerOperand();
    llvm::Value *expected = ACXI.getCompareOperand();
    llvm::Value *replacement = ACXI.getNewValOperand();

    Out << "(";
#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_5 // LLVM 3.5+
    // LLVM cmpxchg returns a struct, so the result has to be built through
    // the struct type's init() to make the assignment work properly.
    printType(Out, ACXI.getType(), false);
    Out << "::init(";
#endif
    Out << "__atomic_cmpxchg(";
    writeOperand(pointer);
    Out << ", ";
    writeOperand(expected);
    Out << ", ";
    writeOperand(replacement);
    Out << ")";
#if ISPC_LLVM_VERSION >= ISPC_LLVM_3_5 // LLVM 3.5+
    // Second init() argument: a constant success flag.
    Out << ", true /* There is no way to learn the value of this bit inside ISPC, so making it constant */)";
#endif
    Out << ")";
}

///////////////////////////////////////////////////////////////////////////
// SmearCleanupPass

/// Basic-block pass that recognizes vector broadcasts ("smears") built from
/// insertelement chains or shufflevector instructions and replaces each with
/// a call to a typed smear function.
class SmearCleanupPass : public llvm::BasicBlockPass {
public:
    /// \param m      Module in which the helper functions are looked up and
    ///               declared.
    /// \param width  Vector width the recognized broadcasts must have.
    SmearCleanupPass(llvm::Module *m, int width)
        : BasicBlockPass(ID), module(m), vectorWidth(width) { }

#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9 // <= 3.9
    const char *getPassName() const { return "Smear Cleanup Pass"; }
#else // LLVM 4.0+
    llvm::StringRef getPassName() const { return "Smear Cleanup Pass"; }
#endif
    bool runOnBasicBlock(llvm::BasicBlock &BB);

    /// Pass identification; its address identifies the pass.
    static char ID;
    llvm::Module *module;
    unsigned int vectorWidth;

private:
    /// Number of insertelement instructions chained through operand 0.
    unsigned int ChainLength(llvm::InsertElementInst *inst) const;
    /// Broadcast value of an insertelement chain, or NULL if there is none.
    llvm::Value *getInsertChainSmearValue(llvm::Instruction* inst) const;
    /// Broadcast value of a shufflevector broadcast, or NULL if there is none.
    llvm::Value *getShuffleSmearValue(llvm::Instruction* inst) const;
};


char SmearCleanupPass::ID = 0;


/// Count the insertelement instructions in the chain that starts at `inst`
/// and is followed through operand 0 (the vector being inserted into).
/// Returns 0 when `inst` is NULL.
unsigned int
SmearCleanupPass::ChainLength(llvm::InsertElementInst *inst) const {
    unsigned int links = 0;
    for (llvm::InsertElementInst *cur = inst; cur != NULL;
         cur = llvm::dyn_cast<llvm::InsertElementInst>(cur->getOperand(0))) {
        ++links;
    }
    return links;
}


/// If `inst` ends a chain of exactly vectorWidth insertelement instructions
/// that all insert the same value, return that value; otherwise return NULL.
llvm::Value *
SmearCleanupPass::getInsertChainSmearValue(llvm::Instruction* inst) const {
    // TODO: the insertion indices are not checked, so the transformation may
    // be triggered for a wrong chain.
    // This way of doing a broadcast is obsolete and should probably be
    // removed some day.

    llvm::InsertElementInst *link =
        llvm::dyn_cast<llvm::InsertElementInst>(inst);
    if (link == NULL)
        return NULL;

    // Only chains of vectorWidth length are considered.
    if (ChainLength(link) != vectorWidth)
        return NULL;

    // FIXME: we only want to do this to vectors with width equal to the
    // target vector width.  That isn't easily available here, so for now at
    // least avoid the one case where we definitely don't want to do this.
    llvm::VectorType *vecTy = llvm::dyn_cast<llvm::VectorType>(link->getType());
    if (vecTy->getNumElements() == 1)
        return NULL;

    // Walk the chain through operand 0 (the vector inserted into) and require
    // every link to insert the same value (operand 1).
    llvm::Value *common = NULL;
    for (; link != NULL;
         link = llvm::dyn_cast<llvm::InsertElementInst>(link->getOperand(0))) {
        llvm::Value *inserted = link->getOperand(1);
        if (common == NULL)
            common = inserted;
        else if (common != inserted)
            return NULL;
    }
    assert(common != NULL);

    return common;
}


/// If `inst` is a shufflevector that broadcasts one element of its first
/// operand across a vector of vectorWidth elements, return the scalar being
/// broadcast; otherwise return NULL.
///
/// When the first operand is not an insertelement at constant index zero, a
/// new extractelement instruction is inserted before `inst` and returned.
/// This function may also add a declaration of "__extract_element" to the
/// module.
llvm::Value *
SmearCleanupPass::getShuffleSmearValue(llvm::Instruction* inst) const {
    llvm::ShuffleVectorInst *shuffleInst =
        llvm::dyn_cast<llvm::ShuffleVectorInst>(inst);
    if (!shuffleInst) {
        return NULL;
    }

    // Operand 2 of the shufflevector is its mask.
    llvm::Constant* mask =
        llvm::dyn_cast<llvm::Constant>(shuffleInst->getOperand(2));

    // Check that the shuffle is a broadcast of the element of the first vector,
    // i.e. mask vector is vector with equal elements of expected size.
    if (!(mask &&
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
         (mask->isNullValue() || (shuffleInst->getMask()->getType()->isVectorTy() && llvm::dyn_cast<llvm::ConstantVector>(shuffleInst->getMask())->getSplatValue() != 0 ) ) &&
#else
         (mask->isNullValue() || (shuffleInst->getMask()->getSplatValue() != 0))&&
#endif
          llvm::dyn_cast<llvm::VectorType>(mask->getType())->getNumElements() == vectorWidth)) {
        return NULL;
    }


    llvm::InsertElementInst *insertInst =
        llvm::dyn_cast<llvm::InsertElementInst>(shuffleInst->getOperand(0));

    // Check that it's an InsertElementInst that inserts a value to first element.
    if (!(insertInst &&
          llvm::isa<llvm::Constant>(insertInst->getOperand(2)) &&
          llvm::dyn_cast<llvm::Constant>(insertInst->getOperand(2))->isNullValue())) {

        // We can't extract element from vec1
        llvm::VectorType *operandVec = llvm::dyn_cast<llvm::VectorType>(shuffleInst->getOperand(0)->getType());
        if (operandVec && operandVec->getNumElements() == 1)
          return NULL;

        // Insert ExtractElementInstr to get value for smear

        // NOTE(review): extractFunc is only looked up / declared and
        // null-checked here; the instruction created below is an
        // ExtractElementInst, not a call to this function.
        llvm::Function *extractFunc = module->getFunction("__extract_element");

         if (extractFunc == NULL) {
            // Declare the __extract_element function if needed; it takes a vector and
            // a scalar parameter and returns a scalar of the vector parameter type.
            llvm::Constant *ef =
                module->getOrInsertFunction("__extract_element",
                                            shuffleInst->getOperand(0)->getType()->getVectorElementType(),
                                            shuffleInst->getOperand(0)->getType(),
                                            llvm::IntegerType::get(module->getContext(), 32), NULL);
            extractFunc = llvm::dyn_cast<llvm::Function>(ef);
            assert(extractFunc != NULL);
            extractFunc->setDoesNotThrow();
            extractFunc->setOnlyReadsMemory();
        }

        if (extractFunc == NULL) {
            return NULL;
        }
        // Extract the broadcast element (selected by the mask's splat value)
        // from the first shuffle operand; the new instruction is inserted
        // before `inst`.
        llvm::Instruction *extractCall =
              llvm::ExtractElementInst::Create(shuffleInst->getOperand(0),
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
                         // mask is of VectorType
                         llvm::dyn_cast<llvm::ConstantVector>(mask)->getSplatValue(),
#else
                         mask->getSplatValue(),
#endif
                         "__extract_element", inst);
        return extractCall;
    }

    // The first operand is an insertelement at index zero: the broadcast
    // value is taken to be the value it inserts (operand 1).
    // NOTE(review): this path does not compare the mask's splat index with
    // the insertion index (0) -- confirm a non-zero splat cannot reach here.
    llvm::Value *result = insertInst->getOperand(1);

    return result;
}


/// Replace every recognized broadcast in `bb` with a call to the matching
/// typed smear function.  After each replacement the scan starts over from
/// the beginning of the block.  Returns true if anything was replaced.
bool
SmearCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) {
    bool modifiedAny = false;

    bool replacedOne;
    do {
        replacedOne = false;

        for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
            // Try the insertelement-chain form first, then the shuffle form.
            llvm::Value *smearValue = getInsertChainSmearValue(&*iter);
            if (smearValue == NULL)
                smearValue = getShuffleSmearValue(&*iter);
            if (smearValue == NULL)
                continue;

            llvm::Type *smearType = smearValue->getType();
            const char *smearFuncName = lGetTypedFunc("smear", smearType, vectorWidth);
            if (smearFuncName == NULL)
                continue;

            llvm::Function *smearFunc = module->getFunction(smearFuncName);
            if (smearFunc == NULL) {
                // Declare the smear function if needed; it takes a single
                // scalar parameter and returns a vector of the same
                // parameter type.
                llvm::Constant *sf =
                    module->getOrInsertFunction(smearFuncName, iter->getType(),
                                                smearType, NULL);
                smearFunc = llvm::dyn_cast<llvm::Function>(sf);
                assert(smearFunc != NULL);
                smearFunc->setDoesNotThrow();
                smearFunc->setDoesNotAccessMemory();
            }

            assert(smearFunc != NULL);
            llvm::Value *args[1] = { smearValue };
            llvm::ArrayRef<llvm::Value *> argArray(&args[0], &args[1]);
            llvm::Instruction *smearCall =
                llvm::CallInst::Create(smearFunc, argArray, LLVMGetName(smearValue, "_smear"),
                                 (llvm::Instruction *)NULL);

            ReplaceInstWithInst(&*iter, smearCall);

            // The instruction under the iterator was replaced, so leave this
            // scan and start again from the top of the block.
            modifiedAny = true;
            replacedOne = true;
            break;
        }
    } while (replacedOne);

    return modifiedAny;
}


///////////////////////////////////////////////////////////////////////////
// AndCmpCleanupPass

/// Basic-block pass that rewrites mask ANDs having a vector comparison as one
/// operand; the transformation itself lives in runOnBasicBlock().
class AndCmpCleanupPass : public llvm::BasicBlockPass {
public:
    AndCmpCleanupPass()
        : BasicBlockPass(ID) { }

    // The return type of getPassName() differs between LLVM versions.
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9 // <= 3.9
    const char *getPassName() const { return "AndCmp Cleanup Pass"; }
#else // LLVM 4.0+
    llvm::StringRef getPassName() const { return "AndCmp Cleanup Pass"; }
#endif
    bool runOnBasicBlock(llvm::BasicBlock &BB);

    // Pass identification; passed to the BasicBlockPass constructor.
    static char ID;
};

char AndCmpCleanupPass::ID = 0;

// Look for ANDs of masks where one of the operands is a vector compare; we
// can turn these into specialized calls to masked vector compares and
// thence eliminate the AND.  For example, rather than emitting
// __and(__less(a, b), c), we will emit __less_and_mask(a, b, c).
//
// The name of the function called is built from the comparison predicate
// (lPredicateToString), the type suffix of the comparison's operands
// (lTypeToSuffix), and a trailing "_and_mask".
//
// Returns true if any instruction in the basic block was replaced.
bool
AndCmpCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) {
    bool modifiedAny = false;

 restart:
    for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) {
        // See if we have an AND instruction
        llvm::BinaryOperator *bop = llvm::dyn_cast<llvm::BinaryOperator>(&*iter);
        if (bop == NULL || bop->getOpcode() != llvm::Instruction::And)
            continue;

        // Make sure it's a vector AND
        if (llvm::isa<llvm::VectorType>(bop->getType()) == false)
            continue;

        // We only care about ANDs of the mask type, not, e.g. ANDs of
        // int32s vectors.
        if (bop->getType() != LLVMTypes::MaskType)
            continue;

        // Now see if either of the operands to the AND is a comparison
        for (int i = 0; i < 2; ++i) {
            llvm::CmpInst *opCmp = llvm::dyn_cast<llvm::CmpInst>(bop->getOperand(i));
            if (opCmp == NULL)
                continue;

            // We have a comparison.  However, we also need to make sure
            // that it's not comparing two mask values; those can't be
            // simplified to something simpler.  Only this operand is
            // disqualified, so move on and still give the other operand
            // of the AND a chance rather than abandoning the whole AND.
            llvm::Type *cmpOpType = opCmp->getOperand(0)->getType();
            if (cmpOpType == LLVMTypes::MaskType)
                continue;

            // Success!  Go ahead and replace the AND with a call to the
            // "__and_mask" variant of the comparison function for this
            // operand.
            std::string funcName = lPredicateToString(opCmp->getPredicate());
            funcName += "_";
            funcName += lTypeToSuffix(cmpOpType);
            funcName += "_and_mask";

            llvm::Function *andCmpFunc = m->module->getFunction(funcName);
            if (andCmpFunc == NULL) {
                // Declare the function if needed; the first two arguments
                // are the same as the two arguments to the compare we're
                // replacing and the third argument is the mask type.  The
                // signature is spelled out as an explicit FunctionType
                // instead of going through the NULL-terminated varargs
                // overload of getOrInsertFunction(), which depends on NULL
                // being passed as a pointer-sized value.
                llvm::Type *paramTypes[3] = { cmpOpType, cmpOpType,
                                              LLVMTypes::MaskType };
                llvm::FunctionType *funcType =
                    llvm::FunctionType::get(LLVMTypes::MaskType,
                                            llvm::ArrayRef<llvm::Type *>(paramTypes, 3),
                                            false /* not vararg */);
                llvm::Constant *acf =
                    m->module->getOrInsertFunction(funcName, funcType);
                andCmpFunc = llvm::dyn_cast<llvm::Function>(acf);
                Assert(andCmpFunc != NULL);
                andCmpFunc->setDoesNotThrow();
                andCmpFunc->setDoesNotAccessMemory();
            }

            // Set up the function call to the *_and_mask function; the
            // mask value passed in is the other operand to the AND.
            llvm::Value *args[3] = { opCmp->getOperand(0), opCmp->getOperand(1),
                                     bop->getOperand(i ^ 1) };
            llvm::ArrayRef<llvm::Value *> argArray(&args[0], &args[3]);
            llvm::Instruction *cmpCall =
                llvm::CallInst::Create(andCmpFunc, argArray,
                                       LLVMGetName(bop, "_and_mask"),
                                       (llvm::Instruction *)NULL);

            // And replace the original AND instruction with it.
            llvm::ReplaceInstWithInst(&*iter, cmpCall);

            // The instruction that iter referred to has just been
            // replaced, so rescan the block from the beginning.
            modifiedAny = true;
            goto restart;
        }
    }

    return modifiedAny;
}

///////////////////////////////////////////////////////////////////////////
// MaskOpsCleanupPass

/** This pass does various peephole improvements to mask modification
    operations.  In particular, it converts mask XORs with "all true" to
    calls to __not() and replaces operations like and(not(a), b) with
    __and_not1(a, b) (and with __and_not2(a, b) when it is the second
    operand that has the NOT applied to it).
 */
class MaskOpsCleanupPass : public llvm::BasicBlockPass {
public:
    /** Declares (or looks up) the __not, __and_not1, and __and_not2
        functions in the given module; the target is expected to end up
        providing them.

        @param m  Module in which the helper functions are declared.
     */
    MaskOpsCleanupPass(llvm::Module *m)
        : BasicBlockPass(ID) {
        notFunc        = lDeclareMaskFunc(m, "__not", 1);
        andNotFuncs[0] = lDeclareMaskFunc(m, "__and_not1", 2);
        andNotFuncs[1] = lDeclareMaskFunc(m, "__and_not2", 2);
    }

#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_9 // <= 3.9
    const char *getPassName() const { return "MaskOps Cleanup Pass"; }
#else // LLVM 4.0+
    llvm::StringRef getPassName() const { return "MaskOps Cleanup Pass"; }
#endif
    bool runOnBasicBlock(llvm::BasicBlock &BB);

private:
    /** Gets or inserts a declaration of a function with the given name
        that takes nArgs (1 or 2) mask-typed parameters and returns a mask,
        and marks it as not throwing and not accessing memory.

        The signature is given as an explicit FunctionType rather than via
        the NULL-terminated varargs overload of getOrInsertFunction(),
        which depends on NULL being passed as a pointer-sized value.
     */
    static llvm::Function *lDeclareMaskFunc(llvm::Module *m, const char *name,
                                            unsigned nArgs) {
        llvm::Type *mt = LLVMTypes::MaskType;
        llvm::Type *paramTypes[2] = { mt, mt };
        assert(nArgs >= 1 && nArgs <= 2);

        llvm::FunctionType *funcType =
            llvm::FunctionType::get(mt, llvm::ArrayRef<llvm::Type *>(paramTypes, nArgs),
                                    false /* not vararg */);
        llvm::Function *func =
            llvm::dyn_cast<llvm::Function>(m->getOrInsertFunction(name, funcType));
        assert(func != NULL);

        // Same attribute setters the other cleanup passes in this file
        // use; they need no per-LLVM-version spelling.
        func->setDoesNotThrow();
        func->setDoesNotAccessMemory();
        return func;
    }

    llvm::Value *lGetNotOperand(llvm::Value *v) const;

    // notFunc is __not; andNotFuncs[0] is __and_not1 and andNotFuncs[1]
    // is __and_not2.
    llvm::Function *notFunc, *andNotFuncs[2];

    static char ID;
};

char MaskOpsCleanupPass::ID = 0;


/** Tests whether the given value is a constant vector (either a
    ConstantVector or a ConstantDataVector) that is a splat of a constant
    integer for which isOne() holds, i.e. a compile-time constant vector of
    i1s with every element 'true'.
*/
static bool
lIsAllTrue(llvm::Value *v) {
    // Pull the splat value (if any) out of whichever constant vector
    // representation we were handed.
    llvm::Constant *splat = NULL;
    if (llvm::ConstantVector *constVec = llvm::dyn_cast<llvm::ConstantVector>(v))
        splat = constVec->getSplatValue();
    else if (llvm::ConstantDataVector *constDataVec =
                 llvm::dyn_cast<llvm::ConstantDataVector>(v))
        splat = constDataVec->getSplatValue();

    // Not a constant vector at all, or its elements aren't all the same.
    if (splat == NULL)
        return false;

    llvm::ConstantInt *splatInt = llvm::dyn_cast<llvm::ConstantInt>(splat);
    return splatInt != NULL && splatInt->isOne();
}


/** Determines whether the given value is the NOT of some other value,
    which here means either a direct call to the __not function held in
    notFunc, or an XOR whose second operand is an all-true constant
    vector.  Returns the value being negated in that case, and NULL
    otherwise.
 */
llvm::Value *
MaskOpsCleanupPass::lGetNotOperand(llvm::Value *v) const {
    // Direct call to __not()?
    llvm::CallInst *call = llvm::dyn_cast<llvm::CallInst>(v);
    if (call != NULL && call->getCalledFunction() == notFunc)
        return call->getArgOperand(0);

    // Otherwise it has to be an XOR with an all-true vector.
    llvm::BinaryOperator *xorOp = llvm::dyn_cast<llvm::BinaryOperator>(v);
    if (xorOp == NULL)
        return NULL;
    if (xorOp->getOpcode() != llvm::Instruction::Xor)
        return NULL;
    if (!lIsAllTrue(xorOp->getOperand(1)))
        return NULL;

    return xorOp->getOperand(0);
}


/** Scans the basic block for mask-typed binary operators and rewrites
    two patterns: an XOR whose second operand is all-true becomes a call
    to __not, and an AND with a NOT applied to one of its operands becomes
    a call to __and_not1 / __and_not2 (chosen by which operand carries the
    NOT).  After each replacement the scan starts over from the top of the
    block.  Returns true if anything was replaced.
 */
bool
MaskOpsCleanupPass::runOnBasicBlock(llvm::BasicBlock &bb) {
    bool changed = false;

 restart:
    for (llvm::BasicBlock::iterator it = bb.begin(), end = bb.end(); it != end; ++it) {
        llvm::BinaryOperator *binOp = llvm::dyn_cast<llvm::BinaryOperator>(&*it);
        if (binOp == NULL || binOp->getType() != LLVMTypes::MaskType)
            continue;

        // Call that will take the place of binOp, if a pattern matches.
        llvm::Instruction *replacement = NULL;

        const llvm::Instruction::BinaryOps opcode = binOp->getOpcode();
        if (opcode == llvm::Instruction::Xor) {
            // XOR with all-true values is a NOT.
            if (lIsAllTrue(binOp->getOperand(1))) {
                // ArrayRef only refers to the object it is given, so the
                // operand has to be held in a named local that outlives
                // the ArrayRef; handing it the temporary returned by
                // getOperand() directly is incorrect and has caused
                // crashes with gcc 4.7 and later.
                llvm::Value *operand = binOp->getOperand(0);
                llvm::ArrayRef<llvm::Value *> notArgs(operand);
                replacement = llvm::CallInst::Create(notFunc, notArgs,
                                                     binOp->getName());
            }
        }
        else if (opcode == llvm::Instruction::And) {
            // Take the first operand (if any) that has a NOT applied.
            for (int i = 0; i < 2; ++i) {
                llvm::Value *negated = lGetNotOperand(binOp->getOperand(i));
                if (negated == NULL)
                    continue;

                // The target of the NOT keeps its operand position; the
                // other operand is passed through unchanged.
                llvm::Value *callArgs[2];
                callArgs[i]     = negated;
                callArgs[i ^ 1] = binOp->getOperand(i ^ 1);
                llvm::ArrayRef<llvm::Value *> callArgsRef(&callArgs[0], 2);

                // andNotFuncs[i] is the variant for a NOT on operand i.
                replacement = llvm::CallInst::Create(andNotFuncs[i], callArgsRef,
                                                     binOp->getName());
                break;
            }
        }

        if (replacement == NULL)
            continue;

        ReplaceInstWithInst(&*it, replacement);
        changed = true;
        goto restart;
    }

    return changed;
}


//===----------------------------------------------------------------------===//
//                       External Interface declaration
//===----------------------------------------------------------------------===//

/** Runs the C++ backend over the given module and writes the generated
    source to a file.

    @param module       Module to emit; the cleanup passes added below
                        modify it in place.
    @param fn           Name of the output file to create.
    @param vectorWidth  Passed on to SmearCleanupPass and CWriter.
    @param includeName  Passed on to CWriter.
    @return true on success; false if the output file could not be opened
            or if an I/O error was recorded while writing it.
 */
bool
WriteCXXFile(llvm::Module *module, const char *fn, int vectorWidth,
             const char *includeName) {

#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_3 // 3.2, 3.3
    int flags = 0;
#else // LLVM 3.4+
    llvm::sys::fs::OpenFlags flags = llvm::sys::fs::F_None;
#endif

#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5 // 3.2, 3.3, 3.4, 3.5
    std::string error;
#else // LLVM 3.6+
    std::error_code error;
#endif

    llvm::tool_output_file *of = new llvm::tool_output_file(fn, error, flags);

#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_5 // 3.2, 3.3, 3.4, 3.5
    if (error.size()) {
#else // LLVM 3.6+
    if (error) {
#endif
        fprintf(stderr, "Error opening output file \"%s\".\n", fn);
        // We never wrote anything, so mark the file as kept: releasing the
        // tool_output_file must not remove whatever already exists at fn.
        of->keep();
        delete of;
        return false;
    }

    {
        // The pass manager and the formatted stream live in their own
        // scope so that both are gone (and the stream flushed) before the
        // underlying output file is finalized below.
#if ISPC_LLVM_VERSION <= ISPC_LLVM_3_6 // 3.2, 3.3, 3.4, 3.5, 3.6
        llvm::PassManager pm;
#else // LLVM 3.7+
        llvm::legacy::PassManager pm;
#endif
#if 0
        if (const llvm::TargetData *td = targetMachine->getTargetData())
            pm.add(new llvm::TargetData(*td));
        else
            pm.add(new llvm::TargetData(module));
#endif

        llvm::formatted_raw_ostream fos(of->os());

        pm.add(llvm::createGCLoweringPass());
        pm.add(llvm::createLowerInvokePass());
        pm.add(llvm::createCFGSimplificationPass());   // clean up after lower invoke.
        pm.add(new SmearCleanupPass(module, vectorWidth));
        pm.add(new AndCmpCleanupPass());
        pm.add(new MaskOpsCleanupPass(module));
        pm.add(llvm::createDeadCodeEliminationPass()); // clean up after smear pass
//CO        pm.add(llvm::createPrintModulePass(&fos));
        pm.add(new CWriter(fos, includeName, vectorWidth));
#if ISPC_LLVM_VERSION == ISPC_LLVM_3_2
        // This interface is deprecated for 3.3+
        pm.add(llvm::createGCInfoDeleter());
#endif
//CO        pm.add(llvm::createVerifierPass());

        pm.run(*module);
    }

    // Check for write errors before releasing the stream; the pending
    // error flag is cleared here so that destroying the stream is safe.
    of->os().flush();
    if (of->os().has_error()) {
        fprintf(stderr, "Error writing output file \"%s\".\n", fn);
        of->os().clear_error();
        // Not marked as kept, so the incomplete output file is discarded.
        delete of;
        return false;
    }

    // Success: keep the file and release the stream.  Previously the
    // tool_output_file was simply leaked on every path.
    of->keep();
    delete of;

    return true;
}