commit 18af5226ba8ddbcd9bb9f8eebbbc7d5f515d71b6 Author: Matt Pharr Date: Tue Jun 21 06:23:29 2011 -0700 Initial commit. diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..f3d74a9a --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.pyc +*~ diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 00000000..918a5b57 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,116 @@ +Copyright (c) 2010-2011, Intel Corporation +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
+ +=========================================================================== +Copyrights and Licenses for Third Party Software Distrubted with +The Intel(r) SPMD Program Compiler +=========================================================================== + +ISPC incorporates code from the Syrah library, which is covered by the +following license: + +Copyright (c) 2009, Stanford University, and authors listed below. +All rights reserved. + +Original authors: + Solomon Boulos + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +Neither the name of Stanford University nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +--------------------------------------------------------------------------- + +Binary distributions of ISPC are linked with the LLVM libraries, which are +covered by the following license: + +University of Illinois/NCSA +Open Source License + +Copyright (c) 2003-2010 University of Illinois at Urbana-Champaign. +All rights reserved. + +Developed by: + + LLVM Team + + University of Illinois at Urbana-Champaign + + http://llvm.org + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal with +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimers. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimers in the + documentation and/or other materials provided with the distribution. + + * Neither the names of the LLVM Team, University of Illinois at + Urbana-Champaign, nor the names of its contributors may be used to + endorse or promote products derived from this Software without specific + prior written permission. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +SOFTWARE. 
diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..cc9ee60c --- /dev/null +++ b/Makefile @@ -0,0 +1,118 @@ +# +# ispc Makefile +# + +ARCH = $(shell uname) + +CLANG=clang +LLVM_LIBS=$(shell llvm-config --ldflags --libs) -lpthread -ldl +LLVM_CXXFLAGS=$(shell llvm-config --cppflags) +LLVM_VERSION_DEF=-DLLVM_$(shell llvm-config --version | sed s/\\./_/) + +BUILD_DATE=$(shell date +%Y%m%d) +BUILD_VERSION=$(shell git log | head -1) + +CXX=g++ +CPP=cpp +CXXFLAGS=-g3 $(LLVM_CXXFLAGS) -I. -Iobjs/ -Wall $(LLVM_VERSION_DEF) \ + -DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\"" + +LDFLAGS= +ifeq ($(ARCH),Linux) + # try to link everything statically under Linux (including libstdc++) so + # that the binaries we generate will be portable across distributions... + LDFLAGS=-static -L/usr/lib/gcc/x86_64-linux-gnu/4.4 +endif + +LEX=flex +YACC=bison -d -v -t + +########################################################################### + +CXX_SRC=builtins.cpp ctx.cpp decl.cpp expr.cpp ispc.cpp \ + llvmutil.cpp main.cpp module.cpp opt.cpp stmt.cpp sym.cpp type.cpp \ + util.cpp +HEADERS=builtins.h ctx.h decl.h expr.h ispc.h llvmutil.h module.h \ + opt.h stmt.h sym.h type.h util.h +STDLIB_SRC=stdlib-avx.ll stdlib-sse2.ll stdlib-sse4.ll stdlib-sse4x2.ll +BISON_SRC=parse.yy +FLEX_SRC=lex.ll + +OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(STDLIB_SRC:.ll=.o) stdlib-c.o stdlib_ispc.o \ + $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o)) + +default: ispc ispc_test + +.PHONY: dirs clean depend doxygen +.PRECIOUS: objs/stdlib-%.cpp + +depend: $(CXX_SRC) $(HEADERS) + @echo Updating dependencies + @gcc -MM $(CXXFLAGS) $(CXX_SRC) | sed 's_^\([a-z]\)_objs/\1_g' > depend + +-include depend + +dirs: + @echo Creating objs/ directory + @/bin/mkdir -p objs + +clean: + /bin/rm -rf objs ispc ispc_test + +doxygen: + /bin/rm -rf docs/doxygen + doxygen doxygen.cfg + +ispc: dirs $(OBJS) + @echo Creating ispc executable + @$(CXX) $(LDFLAGS) -o $@ $(OBJS) $(LLVM_LIBS) + +ispc_test: 
dirs ispc_test.cpp + @echo Creating ispc_test executable + @$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $@ ispc_test.cpp $(LLVM_LIBS) + +objs/%.o: %.cpp + @echo Compiling $< + @$(CXX) $(CXXFLAGS) -o $@ -c $< + +objs/parse.cc: parse.yy + @echo Running bison on $< + @$(YACC) -o $@ $< + +objs/parse.o: objs/parse.cc $(HEADERS) + @echo Compiling $< + @$(CXX) $(CXXFLAGS) -o $@ -c $< + +objs/lex.cpp: lex.ll + @echo Running flex on $< + @$(LEX) -o $@ $< + +objs/lex.o: objs/lex.cpp $(HEADERS) + @echo Compiling $< + @$(CXX) $(CXXFLAGS) -o $@ -c $< + +$(STDLIB_SRC): stdlib.m4 + +objs/stdlib-%.cpp: stdlib-%.ll + @echo Creating C++ source from stdlib file $< + @m4 stdlib.m4 $< | ./bitcode2cpp.py $< > $@ + +objs/stdlib-%.o: objs/stdlib-%.cpp + @echo Compiling $< + @$(CXX) $(CXXFLAGS) -o $@ -c $< + +objs/stdlib-c.cpp: stdlib-c.c + @echo Creating C++ source from stdlib file $< + @$(CLANG) -I /opt/l1om/usr/include/ -emit-llvm -c $< -o - | llvm-dis - | ./bitcode2cpp.py $< > $@ + +objs/stdlib-c.o: objs/stdlib-c.cpp + @echo Compiling $< + @$(CXX) $(CXXFLAGS) -o $@ -c $< + +objs/stdlib_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< + @$(CPP) -DISPC=1 -DPI=3.1415936535 $< | ./stdlib2cpp.py > $@ + +objs/stdlib_ispc.o: objs/stdlib_ispc.cpp + @echo Compiling $< + @$(CXX) $(CXXFLAGS) -o $@ -c $< diff --git a/READMErst.txt b/READMErst.txt new file mode 100644 index 00000000..def29dc8 --- /dev/null +++ b/READMErst.txt @@ -0,0 +1,22 @@ +============================== +Intel(r) SPMD Program Compiler +============================== + +Welcome to the Intel(r) SPMD Program Compiler (ispc)! + +ispc is a new compiler for "single program, multiple data" (SPMD) +programs. Under the SPMD model, the programmer writes a program that mostly +appears to be a regular serial program, though the execution model is +actually that a number of program instances execute in parallel on the +hardware. 
ispc compiles a C-based SPMD programming language to run on the +SIMD units of CPUs; it frequently provides a a 3x or more speedup on CPUs +with 4-wide SSE units, without any of the difficulty of writing intrinsics +code. + +ispc is an open source compiler under the BSD license; see the file +LICENSE.txt. ispc supports Windows, Mac, and Linux, with both x86 and +x86-64 targets. It currently supports the SSE2 and SSE4 instruction sets, +though support for AVX should be available soon. + +For more information and examples, as well as a wiki and the bug database, +see the ispc distribution site, http://ispc.github.com. diff --git a/bitcode2cpp.py b/bitcode2cpp.py new file mode 100755 index 00000000..b61f6f8e --- /dev/null +++ b/bitcode2cpp.py @@ -0,0 +1,34 @@ +#!/usr/bin/python + +import sys +import string +import re +import subprocess + +length=0 + +src=str(sys.argv[1]) + +target = re.sub(".*stdlib-", "", src) +target = re.sub("\.ll$", "", target) +target = re.sub("\.c$", "", target) +target = re.sub("-", "_", target) + +try: + as_out=subprocess.Popen([ "llvm-as", "-", "-o", "-"], stdout=subprocess.PIPE) +except IOError: + print >> sys.stderr, "Couldn't open " + src + sys.exit(1) + +print "unsigned char stdlib_bitcode_" + target + "[] = {" +for line in as_out.stdout.readlines(): + length = length + len(line) + for c in line: + print ord(c) + print ", " +print " 0 };\n\n" +print "int stdlib_bitcode_" + target + "_length = " + str(length) + ";\n" + +as_out.wait() + +sys.exit(as_out.returncode) diff --git a/builtins.cpp b/builtins.cpp new file mode 100644 index 00000000..d2a49c7e --- /dev/null +++ b/builtins.cpp @@ -0,0 +1,617 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/** @file builtins.cpp + @brief Definitions of functions related to setting up the standard library + and other builtins. 
+*/ + +#include "builtins.h" +#include "type.h" +#include "util.h" +#include "sym.h" +#include "expr.h" +#include "llvmutil.h" +#include "module.h" +#include "ctx.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern int yyparse(); +struct yy_buffer_state; +extern yy_buffer_state *yy_scan_string(const char *); + + +/** Given an LLVM type, try to find the equivalent ispc type. Note that + this is an under-constrained problem due to LLVM's type representations + carrying less information than ispc's. (For example, LLVM doesn't + distinguish between signed and unsigned integers in its types.) + + However, because this function is only used for generating ispc + declarations of functions defined in LLVM bitcode in the stdlib-*.ll + files, in practice we can get enough of what we need for the relevant + cases to make things work. + */ +static const Type * +lLLVMTypeToISPCType(const llvm::Type *t) { + if (t == LLVMTypes::VoidType) + return AtomicType::Void; + else if (t == LLVMTypes::BoolType) + return AtomicType::UniformBool; + else if (t == LLVMTypes::Int32Type) + return AtomicType::UniformInt32; + else if (t == LLVMTypes::FloatType) + return AtomicType::UniformFloat; + else if (t == LLVMTypes::DoubleType) + return AtomicType::UniformDouble; + else if (t == LLVMTypes::Int64Type) + return AtomicType::UniformInt64; + else if (t == LLVMTypes::Int32VectorType) + return AtomicType::VaryingInt32; + else if (t == LLVMTypes::FloatVectorType) + return AtomicType::VaryingFloat; + else if (t == LLVMTypes::DoubleVectorType) + return AtomicType::VaryingDouble; + else if (t == LLVMTypes::Int64VectorType) + return AtomicType::VaryingInt64; + else if (t == LLVMTypes::Int32PointerType) + return new ReferenceType(AtomicType::UniformInt32, false); + else if (t == LLVMTypes::FloatPointerType) + return new ReferenceType(AtomicType::UniformFloat, false); + else if (t == LLVMTypes::Int32VectorPointerType) + return new 
ReferenceType(AtomicType::VaryingInt32, false); + else if (t == LLVMTypes::FloatVectorPointerType) + return new ReferenceType(AtomicType::VaryingFloat, false); + else if (llvm::isa(t)) { + const llvm::PointerType *pt = llvm::dyn_cast(t); + + // Is it a pointer to an unsized array of objects? If so, then + // create the equivalent ispc type. Note that it has to be a + // reference to an array, since ispc passes arrays to functions by + // reference. + // + // FIXME: generalize this to do more than uniform int32s (that's + // all that's necessary for the stdlib currently.) + const llvm::ArrayType *at = + llvm::dyn_cast(pt->getElementType()); + if (at && at->getNumElements() == 0 && + at->getElementType() == LLVMTypes::Int32Type) + return new ReferenceType(new ArrayType(AtomicType::UniformInt32, 0), + false); + } + + return NULL; +} + + +/** Given an LLVM function declaration, synthesize the equivalent ispc + symbol for the function (if possible). Returns true on success, false + on failure. + */ +static bool +lCreateISPCSymbol(llvm::Function *func, SymbolTable *symbolTable) { + SourcePos noPos; + noPos.name = "__stdlib"; + + const llvm::FunctionType *ftype = func->getFunctionType(); + std::string name = func->getName(); + + const Type *returnType = lLLVMTypeToISPCType(ftype->getReturnType()); + if (!returnType) + // return type not representable in ispc -> not callable from ispc + return false; + + // Iterate over the arguments and try to find their equivalent ispc + // types. 
+ std::vector argTypes; + for (unsigned int i = 0; i < ftype->getNumParams(); ++i) { + const llvm::Type *llvmArgType = ftype->getParamType(i); + const Type *type = lLLVMTypeToISPCType(llvmArgType); + if (type == NULL) + return false; + argTypes.push_back(type); + } + + FunctionType *funcType = new FunctionType(returnType, argTypes, noPos); + Symbol *sym = new Symbol(name, noPos, funcType); + sym->function = func; + symbolTable->AddFunction(sym); + return true; +} + + +/** Given an LLVM module, create ispc symbols for the functions in the + module. + */ +static void +lAddModuleSymbols(llvm::Module *module, SymbolTable *symbolTable) { +#if 0 + // FIXME: handle globals? + assert(module->global_empty()); +#endif + + llvm::Module::iterator iter; + for (iter = module->begin(); iter != module->end(); ++iter) { + llvm::Function *func = iter; + lCreateISPCSymbol(func, symbolTable); + } +} + +/** Declare the function symbol 'bool __is_compile_time_constant_mask(mask type)'. + This function will never be defined; it's just a placeholder + that will be handled during the optimization process. See the + discussion of the implementation of CompileTimeConstantResolvePass for + more details. + */ +static void +lDeclareCompileTimeConstant(llvm::Module *module) { + SourcePos noPos; + noPos.name = "__stdlib"; + + std::vector argTypes; + argTypes.push_back(LLVMTypes::MaskType); + + llvm::FunctionType *fType = + llvm::FunctionType::get(LLVMTypes::BoolType, argTypes, false); + llvm::Function *func = + llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, + "__is_compile_time_constant_mask", module); + func->setOnlyReadsMemory(true); + func->setDoesNotThrow(true); +} + + +/** Declare the 'pseudo-gather' functions. 
When the ispc front-end needs + to perform a gather, it generates a call to one of these functions, + which have signatures: + + varying int32 __pseudo_gather(varying int32 *, mask) + varying int64 __pseudo_gather(varying int64 *, mask) + + These functions are never actually implemented; the + GatherScatterFlattenOpt optimization pass finds them and then converts + them to make calls to the following functions, which represent gathers + from a common base pointer with offsets. This approach allows the + front-end to be relatively simple in how it emits address calculation + for gathers. + + varying int32 __pseudo_gather_base_offsets_32(uniform int32 *base, + int32 offsets, mask) + varying int64 __pseudo_gather_base_offsets_64(uniform int64 *base, + int64 offsets, mask) + + Then, the GSImprovementsPass optimizations finds these and either + converts them to native gather functions or converts them to vector + loads, if equivalent. + */ +static void +lDeclarePseudoGathers(llvm::Module *module) { + SourcePos noPos; + noPos.name = "__stdlib"; + + { + std::vector argTypes; + argTypes.push_back(LLVMTypes::VoidPointerVectorType); + argTypes.push_back(LLVMTypes::MaskType); + + llvm::FunctionType *fType = + llvm::FunctionType::get(LLVMTypes::Int32VectorType, argTypes, false); + llvm::Function *func = + llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, + "__pseudo_gather_32", module); + func->setOnlyReadsMemory(true); + func->setDoesNotThrow(true); + + fType = llvm::FunctionType::get(LLVMTypes::Int64VectorType, argTypes, false); + func = llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, + "__pseudo_gather_64", module); + func->setOnlyReadsMemory(true); + func->setDoesNotThrow(true); + } + + { + std::vector argTypes; + argTypes.push_back(LLVMTypes::VoidPointerType); + argTypes.push_back(LLVMTypes::Int32VectorType); + argTypes.push_back(LLVMTypes::MaskType); + + llvm::FunctionType *fType = + llvm::FunctionType::get(LLVMTypes::Int32VectorType, 
argTypes, false); + llvm::Function *func = + llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, + "__pseudo_gather_base_offsets_32", module); + func->setOnlyReadsMemory(true); + func->setDoesNotThrow(true); + + fType = llvm::FunctionType::get(LLVMTypes::Int64VectorType, argTypes, false); + func = llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, + "__pseudo_gather_base_offsets_64", module); + func->setOnlyReadsMemory(true); + func->setDoesNotThrow(true); + } +} + + +/** Similarly to the 'pseudo-gathers' defined by lDeclarePseudoGathers(), + we also declare (but never define) pseudo-scatter instructions with + signatures: + + void __pseudo_scatter_32(varying int32 *, varying int32 values, mask) + void __pseudo_scatter_64(varying int64 *, varying int64 values, mask) + + The GatherScatterFlattenOpt optimization pass also finds these and + transforms them to scatters like: + + void __pseudo_scatter_base_offsets_32(uniform int32 *base, + varying int32 offsets, varying int32 values, mask) + void __pseudo_scatter_base_offsets_64(uniform int64 *base, + varying int62 offsets, varying int64 values, mask) + + And the GSImprovementsPass in turn converts these to actual native + scatters or masked stores. 
+*/ +static void +lDeclarePseudoScatters(llvm::Module *module) { + SourcePos noPos; + noPos.name = "__stdlib"; + + { + std::vector argTypes; + argTypes.push_back(LLVMTypes::VoidPointerVectorType); + argTypes.push_back(LLVMTypes::Int32VectorType); + argTypes.push_back(LLVMTypes::MaskType); + + llvm::FunctionType *fType = + llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false); + llvm::Function *func = + llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, + "__pseudo_scatter_32", module); + func->setDoesNotThrow(true); + } + { + std::vector argTypes; + argTypes.push_back(LLVMTypes::VoidPointerVectorType); + argTypes.push_back(LLVMTypes::Int64VectorType); + argTypes.push_back(LLVMTypes::MaskType); + + llvm::FunctionType *fType = + llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false); + llvm::Function *func = + llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, + "__pseudo_scatter_64", module); + func->setDoesNotThrow(true); + } + + { + std::vector argTypes; + argTypes.push_back(LLVMTypes::VoidPointerType); + argTypes.push_back(LLVMTypes::Int32VectorType); + argTypes.push_back(LLVMTypes::Int32VectorType); + argTypes.push_back(LLVMTypes::MaskType); + + llvm::FunctionType *fType = + llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false); + llvm::Function *func = + llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, + "__pseudo_scatter_base_offsets_32", module); + func->setDoesNotThrow(true); + } + { + std::vector argTypes; + argTypes.push_back(LLVMTypes::VoidPointerType); + argTypes.push_back(LLVMTypes::Int32VectorType); + argTypes.push_back(LLVMTypes::Int64VectorType); + argTypes.push_back(LLVMTypes::MaskType); + + llvm::FunctionType *fType = + llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false); + llvm::Function *func = + llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, + "__pseudo_scatter_base_offsets_64", module); + func->setDoesNotThrow(true); + } +} + + +/** This function 
declares placeholder masked store functions for the + front-end to use. + + void __pseudo_masked_store_32(uniform int32 *ptr, varying int32 values, mask) + void __pseudo_masked_store_64(uniform int64 *ptr, varying int64 values, mask) + + These in turn are converted to native masked stores or to regular + stores (if the mask is all on) by the MaskedStoreOptPass optimization + pass. + */ +static void +lDeclarePseudoMaskedStore(llvm::Module *module) { + SourcePos noPos; + noPos.name = "__stdlib"; + + { + std::vector argTypes; + argTypes.push_back(LLVMTypes::Int32VectorPointerType); + argTypes.push_back(LLVMTypes::Int32VectorType); + argTypes.push_back(LLVMTypes::MaskType); + + llvm::FunctionType *fType = + llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false); + llvm::Function *func = + llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, + "__pseudo_masked_store_32", module); + func->setDoesNotThrow(true); + func->addFnAttr(llvm::Attribute::AlwaysInline); + func->setDoesNotCapture(1, true); + } + + { + std::vector argTypes; + argTypes.push_back(LLVMTypes::Int64VectorPointerType); + argTypes.push_back(LLVMTypes::Int64VectorType); + argTypes.push_back(LLVMTypes::MaskType); + + llvm::FunctionType *fType = + llvm::FunctionType::get(LLVMTypes::VoidType, argTypes, false); + llvm::Function *func = + llvm::Function::Create(fType, llvm::GlobalValue::ExternalLinkage, + "__pseudo_masked_store_64", module); + func->setDoesNotThrow(true); + func->addFnAttr(llvm::Attribute::AlwaysInline); + func->setDoesNotCapture(1, true); + } +} + + +/** This utility function takes serialized binary LLVM bitcode and adds its + definitions to the given module. Functions in the bitcode that can be + mapped to ispc functions are also added to the symbol table. + + @param bitcode Binary LLVM bitcode (e.g. 
the contents of a *.bc file) + @param length Length of the bitcode buffer + @param module Module to link the bitcode into + @param symbolTable Symbol table to add definitions to + */ +static void +lAddBitcode(const unsigned char *bitcode, int length, + llvm::Module *module, SymbolTable *symbolTable) { + std::string bcErr; + llvm::StringRef sb = llvm::StringRef((char *)bitcode, length); + llvm::MemoryBuffer *bcBuf = llvm::MemoryBuffer::getMemBuffer(sb); + llvm::Module *bcModule = llvm::ParseBitcodeFile(bcBuf, *g->ctx, &bcErr); + if (!bcModule) + Error(SourcePos(), "Error parsing stdlib bitcode: %s", bcErr.c_str()); + else { + std::string(linkError); + if (llvm::Linker::LinkModules(module, bcModule, &linkError)) + Error(SourcePos(), "Error linking stdlib bitcode: %s", linkError.c_str()); + lAddModuleSymbols(module, symbolTable); + } +} + + +/** Utility routine that defines a constant int32 with given value, adding + the symbol to both the ispc symbol table and the given LLVM module. + */ +static void +lDefineConstantInt(const char *name, int val, llvm::Module *module, + SymbolTable *symbolTable) { + Symbol *pw = new Symbol(name, SourcePos(), AtomicType::UniformConstInt32); + pw->isStatic = true; + pw->constValue = new ConstExpr(pw->type, val, SourcePos()); + const llvm::Type *ltype = LLVMTypes::Int32Type; + llvm::Constant *linit = LLVMInt32(val); + pw->storagePtr = new llvm::GlobalVariable(*module, ltype, true, + llvm::GlobalValue::InternalLinkage, + linit, pw->name.c_str()); + symbolTable->AddVariable(pw); +} + + +static void +lDefineProgramIndex(llvm::Module *module, SymbolTable *symbolTable) { + Symbol *pidx = new Symbol("programIndex", SourcePos(), + AtomicType::VaryingConstInt32); + pidx->isStatic = true; + + int pi[ISPC_MAX_NVEC]; + for (int i = 0; i < g->target.vectorWidth; ++i) + pi[i] = i; + pidx->constValue = new ConstExpr(pidx->type, pi, SourcePos()); + + const llvm::Type *ltype = LLVMTypes::Int32VectorType; + llvm::Constant *linit = LLVMInt32Vector(pi); + 
pidx->storagePtr = new llvm::GlobalVariable(*module, ltype, true, + llvm::GlobalValue::InternalLinkage, linit, + pidx->name.c_str()); + symbolTable->AddVariable(pidx); +} + + +void +DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module, + bool includeStdlibISPC) { + // Add the definitions from the compiled stdlib-c.c file + extern unsigned char stdlib_bitcode_c[]; + extern int stdlib_bitcode_c_length; + lAddBitcode(stdlib_bitcode_c, stdlib_bitcode_c_length, module, symbolTable); + + // Next, add the target's custom implementations of the various needed + // builtin functions (e.g. __masked_store_32(), etc). + switch (g->target.isa) { + case Target::SSE2: + extern unsigned char stdlib_bitcode_sse2[]; + extern int stdlib_bitcode_sse2_length; + lAddBitcode(stdlib_bitcode_sse2, stdlib_bitcode_sse2_length, module, + symbolTable); + break; + case Target::SSE4: + extern unsigned char stdlib_bitcode_sse4[]; + extern int stdlib_bitcode_sse4_length; + extern unsigned char stdlib_bitcode_sse4x2[]; + extern int stdlib_bitcode_sse4x2_length; + switch (g->target.vectorWidth) { + case 4: + lAddBitcode(stdlib_bitcode_sse4, stdlib_bitcode_sse4_length, + module, symbolTable); + break; + case 8: + lAddBitcode(stdlib_bitcode_sse4x2, stdlib_bitcode_sse4x2_length, + module, symbolTable); + break; + default: + FATAL("logic error in DefineStdlib"); + } + break; + case Target::AVX: + extern unsigned char stdlib_bitcode_avx[]; + extern int stdlib_bitcode_avx_length; + lAddBitcode(stdlib_bitcode_avx, stdlib_bitcode_avx_length, module, + symbolTable); + break; + default: + FATAL("logic error"); + } + + // Add a declaration of void *ISPCMalloc(int64_t). The user is + // responsible for linking in a definition of this if it's needed by + // the compiled program. 
+ { std::vector argTypes; + argTypes.push_back(llvm::Type::getInt64Ty(*ctx)); + llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidPointerType, + argTypes, false); + llvm::Function *func = + llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage, + "ISPCMalloc", module); + func->setDoesNotThrow(true); + } + + // Add a declaration of void ISPCFree(void *). The user is + // responsible for linking in a definition of this if it's needed by + // the compiled program. + { std::vector argTypes; + argTypes.push_back(LLVMTypes::VoidPointerType); + llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidPointerType, + argTypes, false); + llvm::Function *func = + llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage, + "ISPCFree", module); + func->setDoesNotThrow(true); + } + + // Add a declaration of void ISPCLaunch(void *funcPtr, void *data). + // The user is responsible for linking in a definition of this if it's + // needed by the compiled program. + { std::vector argTypes; + argTypes.push_back(LLVMTypes::VoidPointerType); + argTypes.push_back(LLVMTypes::VoidPointerType); + llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType, + argTypes, false); + llvm::Function *func = + llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage, + "ISPCLaunch", module); + func->setDoesNotThrow(true); + } + + // Add a declaration of void ISPCSync(). The user is responsible for + // linking in a definition of this if it's needed by the compiled + // program. + { + std::vector argTypes; + llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType, + argTypes, false); + llvm::Function *func = + llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage, + "ISPCSync", module); + func->setDoesNotThrow(true); + } + + // Add a declaration of void ISPCInstrument(void *, void *, int, int). + // The user is responsible for linking in a definition of this if it's + // needed by the compiled program. 
+ { + std::vector argTypes; + argTypes.push_back(llvm::PointerType::get(llvm::Type::getInt8Ty(*g->ctx), 0)); + argTypes.push_back(llvm::PointerType::get(llvm::Type::getInt8Ty(*g->ctx), 0)); + argTypes.push_back(LLVMTypes::Int32Type); + argTypes.push_back(LLVMTypes::Int32Type); + llvm::FunctionType *ftype = llvm::FunctionType::get(LLVMTypes::VoidType, + argTypes, false); + llvm::Function *func = + llvm::Function::Create(ftype, llvm::GlobalValue::ExternalLinkage, + "ISPCInstrument", module); + func->setDoesNotThrow(true); + } + + // Declare various placeholder functions that the optimizer will later + // find and replace with something more useful. + lDeclareCompileTimeConstant(module); + lDeclarePseudoGathers(module); + lDeclarePseudoScatters(module); + lDeclarePseudoMaskedStore(module); + + // define the 'programCount' builtin variable + lDefineConstantInt("programCount", g->target.vectorWidth, module, symbolTable); + + // define the 'programIndex' builtin + lDefineProgramIndex(module, symbolTable); + + // Define __math_lib stuff. This is used by stdlib.ispc, for example, to + // figure out which math routines to end up calling... + lDefineConstantInt("__math_lib", (int)g->mathLib, module, symbolTable); + lDefineConstantInt("__math_lib_ispc", (int)Globals::Math_ISPC, module, + symbolTable); + lDefineConstantInt("__math_lib_ispc_fast", (int)Globals::Math_ISPCFast, + module, symbolTable); + lDefineConstantInt("__math_lib_svml", (int)Globals::Math_SVML, module, + symbolTable); + lDefineConstantInt("__math_lib_system", (int)Globals::Math_System, module, + symbolTable); + + if (includeStdlibISPC) { + // If the user wants the standard library to be included, parse the + // serialized version of the stdlib.ispc file to get its definitions + // added. 
+ extern const char *stdlib_code; + yy_scan_string(stdlib_code); + yyparse(); + } +} diff --git a/builtins.h b/builtins.h new file mode 100644 index 00000000..485cc369 --- /dev/null +++ b/builtins.h @@ -0,0 +1,58 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +/** @file builtins.h + @brief Declarations of functions related to builtins and the + standard library +*/ + +#ifndef ISPC_STDLIB_H +#define ISPC_STDLIB_H 1 + +#include "ispc.h" + +/** Adds declarations and definitions of ispc standard library functions + and types to the given module. + + @param symbolTable SymbolTable in which to add symbol definitions for + stdlib stuff + @param ctx llvm::LLVMContext to use for getting types and the + like for standard library definitions + @param module Module in which to add the declarations/definitions + @param includeStdlib Indicates whether the definitions from the stdlib.ispc + file should be added to the module. + */ +void DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *module, + bool includeStdlib); + +#endif // ISPC_STDLIB_H diff --git a/ctx.cpp b/ctx.cpp new file mode 100644 index 00000000..eb2d4d81 --- /dev/null +++ b/ctx.cpp @@ -0,0 +1,1903 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/** @file ctx.cpp + @brief Implementation of the FunctionEmitContext class +*/ + +#include "ctx.h" +#include "util.h" +#include "llvmutil.h" +#include "type.h" +#include "stmt.h" +#include "expr.h" +#include "module.h" +#include "sym.h" +#include +#include +#include +#include +#include +#include + +/** This is a small utility structure that records information related to one + level of nested control flow. It's mostly used in correctly restoring + the mask and other state as we exit control flow nesting levels. +*/ +struct CFInfo { + /** Returns a new instance of the structure that represents entering an + 'if' statement */ + static CFInfo *GetIf(bool isUniform, llvm::Value *savedMask); + + /** Returns a new instance of the structure that represents entering a + loop. 
*/ + static CFInfo *GetLoop(bool isUniform, llvm::BasicBlock *breakTarget, + llvm::BasicBlock *continueTarget, + llvm::Value *savedBreakLanesPtr, + llvm::Value *savedContinueLanesPtr, + llvm::Value *savedMask, llvm::Value *savedLoopMask); + + bool IsIf() { return type == If; } + bool IsLoop() { return type == Loop; } + bool IsVaryingType() { return !isUniform; } + bool IsUniform() { return isUniform; } + + enum CFType { If, Loop }; + CFType type; + bool isUniform; + llvm::BasicBlock *savedBreakTarget, *savedContinueTarget; + llvm::Value *savedBreakLanesPtr, *savedContinueLanesPtr; + llvm::Value *savedMask, *savedLoopMask; + +private: + CFInfo(CFType t, bool uniformIf, llvm::Value *sm) { + assert(t == If); + type = t; + isUniform = uniformIf; + savedBreakTarget = savedContinueTarget = NULL; + savedBreakLanesPtr = savedContinueLanesPtr = NULL; + savedMask = savedLoopMask = sm; + } + CFInfo(CFType t, bool iu, llvm::BasicBlock *bt, llvm::BasicBlock *ct, + llvm::Value *sb, llvm::Value *sc, llvm::Value *sm, + llvm::Value *lm) { + assert(t == Loop); + type = t; + isUniform = iu; + savedBreakTarget = bt; + savedContinueTarget = ct; + savedBreakLanesPtr = sb; + savedContinueLanesPtr = sc; + savedMask = sm; + savedLoopMask = lm; + } +}; + + +CFInfo * +CFInfo::GetIf(bool isUniform, llvm::Value *savedMask) { + return new CFInfo(If, isUniform, savedMask); +} + + +CFInfo * +CFInfo::GetLoop(bool isUniform, llvm::BasicBlock *breakTarget, + llvm::BasicBlock *continueTarget, + llvm::Value *savedBreakLanesPtr, + llvm::Value *savedContinueLanesPtr, + llvm::Value *savedMask, llvm::Value *savedLoopMask) { + return new CFInfo(Loop, isUniform, breakTarget, continueTarget, + savedBreakLanesPtr, savedContinueLanesPtr, + savedMask, savedLoopMask); +} + +/////////////////////////////////////////////////////////////////////////// + +FunctionEmitContext::FunctionEmitContext(const Type *rt, llvm::Function *function, + Symbol *funSym, SourcePos firstStmtPos) { + /* Create a new basic block to 
store all of the allocas */ + allocaBlock = llvm::BasicBlock::Create(*g->ctx, "allocas", function, 0); + bblock = llvm::BasicBlock::Create(*g->ctx, "entry", function, 0); + /* But jump from it immediately into the real entry block */ + llvm::BranchInst::Create(bblock, allocaBlock); + + maskPtr = AllocaInst(LLVMTypes::MaskType, "mask_memory"); + StoreInst(LLVMMaskAllOn, maskPtr); + + funcStartPos = funSym->pos; + returnType = rt; + entryMask = NULL; + loopMask = NULL; + breakLanesPtr = continueLanesPtr = NULL; + breakTarget = continueTarget = NULL; + + returnedLanesPtr = AllocaInst(LLVMTypes::MaskType, "returned_lanes_memory"); + StoreInst(LLVMMaskAllOff, returnedLanesPtr); + + if (!returnType || returnType == AtomicType::Void) + returnValuePtr = NULL; + else { + const llvm::Type *ftype = returnType->LLVMType(g->ctx); + returnValuePtr = AllocaInst(ftype, "return_value_memory"); + // FIXME: don't do this store??? + StoreInst(llvm::Constant::getNullValue(ftype), returnValuePtr); + } + +#ifndef LLVM_2_8 + if (m->diBuilder) { + /* If debugging is enabled, tell the debug information emission + code about this new function */ + diFile = funcStartPos.GetDIFile(); + llvm::DIType retType = rt->GetDIType(diFile); + int flags = llvm::DIDescriptor::FlagPrototyped; // ?? + diFunction = m->diBuilder->createFunction(diFile, /* scope */ + function->getName(), // mangled + funSym->name, + diFile, + funcStartPos.first_line, + retType, + funSym->isStatic, + true, /* is definition */ + flags, + g->opt.level > 0, + function); + /* And start a scope representing the initial function scope */ + StartScope(); + } +#endif // LLVM_2_8 + + launchedTasks = false; + + // connect the function's mask memory to the __mask symbol + Symbol *maskSymbol = m->symbolTable->LookupVariable("__mask"); + assert(maskSymbol != NULL); + maskSymbol->storagePtr = maskPtr; + +#ifndef LLVM_2_8 + // add debugging info for __mask, programIndex, ...
+ if (m->diBuilder) { + maskSymbol->pos = funcStartPos; + EmitVariableDebugInfo(maskSymbol); + + llvm::DIFile file = funcStartPos.GetDIFile(); + Symbol *programIndexSymbol = m->symbolTable->LookupVariable("programIndex"); + assert(programIndexSymbol && programIndexSymbol->storagePtr); + m->diBuilder->createGlobalVariable(programIndexSymbol->name, + file, + funcStartPos.first_line, + programIndexSymbol->type->GetDIType(file), + true /* static */, + programIndexSymbol->storagePtr); + + Symbol *programCountSymbol = m->symbolTable->LookupVariable("programCount"); + assert(programCountSymbol); + m->diBuilder->createGlobalVariable(programCountSymbol->name, + file, + funcStartPos.first_line, + programCountSymbol->type->GetDIType(file), + true /* static */, + programCountSymbol->storagePtr); + } +#endif +} + + +FunctionEmitContext::~FunctionEmitContext() { + assert(controlFlowInfo.size() == 0); +#ifndef LLVM_2_8 + assert(debugScopes.size() == (m->diBuilder ? 1 : 0)); +#endif +} + + +llvm::BasicBlock * +FunctionEmitContext::GetCurrentBasicBlock() { + return bblock; +} + + +void +FunctionEmitContext::SetCurrentBasicBlock(llvm::BasicBlock *bb) { + bblock = bb; +} + + +llvm::Value * +FunctionEmitContext::GetMask() { + return LoadInst(maskPtr, NULL, "load_mask"); +} + + +void +FunctionEmitContext::SetEntryMask(llvm::Value *value) { + entryMask = value; + SetMask(value); +} + + +void +FunctionEmitContext::SetLoopMask(llvm::Value *value) { + loopMask = value; +} + + +void +FunctionEmitContext::SetMask(llvm::Value *value) { + StoreInst(value, maskPtr); +} + + +void +FunctionEmitContext::MaskAnd(llvm::Value *oldMask, llvm::Value *test) { + llvm::Value *mask = BinaryOperator(llvm::Instruction::And, oldMask, + test, "oldMask&test"); + SetMask(mask); +} + + +void +FunctionEmitContext::MaskAndNot(llvm::Value *oldMask, llvm::Value *test) { + llvm::Value *notTest = BinaryOperator(llvm::Instruction::Xor, test, LLVMMaskAllOn, + "~test"); + llvm::Value *mask = 
BinaryOperator(llvm::Instruction::And, oldMask, notTest, + "oldMask&~test"); + SetMask(mask); +} + + +void +FunctionEmitContext::BranchIfMaskAny(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse) { + assert(bblock != NULL); + llvm::Value *any = Any(GetMask()); + BranchInst(btrue, bfalse, any); + // It's illegal to add any additional instructions to the basic block + // now that it's terminated, so set bblock to NULL to be safe + bblock = NULL; +} + + +void +FunctionEmitContext::BranchIfMaskAll(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse) { + assert(bblock != NULL); + llvm::Value *all = All(GetMask()); + BranchInst(btrue, bfalse, all); + // It's illegal to add any additional instructions to the basic block + // now that it's terminated, so set bblock to NULL to be safe + bblock = NULL; +} + + +void +FunctionEmitContext::BranchIfMaskNone(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse) { + assert(bblock != NULL); + // switch sense of true/false bblocks + BranchIfMaskAny(bfalse, btrue); + // It's illegal to add any additional instructions to the basic block + // now that it's terminated, so set bblock to NULL to be safe + bblock = NULL; +} + + +void +FunctionEmitContext::StartUniformIf(llvm::Value *oldMask) { + controlFlowInfo.push_back(CFInfo::GetIf(true, oldMask)); +} + + +void +FunctionEmitContext::StartVaryingIf(llvm::Value *oldMask) { + controlFlowInfo.push_back(CFInfo::GetIf(false, oldMask)); +} + + +void +FunctionEmitContext::EndIf() { + // Make sure we match up with a Start{Uniform,Varying}If(). + assert(controlFlowInfo.size() > 0 && controlFlowInfo.back()->IsIf()); + CFInfo *ci = controlFlowInfo.back(); + controlFlowInfo.pop_back(); + + // 'uniform' ifs don't change the mask so we only need to restore the + // mask going into the if for 'varying' if statements + if (!ci->IsUniform() && bblock != NULL) { + // We can't just restore the mask as it was going into the 'if' + // statement. 
First we have to take into account any program + // instances that have executed 'return' statements; the restored + // mask must be off for those lanes. + restoreMaskGivenReturns(ci->savedMask); + + // If the 'if' statement is inside a loop with a 'varying' + // condition, we also need to account for any break or continue + // statements that executed inside the 'if' statement; we also must + // leave the lane masks for the program instances that ran those + // off after we restore the mask after the 'if'. The code below + // ends up being optimized out in the case that there were no break + // or continue statements (and breakLanesPtr and continueLanesPtr + // have their initial 'all off' values), so we don't need to check + // for that here. + if (breakLanesPtr != NULL) { + assert(continueLanesPtr != NULL); + + // newMask = (oldMask & ~(breakLanes | continueLanes)) + llvm::Value *oldMask = GetMask(); + llvm::Value *breakLanes = LoadInst(breakLanesPtr, NULL, + "break_lanes"); + llvm::Value *continueLanes = LoadInst(continueLanesPtr, NULL, + "continue_lanes"); + llvm::Value *breakOrContinueLanes = + BinaryOperator(llvm::Instruction::Or, breakLanes, continueLanes, + "break|continue_lanes"); + llvm::Value *notBreakOrContinue = NotOperator(breakOrContinueLanes, + "!(break|continue)_lanes"); + llvm::Value *newMask = + BinaryOperator(llvm::Instruction::And, oldMask, notBreakOrContinue, + "new_mask"); + SetMask(newMask); + } + } +} + + +void +FunctionEmitContext::StartLoop(llvm::BasicBlock *bt, llvm::BasicBlock *ct, + bool uniformCF, llvm::Value *oldMask) { + // Store the current values of various loop-related state so that we + // can restore it when we exit this loop.
+ controlFlowInfo.push_back(CFInfo::GetLoop(uniformCF, breakTarget, + continueTarget, breakLanesPtr, + continueLanesPtr, oldMask, loopMask)); + if (uniformCF) + // If the loop has a uniform condition, we don't need to track + // which lanes 'break' or 'continue'; all of the running ones go + // together, so we just jump + breakLanesPtr = continueLanesPtr = NULL; + else { + // For loops with varying conditions, allocate space to store masks + // that record which lanes have done these + continueLanesPtr = AllocaInst(LLVMTypes::MaskType, "continue_lanes_memory"); + StoreInst(LLVMMaskAllOff, continueLanesPtr); + breakLanesPtr = AllocaInst(LLVMTypes::MaskType, "break_lanes_memory"); + StoreInst(LLVMMaskAllOff, breakLanesPtr); + } + + breakTarget = bt; + continueTarget = ct; + loopMask = NULL; // this better be set by the loop! +} + + +void +FunctionEmitContext::EndLoop() { + assert(controlFlowInfo.size() && !controlFlowInfo.back()->IsIf()); + CFInfo *ci = controlFlowInfo.back(); + controlFlowInfo.pop_back(); + + // Restore the break/continue state information to what it was before + // we went into this loop. + breakTarget = ci->savedBreakTarget; + continueTarget = ci->savedContinueTarget; + breakLanesPtr = ci->savedBreakLanesPtr; + continueLanesPtr = ci->savedContinueLanesPtr; + loopMask = ci->savedLoopMask; + + if (!ci->IsUniform()) + // If the loop had a 'uniform' test, then it didn't make any + // changes to the mask so there's nothing to restore. If it had a + // varying test, we need to restore the mask to what it was going + // into the loop, but still leaving off any lanes that executed a + // 'return' statement. + restoreMaskGivenReturns(ci->savedMask); +} + + +void +FunctionEmitContext::restoreMaskGivenReturns(llvm::Value *oldMask) { + if (!bblock) + return; + + // Restore the mask to the given old mask, but leave off any lanes that + // executed a return statement. 
+ // newMask = (oldMask & ~returnedLanes) + llvm::Value *returnedLanes = LoadInst(returnedLanesPtr, NULL, "returned_lanes"); + llvm::Value *notReturned = NotOperator(returnedLanes, "~returned_lanes"); + llvm::Value *newMask = BinaryOperator(llvm::Instruction::And, + oldMask, notReturned, "new_mask"); + SetMask(newMask); +} + + +void +FunctionEmitContext::Break(bool doCoherenceCheck) { + if (breakTarget == NULL) { + Error(currentPos, "\"break\" statement is illegal outside of for/while/do loops."); + return; + } + + // If all of the enclosing 'if' tests in the loop have uniform control + // flow or if we can tell that the mask is all on, then we can just + // jump to the break location. + if (ifsInLoopAllUniform() || GetMask() == LLVMMaskAllOn) { + BranchInst(breakTarget); + if (ifsInLoopAllUniform() && doCoherenceCheck) + Warning(currentPos, "Coherent break statement not necessary in fully uniform " + "control flow."); + // Set bblock to NULL since the jump has terminated the basic block + bblock = NULL; + } + else { + // Otherwise we need to update the mask of the lanes that have + // executed a 'break' statement: + // breakLanes = breakLanes | mask + assert(breakLanesPtr != NULL); + llvm::Value *mask = GetMask(); + llvm::Value *breakMask = LoadInst(breakLanesPtr, NULL, "break_mask"); + llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or, + mask, breakMask, "mask|break_mask"); + StoreInst(newMask, breakLanesPtr); + + // Set the current mask to be all off, just in case there are any + // statements in the same scope after the 'break'. Most of time + // this will be optimized away since we'll likely end the scope of + // an 'if' statement and restore the mask then. + SetMask(LLVMMaskAllOff); + + if (doCoherenceCheck) + // If the user has indicated that this is a 'coherent' break + // statement, then check to see if the mask is all off. 
If so, + // we have to conservatively jump to the continueTarget, not + // the breakTarget, since part of the reason the mask is all + // off may be due to 'continue' statements that executed in the + // current loop iteration. + // FIXME: if the loop only has break statements and no + // continues, we can jump to breakTarget in that case. + jumpIfAllLoopLanesAreDone(continueTarget); + } +} + + +void +FunctionEmitContext::Continue(bool doCoherenceCheck) { + if (!continueTarget) { + Error(currentPos, "\"continue\" statement illegal outside of for/while/do loops."); + return; + } + + if (ifsInLoopAllUniform() || GetMask() == LLVMMaskAllOn) { + // Similarly to 'break' statements, we can immediately jump to the + // continue target if we're only in 'uniform' control flow within + // loop or if we can tell that the mask is all on. + AddInstrumentationPoint("continue: uniform CF, jumped"); + if (ifsInLoopAllUniform() && doCoherenceCheck) + Warning(currentPos, "Coherent continue statement not necessary in fully uniform " + "control flow."); + BranchInst(continueTarget); + bblock = NULL; + } + else { + // Otherwise update the stored value of which lanes have 'continue'd. + // continueLanes = continueLanes | mask + assert(continueLanesPtr); + llvm::Value *mask = GetMask(); + llvm::Value *continueMask = + LoadInst(continueLanesPtr, NULL, "continue_mask"); + llvm::Value *newMask = BinaryOperator(llvm::Instruction::Or, + mask, continueMask, "mask|continueMask"); + StoreInst(newMask, continueLanesPtr); + + // And set the current mask to be all off in case there are any + // statements in the same scope after the 'continue' + SetMask(LLVMMaskAllOff); + + if (doCoherenceCheck) + // If this is a 'coherent continue' statement, then emit the + // code to see if all of the lanes are now off due to + // breaks/continues and jump to the continue target if so. 
+ jumpIfAllLoopLanesAreDone(continueTarget); + } +} + + +/** This function checks to see if all of the 'if' statements (if any) + between the current scope and the first enclosing loop have 'uniform' + tests. + */ +bool +FunctionEmitContext::ifsInLoopAllUniform() const { + assert(controlFlowInfo.size() > 0); + // Go backwards through controlFlowInfo, since we add new nested scopes + // to the back. Stop once we come to the first enclosing loop. + int i = controlFlowInfo.size() - 1; + while (i >= 0 && controlFlowInfo[i]->type != CFInfo::Loop) { + if (controlFlowInfo[i]->isUniform == false) + // Found a scope due to an 'if' statement with a varying test + return false; + --i; + } + assert(i >= 0); // else we didn't find a loop! + return true; +} + + +void +FunctionEmitContext::jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target) { + // Check to see if (returned lanes | continued lanes | break lanes) is + // equal to the value of mask at the start of the loop iteration. If + // so, everyone is done and we can jump to the given target + llvm::Value *returned = LoadInst(returnedLanesPtr, NULL, "returned_lanes"); + llvm::Value *continued = LoadInst(continueLanesPtr, NULL, "continue_lanes"); + llvm::Value *breaked = LoadInst(breakLanesPtr, NULL, "break_lanes"); + llvm::Value *returnedOrContinued = BinaryOperator(llvm::Instruction::Or, + returned, continued, + "returned|continued"); + llvm::Value *returnedOrContinuedOrBreaked = + BinaryOperator(llvm::Instruction::Or, returnedOrContinued, + breaked, "returned|continued"); + + // Do we match the mask at loop entry? + llvm::Value *allRCB = MasksAllEqual(returnedOrContinuedOrBreaked, loopMask); + llvm::BasicBlock *bAll = CreateBasicBlock("all_continued_or_breaked"); + llvm::BasicBlock *bNotAll = CreateBasicBlock("not_all_continued_or_breaked"); + BranchInst(bAll, bNotAll, allRCB); + + // If so, have an extra basic block along the way to add + // instrumentation, if the user asked for it. 
+ bblock = bAll; + AddInstrumentationPoint("break/continue: all dynamically went"); + BranchInst(target); + + // And set the current basic block to a new one for future instructions + // for the path where we weren't able to jump + bblock = bNotAll; + AddInstrumentationPoint("break/continue: not all went"); +} + + +void +FunctionEmitContext::RestoreContinuedLanes() { + if (continueLanesPtr == NULL) + return; + + // mask = mask & continueFlags + llvm::Value *mask = GetMask(); + llvm::Value *continueMask = LoadInst(continueLanesPtr, NULL, "continue_mask"); + llvm::Value *orMask = BinaryOperator(llvm::Instruction::Or, + mask, continueMask, "mask|continue_mask"); + SetMask(orMask); + + // continueLanes = 0 + StoreInst(LLVMMaskAllOff, continueLanesPtr); +} + + +int +FunctionEmitContext::VaryingCFDepth() const { + int sum = 0; + for (unsigned int i = 0; i < controlFlowInfo.size(); ++i) + if (controlFlowInfo[i]->IsVaryingType()) + ++sum; + return sum; +} + + +void +FunctionEmitContext::CurrentLanesReturned(Expr *expr, bool doCoherenceCheck) { + if (returnType == AtomicType::Void) { + if (expr != NULL) + Error(expr->pos, "Can't return non-void type \"%s\" from void function.", + expr->GetType()->GetString().c_str()); + } + else { + if (expr == NULL) { + Error(funcStartPos, + "Must provide return value for return statement for non-void function."); + return; + } + + // Use a masked store to store the value of the expression in the + // return value memory; this preserves the return values from other + // lanes that may have executed return statements previously. 
+ Expr *r = expr->TypeConv(returnType, "return statement"); + if (r != NULL) { + llvm::Value *retVal = r->GetValue(this); + StoreInst(retVal, returnValuePtr, GetMask(), returnType); + } + } + + if (VaryingCFDepth() == 0) { + // If there is only uniform control flow between us and the + // function entry, then it's guaranteed that all lanes are running, + // so we can just emit a true return instruction + AddInstrumentationPoint("return: uniform control flow"); + ReturnInst(); + } + else { + // Otherwise we update the returnedLanes value by ORing it with + // the current lane mask. + llvm::Value *oldReturnedLanes = LoadInst(returnedLanesPtr, NULL, + "old_returned_lanes"); + llvm::Value *newReturnedLanes = BinaryOperator(llvm::Instruction::Or, + oldReturnedLanes, + GetMask(), "old_mask|returned_lanes"); + + // For 'coherent' return statements, emit code to check if all + // lanes have returned + if (doCoherenceCheck) { + // if newReturnedLanes == entryMask, get out of here! + llvm::Value *cmp = MasksAllEqual(entryMask, newReturnedLanes); + llvm::BasicBlock *bDoReturn = CreateBasicBlock("do_return"); + llvm::BasicBlock *bNoReturn = CreateBasicBlock("no_return"); + BranchInst(bDoReturn, bNoReturn, cmp); + + bblock = bDoReturn; + AddInstrumentationPoint("return: all lanes have returned"); + ReturnInst(); + + bblock = bNoReturn; + } + // Otherwise update returnedLanesPtr and turn off all of the lanes + // in the current mask so that any subsequent statements in the + // same scope after the return have no effect + StoreInst(newReturnedLanes, returnedLanesPtr); + AddInstrumentationPoint("return: some but not all lanes have returned"); + SetMask(LLVMMaskAllOff); + } +} + + +llvm::Value * +FunctionEmitContext::Any(llvm::Value *mask) { + llvm::Value *mmval = LaneMask(mask); + return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE, mmval, + LLVMInt32(0), "any_mm_cmp"); +} + + +llvm::Value * +FunctionEmitContext::All(llvm::Value *mask) { + llvm::Value *mmval =
LaneMask(mask); + return CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, mmval, + LLVMInt32((1<target.vectorWidth)-1), "all_mm_cmp"); +} + + +llvm::Value * +FunctionEmitContext::LaneMask(llvm::Value *v) { + // Call the target-dependent movmsk function to turn the vector mask + // into an i32 value + std::vector *mm = m->symbolTable->LookupFunction("__movmsk"); + assert(mm && mm->size() == 1); + llvm::Function *fmm = (*mm)[0]->function; + return CallInst(fmm, v, "val_movmsk"); +} + + +llvm::Value * +FunctionEmitContext::MasksAllEqual(llvm::Value *v1, llvm::Value *v2) { + // Compare the two masks to get a vector of i1s + llvm::Value *cmp = CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_EQ, + v1, v2, "v1==v2"); + // Turn that into a bool vector type (often i32s) + cmp = I1VecToBoolVec(cmp); + // And see if it's all on + return All(cmp); +} + + +llvm::Value * +FunctionEmitContext::GetStringPtr(const std::string &str) { + llvm::Constant *lstr = llvm::ConstantArray::get(*g->ctx, str); + llvm::GlobalValue::LinkageTypes linkage = llvm::GlobalValue::InternalLinkage; + llvm::Value *lstrPtr = new llvm::GlobalVariable(*m->module, lstr->getType(), + true /*isConst*/, + linkage, lstr, "__str"); + return new llvm::BitCastInst(lstrPtr, LLVMTypes::VoidPointerType, + "str_void_ptr", bblock); +} + + +llvm::BasicBlock * +FunctionEmitContext::CreateBasicBlock(const char *name) { + llvm::Function *function = bblock->getParent(); + return llvm::BasicBlock::Create(*g->ctx, name, function); +} + + +llvm::Value * +FunctionEmitContext::I1VecToBoolVec(llvm::Value *b) { + const llvm::ArrayType *at = llvm::dyn_cast(b->getType()); + if (at) { + // If we're given an array of vectors of i1s, then do the + // conversion for each of the elements + const llvm::Type *boolArrayType = + llvm::ArrayType::get(LLVMTypes::BoolVectorType, at->getNumElements()); + llvm::Value *ret = llvm::UndefValue::get(boolArrayType); + + for (unsigned int i = 0; i < at->getNumElements(); ++i) { + llvm::Value 
*elt = ExtractInst(b, i); + llvm::Value *sext = SExtInst(elt, LLVMTypes::BoolVectorType, + "val_to_boolvec32"); + ret = InsertInst(ret, sext, i); + } + return ret; + } + else + return SExtInst(b, LLVMTypes::BoolVectorType, "val_to_boolvec32"); +} + + +llvm::Value * +FunctionEmitContext::EmitMalloc(const llvm::Type *ty) { + // Emit code to compute the size of the given type using a GEP with a + // NULL base pointer, indexing one element of the given type, and + // casting the resulting 'pointer' to an int giving its size. + const llvm::Type *ptrType = llvm::PointerType::get(ty, 0); + llvm::Value *nullPtr = llvm::Constant::getNullValue(ptrType); + llvm::Value *index[1] = { LLVMInt32(1) }; + llvm::Value *poffset = llvm::GetElementPtrInst::Create(nullPtr, &index[0], &index[1], + "offset_ptr", bblock); + AddDebugPos(poffset); + llvm::Value *sizeOf = PtrToIntInst(poffset, LLVMTypes::Int64Type, "offset_int"); + + // And given the size, call the malloc function + llvm::Function *fmalloc = m->module->getFunction("ISPCMalloc"); + assert(fmalloc != NULL); + llvm::Value *mem = CallInst(fmalloc, sizeOf, "raw_argmem"); + // Cast the void * back to the result pointer type + return BitCastInst(mem, ptrType, "mem_bitcast"); +} + + +void +FunctionEmitContext::EmitFree(llvm::Value *ptr) { + llvm::Value *freeArg = BitCastInst(ptr, LLVMTypes::VoidPointerType, + "argmemfree"); + llvm::Function *ffree = m->module->getFunction("ISPCFree"); + assert(ffree != NULL); + CallInst(ffree, freeArg); +} + + +static llvm::Value * +lGetStringAsValue(llvm::BasicBlock *bblock, const char *s) { + llvm::Constant *sConstant = llvm::ConstantArray::get(*g->ctx, s); + llvm::Value *sPtr = new llvm::GlobalVariable(*m->module, sConstant->getType(), + true /* const */, + llvm::GlobalValue::InternalLinkage, + sConstant, s); + llvm::Value *indices[2] = { LLVMInt32(0), LLVMInt32(0) }; + return llvm::GetElementPtrInst::Create(sPtr, &indices[0], &indices[2], + "sptr", bblock); +} + + +void 
+FunctionEmitContext::AddInstrumentationPoint(const char *note) { + assert(note != NULL); + if (!g->emitInstrumentation) + return; + + std::vector args; + // arg 1: filename as string + args.push_back(lGetStringAsValue(bblock, currentPos.name)); + // arg 2: provided note + args.push_back(lGetStringAsValue(bblock, note)); + // arg 3: line number + args.push_back(LLVMInt32(currentPos.first_line)); + // arg 4: current mask, movmsk'ed down to an int32 + args.push_back(LaneMask(GetMask())); + + llvm::Function *finst = m->module->getFunction("ISPCInstrument"); + CallInst(finst, args, ""); +} + + +void +FunctionEmitContext::SetDebugPos(SourcePos pos) { + currentPos = pos; +} + + +SourcePos +FunctionEmitContext::GetDebugPos() const { + return currentPos; +} + + +void +FunctionEmitContext::AddDebugPos(llvm::Value *value, const SourcePos *pos, + llvm::DIScope *scope) { +#ifndef LLVM_2_8 + llvm::Instruction *inst = llvm::dyn_cast(value); + if (inst != NULL && m->diBuilder) { + SourcePos p = pos ? *pos : currentPos; + if (p.first_line != 0) + // If first_line == 0, then we're in the middle of setting up + // the standard library or the like; don't add debug positions + // for those functions + inst->setDebugLoc(llvm::DebugLoc::get(p.first_line, p.first_column, + scope ? 
*scope : GetDIScope())); + } +#endif +} + + +void +FunctionEmitContext::StartScope() { +#ifndef LLVM_2_8 + if (m->diBuilder != NULL) { + llvm::DIScope parentScope; + if (debugScopes.size() > 0) + parentScope = debugScopes.back(); + else + parentScope = diFunction; + + llvm::DILexicalBlock lexicalBlock = + m->diBuilder->createLexicalBlock(parentScope, diFile, + currentPos.first_line, + currentPos.first_column); + debugScopes.push_back(lexicalBlock); + } +#endif +} + + +void +FunctionEmitContext::EndScope() { +#ifndef LLVM_2_8 + if (m->diBuilder != NULL) { + assert(debugScopes.size() > 0); + debugScopes.pop_back(); + } +#endif +} + + +llvm::DIScope +FunctionEmitContext::GetDIScope() const { + assert(debugScopes.size() > 0); + return debugScopes.back(); +} + + +void +FunctionEmitContext::EmitVariableDebugInfo(Symbol *sym) { +#ifndef LLVM_2_8 + if (m->diBuilder == NULL) + return; + + llvm::DIScope scope = GetDIScope(); + llvm::DIVariable var = + m->diBuilder->createLocalVariable(llvm::dwarf::DW_TAG_auto_variable, + scope, + sym->name, + sym->pos.GetDIFile(), + sym->pos.first_line, + sym->type->GetDIType(scope), + true /* preserve through opts */); + llvm::Instruction *declareInst = + m->diBuilder->insertDeclare(sym->storagePtr, var, bblock); + AddDebugPos(declareInst, &sym->pos, &scope); +#endif +} + + +void +FunctionEmitContext::EmitFunctionParameterDebugInfo(Symbol *sym) { +#ifndef LLVM_2_8 + if (m->diBuilder == NULL) + return; + + llvm::DIScope scope = diFunction; + llvm::DIVariable var = + m->diBuilder->createLocalVariable(llvm::dwarf::DW_TAG_arg_variable, + scope, + sym->name, + sym->pos.GetDIFile(), + sym->pos.first_line, + sym->type->GetDIType(scope), + true /* preserve through opts */); + llvm::Instruction *declareInst = + m->diBuilder->insertDeclare(sym->storagePtr, var, bblock); + AddDebugPos(declareInst, &sym->pos, &scope); +#endif +} + + +/** If the given type is an array of vector types, then it's the + representation of an ispc VectorType with varying 
elements. If it is + one of these, return the array size (i.e. the VectorType's size). + Otherwise return zero. + */ +static int +lArrayVectorWidth(const llvm::Type *t) { + const llvm::ArrayType *arrayType = llvm::dyn_cast(t); + if (arrayType == NULL) + return 0; + + // We shouldn't be seeing arrays of anything but vectors being passed + // to things like FunctionEmitContext::BinaryOperator() as operands + const llvm::VectorType *vectorElementType = + llvm::dyn_cast(arrayType->getElementType()); + assert(vectorElementType != NULL && + (int)vectorElementType->getNumElements() == g->target.vectorWidth); + return (int)arrayType->getNumElements(); +} + + +llvm::Value * +FunctionEmitContext::BinaryOperator(llvm::Instruction::BinaryOps inst, + llvm::Value *v0, llvm::Value *v1, + const char *name) { + if (v0 == NULL || v1 == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + assert(v0->getType() == v1->getType()); + const llvm::Type *type = v0->getType(); + int arraySize = lArrayVectorWidth(type); + if (arraySize == 0) { + llvm::Instruction *bop = + llvm::BinaryOperator::Create(inst, v0, v1, name ? name : "", bblock); + AddDebugPos(bop); + return bop; + } + else { + // If this is an ispc VectorType, apply the binary operator to each + // of the elements of the array (which in turn should be either + // scalar types or llvm::VectorTypes.) + llvm::Value *ret = llvm::UndefValue::get(type); + for (int i = 0; i < arraySize; ++i) { + llvm::Value *a = ExtractInst(v0, i); + llvm::Value *b = ExtractInst(v1, i); + llvm::Value *op = BinaryOperator(inst, a, b); + ret = InsertInst(ret, op, i); + } + return ret; + } +} + + +llvm::Value * +FunctionEmitContext::NotOperator(llvm::Value *v, const char *name) { + if (v == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + // Similarly to BinaryOperator, do the operation on all the elements of + // the array if we're given an array type; otherwise just do the + // regular llvm operation. 
+ const llvm::Type *type = v->getType(); + int arraySize = lArrayVectorWidth(type); + if (arraySize == 0) { + llvm::Instruction *binst = + llvm::BinaryOperator::CreateNot(v, name ? name : "not", bblock); + AddDebugPos(binst); + return binst; + } + else { + llvm::Value *ret = llvm::UndefValue::get(type); + for (int i = 0; i < arraySize; ++i) { + llvm::Value *a = ExtractInst(v, i); + llvm::Value *op = + llvm::BinaryOperator::CreateNot(a, name ? name : "not", bblock); + AddDebugPos(op); + ret = InsertInst(ret, op, i); + } + return ret; + } +} + + +// Given the llvm Type that represents an ispc VectorType, return an +// equally-shaped type with boolean elements. (This is the type that will +// be returned from CmpInst with ispc VectorTypes). +static const llvm::Type * +lGetMatchingBoolVectorType(const llvm::Type *type) { + const llvm::ArrayType *arrayType = + llvm::dyn_cast(type); + // should only be called for vector typed stuff... + assert(arrayType != NULL); + + const llvm::VectorType *vectorElementType = + llvm::dyn_cast(arrayType->getElementType()); + assert(vectorElementType != NULL && + (int)vectorElementType->getNumElements() == g->target.vectorWidth); + + const llvm::Type *base = llvm::VectorType::get(LLVMTypes::BoolType, + g->target.vectorWidth); + return llvm::ArrayType::get(base, arrayType->getNumElements()); +} + + +llvm::Value * +FunctionEmitContext::CmpInst(llvm::Instruction::OtherOps inst, + llvm::CmpInst::Predicate pred, + llvm::Value *v0, llvm::Value *v1, + const char *name) { + if (v0 == NULL || v1 == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + assert(v0->getType() == v1->getType()); + const llvm::Type *type = v0->getType(); + int arraySize = lArrayVectorWidth(type); + if (arraySize == 0) { + llvm::Instruction *ci = + llvm::CmpInst::Create(inst, pred, v0, v1, name ? 
name : "cmp", + bblock); + AddDebugPos(ci); + return ci; + } + else { + const llvm::Type *boolType = lGetMatchingBoolVectorType(type); + llvm::Value *ret = llvm::UndefValue::get(boolType); + for (int i = 0; i < arraySize; ++i) { + llvm::Value *a = ExtractInst(v0, i); + llvm::Value *b = ExtractInst(v1, i); + llvm::Value *op = CmpInst(inst, pred, a, b, name); + ret = InsertInst(ret, op, i); + } + return ret; + } +} + + +llvm::Value * +FunctionEmitContext::BitCastInst(llvm::Value *value, const llvm::Type *type, + const char *name) { + if (value == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + const llvm::Type *valType = value->getType(); + const llvm::ArrayType *at = llvm::dyn_cast(valType); + if (at && llvm::isa(at->getElementType())) { + // If we're bitcasting an array of pointers, we have a varying + // lvalue; apply the corresponding bitcast to each of the + // individual pointers and return the result array. + assert((int)at->getNumElements() == g->target.vectorWidth); + + llvm::Value *ret = + llvm::UndefValue::get(llvm::ArrayType::get(type, g->target.vectorWidth)); + for (int i = 0; i < g->target.vectorWidth; ++i) { + llvm::Value *elt = ExtractInst(value, i); + llvm::Value *bc = BitCastInst(elt, type, name); + ret = InsertInst(ret, bc, i); + } + return ret; + } + else { + llvm::Instruction *inst = + new llvm::BitCastInst(value, type, name ? name : "bitcast", bblock); + AddDebugPos(inst); + return inst; + } +} + + +llvm::Instruction * +FunctionEmitContext::PtrToIntInst(llvm::Value *value, const llvm::Type *type, + const char *name) { + if (value == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + // TODO: we should probably handle the array case as in + // e.g. BitCastInst(), but we don't currently need that functionality + llvm::Instruction *inst = + new llvm::PtrToIntInst(value, type, name ? 
name : "ptr2int", bblock); + AddDebugPos(inst); + return inst; +} + + +llvm::Instruction * +FunctionEmitContext::IntToPtrInst(llvm::Value *value, const llvm::Type *type, + const char *name) { + if (value == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + // TODO: we should probably handle the array case as in + // e.g. BitCastInst(), but we don't currently need that functionality + llvm::Instruction *inst = + new llvm::IntToPtrInst(value, type, name ? name : "int2ptr", bblock); + AddDebugPos(inst); + return inst; +} + + +llvm::Instruction * +FunctionEmitContext::TruncInst(llvm::Value *value, const llvm::Type *type, + const char *name) { + if (value == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + // TODO: we should probably handle the array case as in + // e.g. BitCastInst(), but we don't currently need that functionality + llvm::Instruction *inst = + new llvm::TruncInst(value, type, name ? name : "trunc", bblock); + AddDebugPos(inst); + return inst; +} + + +llvm::Instruction * +FunctionEmitContext::CastInst(llvm::Instruction::CastOps op, llvm::Value *value, + const llvm::Type *type, const char *name) { + if (value == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + // TODO: we should probably handle the array case as in + // e.g. BitCastInst(), but we don't currently need that functionality + llvm::Instruction *inst = + llvm::CastInst::Create(op, value, type, name ? name : "cast", bblock); + AddDebugPos(inst); + return inst; +} + + +llvm::Instruction * +FunctionEmitContext::FPCastInst(llvm::Value *value, const llvm::Type *type, + const char *name) { + if (value == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + // TODO: we should probably handle the array case as in + // e.g. BitCastInst(), but we don't currently need that functionality + llvm::Instruction *inst = + llvm::CastInst::CreateFPCast(value, type, name ? 
name : "fpcast", bblock); + AddDebugPos(inst); + return inst; +} + + +llvm::Instruction * +FunctionEmitContext::SExtInst(llvm::Value *value, const llvm::Type *type, + const char *name) { + if (value == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + // TODO: we should probably handle the array case as in + // e.g. BitCastInst(), but we don't currently need that functionality + llvm::Instruction *inst = + new llvm::SExtInst(value, type, name ? name : "sext", bblock); + AddDebugPos(inst); + return inst; +} + + +llvm::Instruction * +FunctionEmitContext::ZExtInst(llvm::Value *value, const llvm::Type *type, + const char *name) { + if (value == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + // TODO: we should probably handle the array case as in + // e.g. BitCastInst(), but we don't currently need that functionality + llvm::Instruction *inst = + new llvm::ZExtInst(value, type, name ? name : "zext", bblock); + AddDebugPos(inst); + return inst; +} + + +llvm::Value * +FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0, + llvm::Value *index1, const char *name) { + if (basePtr == NULL || index0 == NULL || index1 == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + // FIXME: do we need need to handle the case of the first index being + // varying? It's currently needed... 
+ assert(!llvm::isa(index0->getType())); + + const llvm::Type *basePtrType = basePtr->getType(); + const llvm::ArrayType *baseArrayType = + llvm::dyn_cast(basePtrType); + bool baseIsVaryingTypePointer = (baseArrayType != NULL) && + llvm::isa(baseArrayType->getElementType()); + bool indexIsVaryingType = llvm::isa(index1->getType()); + + if (!indexIsVaryingType && !baseIsVaryingTypePointer) { + // The easy case: both the base pointer and the indices are + // uniform, so just emit the regular LLVM GEP instruction + llvm::Value *indices[2] = { index0, index1 }; + llvm::Instruction *inst = + llvm::GetElementPtrInst::Create(basePtr, &indices[0], &indices[2], + name ? name : "gep", bblock); + AddDebugPos(inst); + return inst; + } + else { + // We have a varying pointer and/or indices; emit the appropriate + // GEP for each of the program instances + llvm::Value *lret = NULL; + for (int i = 0; i < g->target.vectorWidth; ++i) { + // Get the index, either using the same one if it's uniform or + // the one for this lane if it's varying + llvm::Value *indexElt; + if (indexIsVaryingType) + indexElt = ExtractInst(index1, i, "get_array_index"); + else + indexElt = index1; + + // Similarly figure out the appropriate base pointer + llvm::Value *aptr; + if (baseIsVaryingTypePointer) + aptr = ExtractInst(basePtr, i, "get_array_index"); + else + aptr = basePtr; + + // Do the GEP for this lane + llvm::Value *eltPtr = GetElementPtrInst(aptr, index0, indexElt, name); + + if (lret == NULL) { + // This is kind of a hack: use the type from the GEP to + // figure out the return type and the first time through, + // create an undef value of that type here + const llvm::PointerType *elementPtrType = + llvm::dyn_cast(eltPtr->getType()); + const llvm::Type *elementType = elementPtrType->getElementType(); + lret = llvm::UndefValue::get(LLVMPointerVectorType(elementType)); + } + + // And insert the result of the GEP into the return value + lret = InsertInst(lret, eltPtr, i, "elt_ptr_store"); + } + 
return lret; + } +} + + +llvm::Value * +FunctionEmitContext::GetElementPtrInst(llvm::Value *basePtr, int v0, int v1, + const char *name) { + return GetElementPtrInst(basePtr, LLVMInt32(v0), LLVMInt32(v1), name); +} + + +llvm::Value * +FunctionEmitContext::LoadInst(llvm::Value *lvalue, const Type *type, + const char *name) { + if (lvalue == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + if (llvm::isa(lvalue->getType())) { + // If the lvalue is a straight up regular pointer, then just issue + // a regular load + llvm::Instruction *inst = new llvm::LoadInst(lvalue, name ? name : "load", bblock); + AddDebugPos(inst); + return inst; + } + else { + // Otherwise we should have a varying lvalue and it's time for a + // gather. The "type" parameter only has to be non-NULL for the + // gather path here (we can't reliably figure out all of the type + // information we need from the LLVM::Type, so have to carry the + // ispc type in through this path.. + assert(type != NULL); + assert(llvm::isa(lvalue->getType())); + return gather(lvalue, type, name); + } +} + + +llvm::Value * +FunctionEmitContext::gather(llvm::Value *lvalue, const Type *type, + const char *name) { + // We should have a varying lvalue if we get here... + assert(llvm::dyn_cast(lvalue->getType())); + + const llvm::Type *retType = type->LLVMType(g->ctx); + + const StructType *st = dynamic_cast(type); + if (st) { + // If we're gathering structures, do an element-wise gather + // recursively. 
+ llvm::Value *retValue = llvm::UndefValue::get(retType); + for (int i = 0; i < st->NumElements(); ++i) { + llvm::Value *eltPtrs = GetElementPtrInst(lvalue, 0, i); + // This in turn will be another gather + llvm::Value *eltValues = LoadInst(eltPtrs, st->GetMemberType(i), + name); + retValue = InsertInst(retValue, eltValues, i, "set_value"); + } + return retValue; + } + + const VectorType *vt = dynamic_cast(type); + if (vt) { + // Similarly, if it's a vector type, do a gather for each of the + // vector elements + llvm::Value *retValue = llvm::UndefValue::get(retType); + // FIXME: yuck. Change lvalues to be pointers to arrays so that + // the GEP stuff in the loop below ends up computing pointers based + // on elements in the vectors rather than incorrectly advancing to + // the next vector... + const llvm::Type *eltType = + vt->GetBaseType()->GetAsUniformType()->LLVMType(g->ctx); + lvalue = BitCastInst(lvalue, llvm::PointerType::get(llvm::ArrayType::get(eltType, 0), 0)); + + for (int i = 0; i < vt->GetElementCount(); ++i) { + llvm::Value *eltPtrs = GetElementPtrInst(lvalue, 0, i); + llvm::Value *eltValues = LoadInst(eltPtrs, vt->GetBaseType(), name); + retValue = InsertInst(retValue, eltValues, i, "set_value"); + } + return retValue; + } + + const ArrayType *at = dynamic_cast(type); + if (at) { + // Arrays are also handled recursively and element-wise + llvm::Value *retValue = llvm::UndefValue::get(retType); + for (int i = 0; i < at->GetElementCount(); ++i) { + llvm::Value *eltPtrs = GetElementPtrInst(lvalue, 0, i); + llvm::Value *eltValues = LoadInst(eltPtrs, at->GetElementType(), name); + retValue = InsertInst(retValue, eltValues, i, "set_value"); + } + return retValue; + } + + // Otherwise we should just have a basic scalar type and we can go and + // do the actual gather + AddInstrumentationPoint("gather"); + + llvm::Value *mask = GetMask(); + llvm::Function *gather = NULL; + // Figure out which gather function to call based on the size of + // the elements; 
will need to generalize this for 8 and 16-bit + // types. + if (retType == LLVMTypes::DoubleVectorType || + retType == LLVMTypes::Int64VectorType) + gather = m->module->getFunction("__pseudo_gather_64"); + else { + assert(retType == LLVMTypes::FloatVectorType || + retType == LLVMTypes::Int32VectorType); + gather = m->module->getFunction("__pseudo_gather_32"); + } + assert(gather); + + llvm::Value *voidlvalue = BitCastInst(lvalue, LLVMTypes::VoidPointerType); + llvm::Instruction *call = CallInst(gather, voidlvalue, mask, name); + // Add metadata about the source file location so that the + // optimization passes can print useful performance warnings if we + // can't optimize out this gather + addGSMetadata(call, currentPos); + + llvm::Value *val = BitCastInst(call, retType, "gather_bitcast"); + + return val; +} + + +/** Add metadata to the given instruction to encode the current source file + position. This data is used in the lGetSourcePosFromMetadata() + function in opt.cpp. +*/ +void +FunctionEmitContext::addGSMetadata(llvm::Instruction *inst, SourcePos pos) { + llvm::Value *str = llvm::MDString::get(*g->ctx, pos.name); +#ifdef LLVM_2_8 + llvm::MDNode *md = llvm::MDNode::get(*g->ctx, &str, 1); +#else + llvm::MDNode *md = llvm::MDNode::get(*g->ctx, str); +#endif + inst->setMetadata("filename", md); + + llvm::Value *line = LLVMInt32(pos.first_line); +#ifdef LLVM_2_8 + md = llvm::MDNode::get(*g->ctx, &first_line, 1); +#else + md = llvm::MDNode::get(*g->ctx, line); +#endif + inst->setMetadata("line", md); + + llvm::Value *column = LLVMInt32(pos.first_column); +#ifdef LLVM_2_8 + md = llvm::MDNode::get(*g->ctx, &first_column, 1); +#else + md = llvm::MDNode::get(*g->ctx, column); +#endif + inst->setMetadata("column", md); +} + + +llvm::Value * +FunctionEmitContext::AllocaInst(const llvm::Type *llvmType, const char *name, + int align, bool atEntryBlock) { + llvm::AllocaInst *inst = NULL; + if (atEntryBlock) { + // We usually insert it right before the jump instruction at 
the + // end of allocaBlock + llvm::Instruction *retInst = allocaBlock->getTerminator(); + assert(retInst); + inst = new llvm::AllocaInst(llvmType, name ? name : "", retInst); + } + else + // Unless the caller overrode the default and wants it in the + // current basic block + inst = new llvm::AllocaInst(llvmType, name ? name : "", bblock); + + if (align != 0) + inst->setAlignment(align); + // Don't add debugging info to alloca instructions + return inst; +} + + +/** Code to store the given varying value to the given location, only + storing the elements that correspond to active program instances as + given by the provided storeMask value. Note that the lvalue is only a + single pointer, not a varying lvalue of one pointer per program + instance (that case is handled by scatters). + */ +void +FunctionEmitContext::maskedStore(llvm::Value *rvalue, llvm::Value *lvalue, + const Type *rvalueType, + llvm::Value *storeMask) { + if (rvalue == NULL || lvalue == NULL) { + assert(m->errorCount > 0); + return; + } + + assert(llvm::isa(lvalue->getType())); + + const StructType *structType = dynamic_cast(rvalueType); + if (structType != NULL) { + // Assigning a structure + for (int i = 0; i < structType->NumElements(); ++i) { + llvm::Value *eltValue = ExtractInst(rvalue, i, "rvalue_member"); + llvm::Value *eltLValue = GetElementPtrInst(lvalue, 0, i, + "struct_lvalue_ptr"); + StoreInst(eltValue, eltLValue, storeMask, + structType->GetMemberType(i)); + } + return; + } + + const SequentialType *sequentialType = + dynamic_cast(rvalueType); + if (sequentialType != NULL) { + // Assigning arrays and vectors. 
Handle each element individually + // with what turns into a recursive call to makedStore() + for (int i = 0; i < sequentialType->GetElementCount(); ++i) { + llvm::Value *eltLValue = GetElementPtrInst(lvalue, 0, i, "lval_i_ptr"); + llvm::Value *eltValue = ExtractInst(rvalue, i, "array_i_val"); + StoreInst(eltValue, eltLValue, storeMask, + sequentialType->GetElementType()); + } + return; + } + + // We must have a regular atomic type at this point + assert(dynamic_cast(rvalueType) != NULL); + rvalueType = rvalueType->GetAsNonConstType(); + + llvm::Function *maskedStoreFunc = NULL; + // Figure out if we need a 32-bit or 64-bit masked store. This + // will need to be generalized when/if 8 and 16-bit data types are + // added. + if (rvalueType == AtomicType::VaryingDouble || + rvalueType == AtomicType::VaryingInt64 || + rvalueType == AtomicType::VaryingUInt64) { + maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_64"); + lvalue = BitCastInst(lvalue, LLVMTypes::Int64VectorPointerType, + "lvalue_to_int64vecptr"); + rvalue = BitCastInst(rvalue, LLVMTypes::Int64VectorType, + "rvalue_to_int64"); + } + else { + assert(rvalueType == AtomicType::VaryingFloat || + rvalueType == AtomicType::VaryingBool || + rvalueType == AtomicType::VaryingInt32 || + rvalueType == AtomicType::VaryingUInt32); + + maskedStoreFunc = m->module->getFunction("__pseudo_masked_store_32"); + lvalue = BitCastInst(lvalue, LLVMTypes::Int32VectorPointerType, + "lvalue_to_int32vecptr"); + if (rvalueType == AtomicType::VaryingFloat) + rvalue = BitCastInst(rvalue, LLVMTypes::Int32VectorType, + "rvalue_to_int32"); + } + + std::vector args; + args.push_back(lvalue); + args.push_back(rvalue); + args.push_back(storeMask); + CallInst(maskedStoreFunc, args); +} + + + +/** Scatter the given varying value to the locations given by the varying + lvalue (which should be an array of pointers with size equal to the + target's vector width. 
We want to store each rvalue element at the + corresponding pointer's location, *if* the mask for the corresponding + program instance are on. If they're off, don't do anything. +*/ +void +FunctionEmitContext::scatter(llvm::Value *rvalue, llvm::Value *lvalue, + llvm::Value *storeMask, const Type *rvalueType) { + assert(rvalueType->IsVaryingType()); + assert(llvm::isa(lvalue->getType())); + + const StructType *structType = dynamic_cast(rvalueType); + if (structType) { + // Scatter the struct elements individually + for (int i = 0; i < structType->NumElements(); ++i) { + llvm::Value *lv = GetElementPtrInst(lvalue, 0, i); + llvm::Value *rv = ExtractInst(rvalue, i); + scatter(rv, lv, storeMask, structType->GetMemberType(i)); + } + return; + } + + const VectorType *vt = dynamic_cast(rvalueType); + if (vt) { + // FIXME: yuck. Change lvalues to be pointers to arrays so that + // the GEP stuff in the loop below ends up computing pointers based + // on elements in the vectors rather than incorrectly advancing to + // the next vector... + const llvm::Type *eltType = vt->GetBaseType()->GetAsUniformType()->LLVMType(g->ctx); + lvalue = BitCastInst(lvalue, llvm::PointerType::get(llvm::ArrayType::get(eltType, 0), 0)); + + for (int i = 0; i < vt->GetElementCount(); ++i) { + llvm::Value *lv = GetElementPtrInst(lvalue, 0, i); + llvm::Value *rv = ExtractInst(rvalue, i); + scatter(rv, lv, storeMask, vt->GetElementType()); + } + return; + } + + // I think this should be impossible + assert(dynamic_cast(rvalueType) == NULL); + + // And everything should be atomic from here on out... 
+ assert(dynamic_cast(rvalueType) != NULL); + + llvm::Function *func = NULL; + const llvm::Type *type = rvalue->getType(); + if (type == LLVMTypes::DoubleVectorType || + type == LLVMTypes::Int64VectorType) { + func = m->module->getFunction("__pseudo_scatter_64"); + rvalue = BitCastInst(rvalue, LLVMTypes::Int64VectorType, "rvalue2int"); + } + else { + // FIXME: if this hits, presumably it's due to needing int8 and/or + // int16 versions of scatter... + assert(type == LLVMTypes::FloatVectorType || + type == LLVMTypes::Int32VectorType); + func = m->module->getFunction("__pseudo_scatter_32"); + rvalue = BitCastInst(rvalue, LLVMTypes::Int32VectorType, "rvalue2int"); + } + assert(func != NULL); + + AddInstrumentationPoint("scatter"); + + llvm::Value *voidlvalue = BitCastInst(lvalue, LLVMTypes::VoidPointerType); + std::vector args; + args.push_back(voidlvalue); + args.push_back(rvalue); + args.push_back(storeMask); + llvm::Instruction *inst = CallInst(func, args); + addGSMetadata(inst, currentPos); +} + + +void +FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue, + const char *name) { + if (rvalue == NULL || lvalue == NULL) { + // may happen due to error elsewhere + assert(m->errorCount > 0); + return; + } + + llvm::Instruction *inst = new llvm::StoreInst(rvalue, lvalue, name, bblock); + AddDebugPos(inst); +} + + +void +FunctionEmitContext::StoreInst(llvm::Value *rvalue, llvm::Value *lvalue, + llvm::Value *storeMask, const Type *rvalueType, + const char *name) { + if (rvalue == NULL || lvalue == NULL) { + // may happen due to error elsewhere + assert(m->errorCount > 0); + return; + } + + // Figure out what kind of store we're doing here + if (rvalueType->IsUniformType()) { + // The easy case; a regular store + llvm::Instruction *si = new llvm::StoreInst(rvalue, lvalue, name, bblock); + AddDebugPos(si); + } + else if (llvm::isa(lvalue->getType())) + // We have a varying lvalue (an array of pointers), so it's time to + // scatter + scatter(rvalue, 
lvalue, storeMask, rvalueType); + else if (storeMask == LLVMMaskAllOn) { + // Otherwise it is a masked store unless we can determine that the + // mask is all on... + llvm::Instruction *si = + new llvm::StoreInst(rvalue, lvalue, name, bblock); + AddDebugPos(si); + } + else + maskedStore(rvalue, lvalue, rvalueType, storeMask); +} + + +void +FunctionEmitContext::BranchInst(llvm::BasicBlock *dest) { + llvm::Instruction *b = llvm::BranchInst::Create(dest, bblock); + AddDebugPos(b); +} + + +void +FunctionEmitContext::BranchInst(llvm::BasicBlock *trueBlock, + llvm::BasicBlock *falseBlock, + llvm::Value *test) { + if (test == NULL) { + assert(m->errorCount > 0); + return; + } + + llvm::Instruction *b = + llvm::BranchInst::Create(trueBlock, falseBlock, test, bblock); + AddDebugPos(b); +} + + +llvm::Value * +FunctionEmitContext::ExtractInst(llvm::Value *v, int elt, const char *name) { + if (v == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + llvm::Instruction *ei = NULL; + if (llvm::isa(v->getType())) + ei = llvm::ExtractElementInst::Create(v, LLVMInt32(elt), + name ? name : "extract", bblock); + else + ei = llvm::ExtractValueInst::Create(v, elt, name ? name : "extract", + bblock); + AddDebugPos(ei); + return ei; +} + + +llvm::Value * +FunctionEmitContext::InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt, + const char *name) { + if (v == NULL || eltVal == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + llvm::Instruction *ii = NULL; + if (llvm::isa(v->getType())) + ii = llvm::InsertElementInst::Create(v, eltVal, LLVMInt32(elt), + name ? name : "insert", bblock); + else + ii = llvm::InsertValueInst::Create(v, eltVal, elt, + name ? name : "insert", bblock); + AddDebugPos(ii); + return ii; +} + + +llvm::PHINode * +FunctionEmitContext::PhiNode(const llvm::Type *type, int count, + const char *name) { + llvm::PHINode *pn = llvm::PHINode::Create(type, +#if !defined(LLVM_2_8) && !defined(LLVM_2_9) + count, +#endif // !LLVM_2_8 && !LLVM_2_9 + name ? 
name : "phi", bblock); + AddDebugPos(pn); + return pn; +} + + +llvm::Instruction * +FunctionEmitContext::SelectInst(llvm::Value *test, llvm::Value *val0, + llvm::Value *val1, const char *name) { + if (test == NULL || val0 == NULL || val1 == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + llvm::Instruction *inst = + llvm::SelectInst::Create(test, val0, val1, name ? name : "select", + bblock); + AddDebugPos(inst); + return inst; +} + + +llvm::Instruction * +FunctionEmitContext::CallInst(llvm::Function *func, + const std::vector &args, + const char *name) { + if (func == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + llvm::Instruction *ci = + llvm::CallInst::Create(func, args.begin(), args.end(), + name ? name : "", bblock); + AddDebugPos(ci); + return ci; +} + + +llvm::Instruction * +FunctionEmitContext::CallInst(llvm::Function *func, llvm::Value *arg, + const char *name) { + if (func == NULL || arg == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + llvm::Value *args[] = { arg }; + llvm::Instruction *ci = + llvm::CallInst::Create(func, &args[0], &args[1], name ? name : "", + bblock); + AddDebugPos(ci); + return ci; +} + + +llvm::Instruction * +FunctionEmitContext::CallInst(llvm::Function *func, llvm::Value *arg0, + llvm::Value *arg1, const char *name) { + if (func == NULL || arg0 == NULL || arg1 == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + llvm::Value *args[] = { arg0, arg1 }; + llvm::Instruction *ci = + llvm::CallInst::Create(func, &args[0], &args[2], name ? 
name : "", + bblock); + AddDebugPos(ci); + return ci; +} + + +llvm::Instruction * +FunctionEmitContext::ReturnInst() { + if (launchedTasks) { + // Automatically add a sync call at the end of any function that + // launched tasks + SourcePos noPos; + noPos.name = "__auto_sync"; + ExprStmt *es = new ExprStmt(new SyncExpr(noPos), noPos); + es->EmitCode(this); + delete es; + } + + llvm::Instruction *rinst = NULL; + if (returnValuePtr != NULL) { + // We have value(s) to return; load them from their storage + // location + llvm::Value *retVal = LoadInst(returnValuePtr, returnType, + "return_value"); + rinst = llvm::ReturnInst::Create(*g->ctx, retVal, bblock); + } + else { + assert(returnType == AtomicType::Void); + rinst = llvm::ReturnInst::Create(*g->ctx, bblock); + } + + AddDebugPos(rinst); + bblock = NULL; + return rinst; +} + + +llvm::Instruction * +FunctionEmitContext::LaunchInst(llvm::Function *callee, + std::vector &argVals) { + if (callee == NULL) { + assert(m->errorCount > 0); + return NULL; + } + + launchedTasks = true; + + const llvm::Type *argType = callee->arg_begin()->getType(); + assert(llvm::PointerType::classof(argType)); + const llvm::PointerType *pt = static_cast(argType); + assert(llvm::StructType::classof(pt->getElementType())); + const llvm::StructType *argStructType = + static_cast(pt->getElementType()); + assert(argStructType->getNumElements() == argVals.size() + 1); + + // Use alloca for space for the task args. KEY DETAIL: pass false + // to the call of FunctionEmitContext::AllocaInst so that the alloca + // doesn't happen just once at the top of the function, but happens + // each time the enclosing basic block executes. 
+ int align = 4 * RoundUpPow2(g->target.nativeVectorWidth); + llvm::Value *argmem = AllocaInst(argStructType, "argmem", align, false); + llvm::Value *voidmem = BitCastInst(argmem, LLVMTypes::VoidPointerType); + + // Copy the values of the parameters into the appropriate place in + // the argument block + for (unsigned int i = 0; i < argVals.size(); ++i) { + llvm::Value *ptr = GetElementPtrInst(argmem, 0, i, "funarg"); + // don't need to do masked store here, I think + StoreInst(argVals[i], ptr); + } + + // copy in the mask + llvm::Value *mask = GetMask(); + llvm::Value *ptr = GetElementPtrInst(argmem, 0, argVals.size(), + "funarg_mask"); + StoreInst(mask, ptr); + + // And emit the call to the user-supplied task launch function, passing + // a pointer to the task function being called and a pointer to the + // argument block we just filled in + llvm::Value *fptr = BitCastInst(callee, LLVMTypes::VoidPointerType); + llvm::Function *flaunch = m->module->getFunction("ISPCLaunch"); + assert(flaunch != NULL); + return CallInst(flaunch, fptr, voidmem, ""); +} diff --git a/ctx.h b/ctx.h new file mode 100644 index 00000000..437c5e3f --- /dev/null +++ b/ctx.h @@ -0,0 +1,507 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/** @file ctx.h + @brief Declaration of the FunctionEmitContext class +*/ + +#ifndef ISPC_CTX_H +#define ISPC_CTX_H 1 + +#include "ispc.h" +#include +#include +#ifndef LLVM_2_8 +#include +#endif +#include + +struct CFInfo; + +/** FunctionEmitContext is one of the key classes in ispc; it is used to + help with emitting the intermediate representation of a function during + compilation. It carries information the current program context during + IR emission (e.g. the basic block into which instructions should be + added; or, the current source file and line number, so debugging + symbols can be correctly generated). This class also provides a number + of helper routines that are useful for code that emits IR. + */ +class FunctionEmitContext { +public: + /** Create a new FunctionEmitContext. 
+ @param returnType The return type of the function + @param function LLVM function in the current module that corresponds + to the function + @param funSym Symbol that corresponds to the function + @param firstStmtPos Source file position of the first statement in the + function + */ + FunctionEmitContext(const Type *returnType, llvm::Function *function, Symbol *funSym, + SourcePos firstStmtPos); + ~FunctionEmitContext(); + + /** @name Current basic block management + @{ + */ + /** Returns the current basic block pointer */ + llvm::BasicBlock *GetCurrentBasicBlock(); + + /** Set the given llvm::BasicBlock to be the basic block to emit + forthcoming instructions into. */ + void SetCurrentBasicBlock(llvm::BasicBlock *bblock); + + /** @name Mask management + @{ + */ + /** Returns the current mask value */ + llvm::Value *GetMask(); + + /** Provides the value of the mask at function entry */ + void SetEntryMask(llvm::Value *val); + + /** Sets the mask to a new value */ + void SetMask(llvm::Value *val); + + /** Sets the mask to (oldMask & val) */ + void MaskAnd(llvm::Value *oldMask, llvm::Value *val); + + /** Sets the mask to (oldMask & ~val) */ + void MaskAndNot(llvm::Value *oldMask, llvm::Value *test); + + /** Emits a branch instruction to the basic block btrue if any of the + lanes of current mask are on and bfalse if none are on. */ + void BranchIfMaskAny(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse); + + /** Emits a branch instruction to the basic block btrue if all of the + lanes of current mask are on and bfalse if none are on. */ + void BranchIfMaskAll(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse); + + /** Emits a branch instruction to the basic block btrue if none of the + lanes of current mask are on and bfalse if none are on. 
 */ + void BranchIfMaskNone(llvm::BasicBlock *btrue, llvm::BasicBlock *bfalse); + /** @} */ + + /** @name Control flow management + @{ + */ + /** Notifies the FunctionEmitContext that we're starting emission of an + 'if' statement with a uniform test. The value of the mask going + into the 'if' statement is provided in the oldMask parameter. */ + void StartUniformIf(llvm::Value *oldMask); + + /** Notifies the FunctionEmitContext that we're starting emission of an + 'if' statement with a varying test. The value of the mask going + into the 'if' statement is provided in the oldMask parameter. */ + void StartVaryingIf(llvm::Value *oldMask); + + /** Notifies the FunctionEmitContext that we're done emitting the IR + for an 'if' statement. */ + void EndIf(); + + /** Notifies the FunctionEmitContext that we're starting to emit IR + for a loop. Basic blocks are provided for where 'break' and + 'continue' statements should jump to (if all running lanes want to + break or continue), uniformControlFlow indicates whether the loop + condition is 'uniform', and oldMask provides the current mask going + into the loop. */ + void StartLoop(llvm::BasicBlock *breakTarget, llvm::BasicBlock *continueTarget, + bool uniformControlFlow, llvm::Value *oldMask); + + /** Informs FunctionEmitContext of the value of the mask at the start + of a loop body. */ + void SetLoopMask(llvm::Value *mask); + + /** Informs FunctionEmitContext that code generation for a loop is + finished. */ + void EndLoop(); + + /** Emit code for a 'break' statement in a loop. If doCoherenceCheck + is true, then if we're in a 'varying' loop, code will be emitted to + see if all of the lanes want to break, in which case a jump to the + break target will be taken. (For 'uniform' loops, the jump is + always done). */ + void Break(bool doCoherenceCheck); + + /** Emit code for a 'continue' statement in a loop. 
If + doCoherenceCheck is true, then if we're in a 'varying' loop, code + will be emitted to see if all of the lanes want to continue, in + which case a jump to the continue target will be taken. (For + 'uniform' loops, the jump is always done). */ + void Continue(bool doCoherenceCheck); + + /** This method is called by code emitting IR for a loop at the end of + the loop body; it restores the lanes of the mask that executed a + 'continue' statement when going through the loop body in the + previous iteration. */ + void RestoreContinuedLanes(); + + /** Returns the current number of nested levels of 'varying' control + flow */ + int VaryingCFDepth() const; + + /** Called to generate code for 'return' statement; value is the + expression in the return statement (if non-NULL), and + doCoherenceCheck indicates whether instructions should be generated + to see if all of the currently-running lanes have returned (if + we're under varying control flow). */ + void CurrentLanesReturned(Expr *value, bool doCoherenceCheck); + /** @} */ + + /** @name Small helper/utility routines + @{ + */ + /** Given a boolean mask value of type LLVMTypes::MaskType, return an + i1 value that indicates if any of the mask lanes are on. */ + llvm::Value *Any(llvm::Value *mask); + + /** Given a boolean mask value of type LLVMTypes::MaskType, return an + i1 value that indicates if all of the mask lanes are on. */ + llvm::Value *All(llvm::Value *mask); + + /** Given a boolean mask value of type LLVMTypes::MaskType, return an + i32 value wherein the i'th bit is on if and only if the i'th lane + of the mask is on. */ + llvm::Value *LaneMask(llvm::Value *mask); + + /** Given two masks of type LLVMTypes::MaskType, return an i1 value + that indicates whether the two masks are equal. */ + llvm::Value *MasksAllEqual(llvm::Value *mask1, llvm::Value *mask2); + + /** Given a string, create an anonymous global variable to hold its + value and return the pointer to the string. 
*/ + llvm::Value *GetStringPtr(const std::string &str); + + /** Create a new basic block with given name */ + llvm::BasicBlock *CreateBasicBlock(const char *name); + + /** Given a vector with element type i1, return a vector of type + LLVMTypes::BoolVectorType. This method handles the conversion for + the targets where the bool vector element type is, for example, + i32. */ + llvm::Value *I1VecToBoolVec(llvm::Value *b); + + /** Emit code to call the user-supplied ISPCMalloc function to + allocate space for an object of thee given type. Returns the + pointer value returned by the ISPCMalloc call. */ + llvm::Value *EmitMalloc(const llvm::Type *ty); + + /** Emit code to call the user-supplied ISPCFree function, passing it + the given pointer to storage previously allocated by an + EmitMalloc() call. */ + void EmitFree(llvm::Value *ptr); + + /** If the user has asked to compile the program with instrumentation, + this inserts a callback to the user-supplied instrumentation + function at the current point in the code. */ + void AddInstrumentationPoint(const char *note); + /** @} */ + + /** @name Debugging support + @{ + */ + /** Set the current source file position; subsequent emitted + instructions will have this position associated with them if + debugging information is being generated. */ + void SetDebugPos(SourcePos pos); + + SourcePos GetDebugPos() const; + + /** Adds debugging metadata to the given instruction. If pos == NULL, + use FunctionEmitContext::currentPos as the source file position for + the instruction. Similarly, if a DIScope is provided, it's used + and otherwise the scope is found from a GetDIScope() call. This + takes a llvm::Value for the instruction rather than an + llvm::Instruction for convenience; in calling code we often have + Instructions stored using Value pointers; the code here returns + silently if it's not actually given an instruction. 
*/ + void AddDebugPos(llvm::Value *instruction, const SourcePos *pos = NULL, + llvm::DIScope *scope = NULL); + + /** Inform the debugging information generation code that a new scope + is starting in the source program. */ + void StartScope(); + + /** Inform the debugging information generation code that the current + scope is ending in the source program. */ + void EndScope(); + + /** Returns the llvm::DIScope corresponding to the current program + scope. */ + llvm::DIScope GetDIScope() const; + + /** Emits debugging information for the variable represented by + sym. */ + void EmitVariableDebugInfo(Symbol *sym); + + /** Emits debugging information for the function parameter represented + by sym. */ + void EmitFunctionParameterDebugInfo(Symbol *sym); + /** @} */ + + /** @name IR instruction emission + @brief These methods generally closely correspond to LLVM IR + instructions. See the LLVM assembly language reference manual + (http://llvm.org/docs/LangRef.html) and the LLVM doxygen documentaion + (http://llvm.org/doxygen) for more information. Here we will only + document significant generalizations to the functionality of the + corresponding basic LLVM instructions. + + Beyond actually emitting the instruction, the implementations of + these methods in FunctionEmitContext also handle adding debugging + metadata if debugging symbols are enabled, adding the instructions + to the current basic block, and handling generalizations like + 'varying' lvalues, arithmetic operations with VectorType operands, + etc. + @{ + */ + /** Emit the binary operator given by the inst parameter. If + llvm::Values corresponding to VectorTypes are given as operands, + this also handles applying the given operation to the vector + elements. */ + llvm::Value *BinaryOperator(llvm::Instruction::BinaryOps inst, + llvm::Value *v0, llvm::Value *v1, + const char *name = NULL); + + /** Emit the "not" operator. Like BinaryOperator(), this also handles + a VectorType-based operand. 
*/ + llvm::Value *NotOperator(llvm::Value *v, const char *name = NULL); + + /** Emit a comparison instruction. If the operands are VectorTypes, + then a value for the corresponding boolean VectorType is + returned. */ + llvm::Value *CmpInst(llvm::Instruction::OtherOps inst, + llvm::CmpInst::Predicate pred, + llvm::Value *v0, llvm::Value *v1, const char *name = NULL); + + llvm::Value *BitCastInst(llvm::Value *value, const llvm::Type *type, + const char *name = NULL); + llvm::Instruction *PtrToIntInst(llvm::Value *value, const llvm::Type *type, + const char *name = NULL); + llvm::Instruction *IntToPtrInst(llvm::Value *value, const llvm::Type *type, + const char *name = NULL); + llvm::Instruction *TruncInst(llvm::Value *value, const llvm::Type *type, + const char *name = NULL); + llvm::Instruction *CastInst(llvm::Instruction::CastOps op, llvm::Value *value, + const llvm::Type *type, const char *name = NULL); + llvm::Instruction *FPCastInst(llvm::Value *value, const llvm::Type *type, + const char *name = NULL); + llvm::Instruction *SExtInst(llvm::Value *value, const llvm::Type *type, + const char *name = NULL); + llvm::Instruction *ZExtInst(llvm::Value *value, const llvm::Type *type, + const char *name = NULL); + + /** This GEP method is a generalization of the standard one in LLVM; it + supports both uniform and varying basePtr values (an array of + pointers) as well as uniform and varying index values (arrays of + indices). */ + llvm::Value *GetElementPtrInst(llvm::Value *basePtr, llvm::Value *index0, + llvm::Value *index1, const char *name = NULL); + + /** This is a convenience method to generate a GEP instruction with + indices with values with known constant values as the ispc program + is being compiled. */ + llvm::Value *GetElementPtrInst(llvm::Value *basePtr, int v0, int v1, + const char *name = NULL); + + /** Load from the memory location(s) given by lvalue. 
The lvalue may + be varying, in which case this corresponds to a gather from the + multiple memory locations given by the array of pointer values + given by the lvalue. If the lvalue is not varying, then the type + parameter may be NULL. */ + llvm::Value *LoadInst(llvm::Value *lvalue, const Type *type, + const char *name = NULL); + + /** Emits an alloca instruction to allocate stack storage for the given + type. If a non-zero alignment is specified, the object is also + allocated at the given alignment. By default, the alloca + instruction is added at the start of the function in the entry + basic block; if it should be added to the current basic block, then + the atEntryBlock parameter should be false. */ + llvm::Value *AllocaInst(const llvm::Type *llvmType, const char *name = NULL, + int align = 0, bool atEntryBlock = true); + + /** Standard store instruction; for this variant, the lvalue must be a + single pointer, not a varying lvalue. */ + void StoreInst(llvm::Value *rvalue, llvm::Value *lvalue, + const char *name = NULL); + + /** In this variant of StoreInst(), the lvalue may be varying. If so, + this corresponds to a scatter. Whether the lvalue is uniform of + varying, the given storeMask is used to mask the stores so that + they only execute for the active program instances. */ + void StoreInst(llvm::Value *rvalue, llvm::Value *lvalue, + llvm::Value *storeMask, const Type *rvalueType, + const char *name = NULL); + + void BranchInst(llvm::BasicBlock *block); + void BranchInst(llvm::BasicBlock *trueBlock, llvm::BasicBlock *falseBlock, + llvm::Value *test); + + /** This convenience method maps to an llvm::ExtractElementInst if the + given value is a llvm::VectorType, and to an llvm::ExtractValueInst + otherwise. */ + llvm::Value *ExtractInst(llvm::Value *v, int elt, const char *name = NULL); + + /** This convenience method maps to an llvm::InsertElementInst if the + given value is a llvm::VectorType, and to an llvm::InsertValueInst + otherwise. 
*/ + llvm::Value *InsertInst(llvm::Value *v, llvm::Value *eltVal, int elt, + const char *name = NULL); + + llvm::PHINode *PhiNode(const llvm::Type *type, int count, const char *name = NULL); + llvm::Instruction *SelectInst(llvm::Value *test, llvm::Value *val0, + llvm::Value *val1, const char *name = NULL); + + llvm::Instruction *CallInst(llvm::Function *func, + const std::vector &args, + const char *name = NULL); + /** This is a convenience method that issues a call instruction to a + function that takes just a single argument. */ + llvm::Instruction *CallInst(llvm::Function *func, llvm::Value *arg, + const char *name = NULL); + + /** This is a convenience method that issues a call instruction to a + function that takes two arguments. */ + llvm::Instruction *CallInst(llvm::Function *func, llvm::Value *arg0, + llvm::Value *arg1, const char *name = NULL); + + /** Launch an asynchronous task to run the given function, passing it + he given argument values. */ + llvm::Instruction *LaunchInst(llvm::Function *callee, + std::vector &argVals); + + llvm::Instruction *ReturnInst(); + /** @} */ + +private: + /** The basic block into which we add any alloca instructions that need + to go at the very start of the function. */ + llvm::BasicBlock *allocaBlock; + + /** The current basic block into which we're emitting new + instructions */ + llvm::BasicBlock *bblock; + + /** Pointer to stack-allocated memory that stores the current value of + the program mask. */ + llvm::Value *maskPtr; + + /** Current source file position; if debugging information is being + generated, this position is used to set file/line information for + instructions. */ + SourcePos currentPos; + + /** Source file position where the function definition started. Used + for error messages and debugging symbols. */ + SourcePos funcStartPos; + + /** Type of result that the current function returns. */ + const Type *returnType; + + /** Value of the program mask when the function starts execution. 
 */ + llvm::Value *entryMask; + + /** If currently in a loop body, the value of the mask at the start of + the loop. */ + llvm::Value *loopMask; + + /** If currently in a loop body, this is a pointer to memory to store a + mask value that represents which of the lanes have executed a + 'break' statement. If we're not in a loop body, this should be + NULL. */ + llvm::Value *breakLanesPtr; + + /** Similar to breakLanesPtr, if we're inside a loop, this is a pointer + to memory to record which of the program instances have executed a + 'continue' statement. */ + llvm::Value *continueLanesPtr; + + /** If we're inside a loop, this gives the basic block immediately + after the current loop, which we will jump to if all of the lanes + have executed a break statement or are otherwise done with the + loop. */ + llvm::BasicBlock *breakTarget; + + /** If we're inside a loop, this gives the block to jump to if all of + the running lanes have executed a 'continue' statement. */ + llvm::BasicBlock *continueTarget; + + /** A pointer to memory that records which of the program instances + have executed a 'return' statement (and are thus really truly done + running any more instructions in this function). */ + llvm::Value *returnedLanesPtr; + + /** A pointer to memory to store the return value for the function. + Since different program instances may execute 'return' statements + at different times, we need to accumulate the return values as they + come in until we return for real. */ + llvm::Value *returnValuePtr; + + /** The CFInfo structure records information about a nesting level of + control flow. This vector lets us see what control flow is going + around outside the current position in the function being + emitted. */ + std::vector controlFlowInfo; + + /** DIFile object corresponding to the source file where the current + function was defined (used for debugging info). */ + llvm::DIFile diFile; + + /** DISubprogram corresponding to this function (used for debugging + info). 
*/ + llvm::DISubprogram diFunction; + + /** These correspond to the current set of nested scopes in the + function. */ + std::vector debugScopes; + + /** True if a 'launch' statement has been encountered in the function. */ + bool launchedTasks; + + llvm::Value *pointerVectorToVoidPointers(llvm::Value *value); + static void addGSMetadata(llvm::Instruction *inst, SourcePos pos); + bool ifsInLoopAllUniform() const; + void jumpIfAllLoopLanesAreDone(llvm::BasicBlock *target); + llvm::Value *emitGatherCallback(llvm::Value *lvalue, llvm::Value *retPtr); + + void restoreMaskGivenReturns(llvm::Value *oldMask); + + void scatter(llvm::Value *rvalue, llvm::Value *lvalue, + llvm::Value *maskPtr, const Type *rvalueType); + llvm::Value *gather(llvm::Value *lvalue, const Type *type, + const char *name); + void maskedStore(llvm::Value *rvalue, llvm::Value *lvalue, + const Type *rvalueType, llvm::Value *maskPtr); +}; + +#endif // ISPC_CTX_H diff --git a/decl.cpp b/decl.cpp new file mode 100644 index 00000000..849347f4 --- /dev/null +++ b/decl.cpp @@ -0,0 +1,348 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/** @file decl.cpp + @brief Implementations of classes related to turning declarations into + symbols and types. +*/ + +#include "decl.h" +#include "util.h" +#include "sym.h" +#include "type.h" +#include "expr.h" +#include + +/////////////////////////////////////////////////////////////////////////// +// DeclSpecs + +DeclSpecs::DeclSpecs(const Type *t, StorageClass sc, int tq) { + baseType = t; + storageClass = sc; + typeQualifier = tq; + soaWidth = 0; + vectorSize = 0; +} + + +void +DeclSpecs::Print() const { + if (storageClass == SC_EXTERN) printf("extern "); + if (storageClass == SC_EXTERN_C) printf("extern \"C\" "); + if (storageClass == SC_EXPORT) printf("export "); + if (storageClass == SC_STATIC) printf("static "); + if (storageClass == SC_TYPEDEF) printf("typedef "); + + if (soaWidth > 0) printf("soa<%d> ", soaWidth); + + if (typeQualifier & TYPEQUAL_INLINE) printf("inline "); + if (typeQualifier & TYPEQUAL_CONST) printf("const "); + if (typeQualifier & TYPEQUAL_UNIFORM) printf("uniform "); + if (typeQualifier & TYPEQUAL_VARYING) printf("varying "); + if (typeQualifier & TYPEQUAL_TASK) printf("task "); + if (typeQualifier & TYPEQUAL_REFERENCE) printf("reference "); + if (typeQualifier 
& TYPEQUAL_UNSIGNED) printf("unsigned "); + + printf("%s", baseType->GetString().c_str()); + + if (vectorSize > 0) printf("<%d>", vectorSize); +} + + +/////////////////////////////////////////////////////////////////////////// +// Declarator + +Declarator::Declarator(Symbol *s, SourcePos p) + : pos(p) { + sym = s; + functionArgs = NULL; + isFunction = false; + initExpr = NULL; +} + + +void +Declarator::AddArrayDimension(int size) { + assert(size > 0 || size == -1); // -1 -> unsized + arraySize.push_back(size); +} + + +void +Declarator::InitFromDeclSpecs(DeclSpecs *ds) { + sym->type = GetType(ds); + + if (ds->storageClass == SC_STATIC) + sym->isStatic = true; +} + + +void +Declarator::Print() const { + printf("%s", sym->name.c_str()); + if (initExpr != NULL) { + printf(" = ("); + initExpr->Print(); + printf(")"); + } + pos.Print(); +} + + +static const Type * +lGetType(const Declarator *decl, DeclSpecs *ds, + std::vector::const_iterator arrayIter) { + if (arrayIter == decl->arraySize.end()) { + // If we don't have an array (or have processed all of the array + // dimensions in previous recursive calls), we can go ahead and + // figure out the final non-array type we have here. + const Type *type = ds->baseType; + if (type == NULL) { + Error(decl->pos, "Type not provided in variable declaration for variable \"%s\".", + decl->sym->name.c_str()); + return NULL; + } + + // Account for 'unsigned' and 'const' qualifiers in the type + if ((ds->typeQualifier & TYPEQUAL_UNSIGNED) != 0) { + const Type *unsignedType = type->GetAsUnsignedType(); + if (unsignedType != NULL) + type = unsignedType; + else + Error(decl->pos, "\"unsigned\" qualifier is illegal with \"%s\" type.", + type->GetString().c_str()); + } + if ((ds->typeQualifier & TYPEQUAL_CONST) != 0) + type = type->GetAsConstType(); + + if (ds->vectorSize > 0) { + const AtomicType *atomicType = dynamic_cast(type); + if (atomicType == NULL) { + Error(decl->pos, "Only atomic types (int, float, ...) 
are legal for vector " + "types."); + return NULL; + } + type = new VectorType(atomicType, ds->vectorSize); + } + + // if uniform/varying is specified explicitly, then go with that + if ((ds->typeQualifier & TYPEQUAL_UNIFORM) != 0) + return type->GetAsUniformType(); + else if ((ds->typeQualifier & TYPEQUAL_VARYING) != 0) + return type->GetAsVaryingType(); + else { + // otherwise, structs are uniform by default and everything + // else is varying by default + if (dynamic_cast(type) != NULL) + return type->GetAsUniformType(); + else + return type->GetAsVaryingType(); + } + } + else { + // Peel off one dimension of the array + int arraySize = *arrayIter; + ++arrayIter; + + // Get the type, not including the arraySize dimension peeled off + // above. + const Type *childType = lGetType(decl, ds, arrayIter); + + int soaWidth = ds->soaWidth; + if (soaWidth == 0) + // If there's no "soa" stuff going on, just return a regular + // array with the appropriate size + return new ArrayType(childType, arraySize == -1 ? 0 : arraySize); + else { + // Make sure we actually have an array of structs .. + const StructType *childStructType = + dynamic_cast(childType); + if (childStructType == NULL) { + Error(decl->pos, "Illegal to provide soa<%d> qualifier with non-struct " + "type \"%s\".", soaWidth, childType->GetString().c_str()); + return new ArrayType(childType, arraySize == -1 ? 0 : arraySize); + } + else if ((soaWidth & (soaWidth - 1)) != 0) { + Error(decl->pos, "soa<%d> width illegal. Value must be power of two.", + soaWidth); + return NULL; + } + else if (arraySize != -1 && (arraySize % soaWidth) != 0) { + Error(decl->pos, "soa<%d> width must evenly divide array size %d.", + soaWidth, arraySize); + return NULL; + } + return new SOAArrayType(childStructType, arraySize == -1 ? 
0 : arraySize, + soaWidth); + } + } +} + + +const Type * +Declarator::GetType(DeclSpecs *ds) const { + bool hasUniformQual = ((ds->typeQualifier & TYPEQUAL_UNIFORM) != 0); + bool hasVaryingQual = ((ds->typeQualifier & TYPEQUAL_VARYING) != 0); + bool isTask = ((ds->typeQualifier & TYPEQUAL_TASK) != 0); + bool isReference = ((ds->typeQualifier & TYPEQUAL_REFERENCE) != 0); + + if (hasUniformQual && hasVaryingQual) { + Error(pos, "Can't provide both \"uniform\" and \"varying\" qualifiers."); + return NULL; + } + + if (isFunction) { + std::vector args; + std::vector argNames; + if (functionArgs) { + // Loop over the function arguments and get names and types for + // each one in the args and argNames arrays + for (unsigned int i = 0; i < functionArgs->size(); ++i) { + Declaration *d = (*functionArgs)[i]; + Symbol *sym; + if (d->declarators.size() == 0) { + // function declaration like foo(float), w/o a name for + // the parameter + char buf[32]; + sprintf(buf, "__anon_parameter_%d", i); + sym = new Symbol(buf, pos); + Declarator *declarator = new Declarator(sym, sym->pos); + sym->type = declarator->GetType(ds); + d->declarators.push_back(declarator); + } + else { + assert(d->declarators.size() == 1); + sym = d->declarators[0]->sym; + } + + // Arrays are passed by reference, so convert array + // parameters to be references here. + if (dynamic_cast(sym->type) != NULL) + sym->type = new ReferenceType(sym->type, sym->type->IsConstType()); + + args.push_back(sym->type); + argNames.push_back(sym->name); + } + } + + if (ds->baseType == NULL) { + Warning(pos, "No return type provided in declaration of function \"%s\". 
" + "Treating as \"void\".", sym->name.c_str()); + ds->baseType = AtomicType::Void; + } + + if (isReference) { + Error(pos, "Function return types can't be reference types."); + return NULL; + } + + const Type *returnType = lGetType(this, ds, arraySize.begin()); + if (returnType == NULL) + return NULL; + + bool isExported = (ds->storageClass == SC_EXPORT); + bool isExternC = (ds->storageClass == SC_EXTERN_C); + return new FunctionType(returnType, args, pos, &argNames, isTask, + isExported, isExternC); + } + else { + if (isTask) + Error(pos, "\"task\" qualifier illegal in variable declaration \"%s\".", + sym->name.c_str()); + + const Type *type = lGetType(this, ds, arraySize.begin()); + + if (type != NULL && isReference) { + bool hasConstQual = ((ds->typeQualifier & TYPEQUAL_CONST) != 0); + type = new ReferenceType(type, hasConstQual); + } + + return type; + } +} + +/////////////////////////////////////////////////////////////////////////// +// Declaration + +void +Declaration::AddSymbols(SymbolTable *st) const { + assert(declSpecs->storageClass != SC_TYPEDEF); + + for (unsigned int i = 0; i < declarators.size(); ++i) + if (declarators[i]) + st->AddVariable(declarators[i]->sym); +} + + +void +Declaration::Print() const { + printf("Declaration: specs ["); + declSpecs->Print(); + printf("], declarators ["); + for (unsigned int i = 0 ; i < declarators.size(); ++i) { + declarators[i]->Print(); + printf("%s", (i == declarators.size() - 1) ? 
"]" : ", "); + } +} + +/////////////////////////////////////////////////////////////////////////// + +void +GetStructTypesAndNames(const std::vector &sd, + std::vector *elementTypes, + std::vector *elementNames) { + for (unsigned int i = 0; i < sd.size(); ++i) { + const Type *type = sd[i]->type; + // FIXME: making this fake little DeclSpecs here is really + // disgusting + DeclSpecs ds(type); + if (type->IsUniformType()) + ds.typeQualifier |= TYPEQUAL_UNIFORM; + else + ds.typeQualifier |= TYPEQUAL_VARYING; + + for (unsigned int j = 0; j < sd[i]->declarators->size(); ++j) { + Declarator *d = (*sd[i]->declarators)[j]; + d->InitFromDeclSpecs(&ds); + + // if it's an unsized array, make it a reference to an unsized + // array, so the caller can pass a pointer... + const ArrayType *at = dynamic_cast(d->sym->type); + if (at && at->GetElementCount() == 0) + d->sym->type = new ReferenceType(d->sym->type, type->IsConstType()); + + elementTypes->push_back(d->sym->type); + elementNames->push_back(d->sym->name); + } + } +} diff --git a/decl.h b/decl.h new file mode 100644 index 00000000..84f6147e --- /dev/null +++ b/decl.h @@ -0,0 +1,203 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
 + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/** @file decl.h + @brief Declarations related to type declarations; the parser basically + creates instances of these classes, which are then turned into actual + Types. + + Three classes work together to represent declarations. As an example, + consider a declaration like: + + static uniform int foo, bar[10]; + + An instance of the Declaration class represents this entire declaration + of two variables, 'foo' and 'bar'. It holds a single instance of the + DeclSpecs class, which represents the common specifiers for all of the + variables--here, that the declaration has the 'static' and 'uniform' + qualifiers, and that its basic type is 'int'. Then for each variable + declaration, the Declaration class holds an instance of a Declarator, + which in turn records the per-variable information like the symbol + name, array size (if any), initializer expression, etc. +*/ + +#ifndef ISPC_DECL_H +#define ISPC_DECL_H + +#include "ispc.h" + +enum StorageClass { + SC_NONE, + SC_EXTERN, + SC_EXPORT, + SC_STATIC, + SC_TYPEDEF, + SC_EXTERN_C +}; + + +/* Multiple qualifiers can be provided with types in declarations; + therefore, they are set up so that they can be ANDed together into an + int. 
*/ +#define TYPEQUAL_NONE 0 +#define TYPEQUAL_CONST (1<<0) +#define TYPEQUAL_UNIFORM (1<<1) +#define TYPEQUAL_VARYING (1<<2) +#define TYPEQUAL_TASK (1<<3) +#define TYPEQUAL_REFERENCE (1<<4) +#define TYPEQUAL_UNSIGNED (1<<5) +#define TYPEQUAL_INLINE (1<<6) + +/** @brief Representation of the declaration specifiers in a declaration. + + In other words, this represents all of the stuff that applies to all of + the (possibly multiple) variables in a declaration. + */ +class DeclSpecs { +public: + DeclSpecs(const Type *t = NULL, StorageClass sc = SC_NONE, int tq = TYPEQUAL_NONE); + + void Print() const; + + StorageClass storageClass; + + /** Zero or more of the TYPEQUAL_* values, ANDed together. */ + int typeQualifier; + + /** The basic type provided in the declaration; this should be an + AtomicType, a StructType, or a VectorType; other types (like + ArrayTypes) will end up being created if a particular declaration + has an array size, etc. + */ + const Type *baseType; + + /** If this is a declaration with a vector type, this gives the vector + width. For non-vector types, this is zero. + */ + int vectorSize; + + /** If this is a declaration with an "soa" qualifier, this gives the + SOA width specified. Otherwise this is zero. + */ + int soaWidth; +}; + + +/** @brief Representation of the declaration of a single variable. + + In conjunction with an instance of the DeclSpecs, this gives us + everything we need for a full variable declaration. + */ +class Declarator { +public: + Declarator(Symbol *s, SourcePos p); + + /** As the parser peels off array dimension declarations after the + symbol name, it calls this method to provide them to the + Declarator. + */ + void AddArrayDimension(int size); + + /** Once a DeclSpecs instance is available, this method completes the + initialization of the Symbol, setting its Type accordingly. 
+ */ + void InitFromDeclSpecs(DeclSpecs *ds); + + /** Get the actual type of the combination of Declarator and the given + DeclSpecs */ + const Type *GetType(DeclSpecs *ds) const; + + void Print() const; + + const SourcePos pos; + Symbol *sym; + /** If this declarator includes an array specification, the sizes of + the array dimensions are represented here. + */ + std::vector arraySize; + /** Initialization expression for the variable. May be NULL. */ + Expr *initExpr; + bool isFunction; + std::vector *functionArgs; +}; + + +/** @brief Representation of a full declaration of one or more variables, + including the shared DeclSpecs as well as the per-variable Declarators. + */ +class Declaration { +public: + Declaration(DeclSpecs *ds, std::vector *dlist = NULL) { + declSpecs = ds; + if (dlist != NULL) + declarators = *dlist; + for (unsigned int i = 0; i < declarators.size(); ++i) + if (declarators[i] != NULL) + declarators[i]->InitFromDeclSpecs(declSpecs); + } + Declaration(DeclSpecs *ds, Declarator *d) { + declSpecs = ds; + if (d) { + d->InitFromDeclSpecs(ds); + declarators.push_back(d); + } + } + + /** Adds the symbols for the variables in the declaration to the symbol + table. */ + void AddSymbols(SymbolTable *st) const; + void Print() const; + + DeclSpecs *declSpecs; + std::vector declarators; +}; + + +/** The parser creates instances of StructDeclaration for the members of + structs as it's parsing their declarations. */ +struct StructDeclaration { + StructDeclaration(const Type *t, std::vector *d) + : type(t), declarators(d) { } + + const Type *type; + std::vector *declarators; +}; + + +/** Given a set of StructDeclaration instances, this returns the types of + the elements of the corresponding struct and their names. 
*/ +extern void GetStructTypesAndNames(const std::vector &sd, + std::vector *elementTypes, + std::vector *elementNames); + +#endif // ISPC_DECL_H diff --git a/docs/build.sh b/docs/build.sh new file mode 100755 index 00000000..6de1e93d --- /dev/null +++ b/docs/build.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +rst2html ispc.txt > ispc.html + +#rst2latex --section-numbering --documentclass=article --documentoptions=DIV=9,10pt,letterpaper ispc.txt > ispc.tex +#pdflatex ispc.tex +#/bin/rm -f ispc.aux ispc.log ispc.out ispc.tex diff --git a/docs/ispc.txt b/docs/ispc.txt new file mode 100644 index 00000000..76f13595 --- /dev/null +++ b/docs/ispc.txt @@ -0,0 +1,2640 @@ +========================================= +Intel® SPMD Program Compiler User's Guide +========================================= + +``ispc`` is a compiler for writing SPMD (single program multiple data) +programs to run on the CPU. The SPMD programming approach is widely known +to graphics and GPGPU programmers; it is used for GPU shaders and CUDA\* and +OpenCL\* kernels, for example. The main idea behind SPMD is that one writes +programs as if they were operating on a single data element (a pixel for a +pixel shader, for example), but then the underlying hardware and runtime +system executes multiple invocations of the program in parallel with +different inputs (the values for different pixels, for example). + +The main goals behind ``ispc`` are to: + +* Build a small C-like language that can deliver good performance to + performance-oriented programmers who want to run SPMD programs on + CPUs. +* Provide a thin abstraction layer between the programmer and the + hardware--in particular, to follow the lesson from C for serial programs + of having an execution and data model where the programmer can cleanly + reason about the mapping of their source program to compiled assembly + language and the underlying hardware. 
+* Harness the computational power of the Single Program, Multiple Data (SIMD) vector + units without the extremely low-programmer-productivity activity of directly + writing intrinsics. +* Explore opportunities from close-coupling between C/C++ application code + and SPMD ``ispc`` code running on the same processor--lightweight function + calls between the two languages, sharing data directly via pointers without + copying or reformatting, etc. + +``ispc`` has already successfully delivered significant speedups for a +number of non-trivial workloads that aren't handled well by other +compilation approaches (e.g. loop auto-vectorization.) + +Contents: + +* `Recent Changes to ISPC`_ + +* `Getting Started with ISPC`_ + + + `Installing ISPC`_ + + `Compiling and Running a Simple ISPC Program`_ + +* `Using The ISPC Compiler`_ + + + `Command-line Options`_ + +* `The ISPC Language`_ + + + `Lexical Structure`_ + + `Basic Types and Type Qualifiers`_ + + `Short Vector Types`_ + + `Struct and Array Types`_ + + `Declarations and Initializers`_ + + `Function Declarations`_ + + `Expressions`_ + + `Control Flow`_ + + `Functions`_ + + `C Constructs not in ISPC`_ + +* `Parallel Execution Model in ISPC`_ + + + `The SPMD-on-SIMD Execution Model`_ + + `Uniform and Varying Qualifiers`_ + + `Mapping Data to Program Instances`_ + + `"Coherent" Control Flow Statements`_ + + `Program Instance Convergence`_ + + `Data Races`_ + + `Uniform Variables and Varying Control Flow`_ + + `Task Parallelism in ISPC`_ + +* `The ISPC Standard Library`_ + + + `Math Functions`_ + + `Output Functions`_ + + `Cross-Lane Operations`_ + + `Low-Level Bits`_ + +* `Interoperability with the Application`_ + + + `Interoperability Overview`_ + + `Data Layout`_ + + `Data Alignment and Aliasing`_ + +* `Using ISPC Effectively`_ + + + `Restructuring Existing Programs to Use ISPC`_ + + `Understanding How to Interoperate With the Application's Data`_ + + `Communicating Between SPMD Program Instances`_ + + `Gather and 
Scatter`_ + + `Low-level Vector Tricks`_ + + `Debugging`_ + + `The "Fast math" Option`_ + + `"Inline" Aggressively`_ + + `Small Performance Tricks`_ + + `Instrumenting Your ISPC Programs`_ + +* `Disclaimer and Legal Information`_ + +* `Optimization Notice`_ + +Recent Changes to ISPC +====================== + +This section summarizes recent changes and bugfixes. + +* 17 May: Fixed a number of bugs related to error handling in Windows*. In + particular, if you use the ``/E`` command line flag to ``cl.exe`` (rather + than ``/EP``) when using it as a preprocessor, then ``ispc`` will + correctly report the source file position with warnings and errors. + +* 15 May: Improved error messages and warnings in many cases. For example, + the column number is reported along with the line number and + the source line with the error is printed as part of the message. + +* 8 May: ``ispc``'s typechecker has been substantially improved in how it + handles ``const``-qualified types. Some programs that previously + compiled may now fail with errors related to ``const``. For example, + ``ispc`` issues an error message if you try to assign a member of a const + structure. + +* 2 May: "uniform" short-vector types are now stored across the lanes of + the SIMD registers. This enables you to also write classic 'explicit + vector' computation in ``ispc`` as well. This change does change how + these types are laid out in memory; see `Data Layout`_ for more details. + +Getting Started with ISPC +========================= + +Installing ISPC +--------------- + +The `ispc downloads web page`_ has prebuilt executables for Windows\*, +Linux\* and Mac OS\* available for download. Alternatively, you can +download the source code from that page and build it yourself; see the +`ispc wiki`_ for instructions about building ``ispc`` from source. + +.. _ispc downloads web page: downloads.html +.. 
_ispc wiki: http://github.com/ispc/ispc/wiki + +Once you have an executable for your system, copy it into a directory +that's in your ``PATH``. Congratulations--you've now installed ``ispc``. + +Compiling and Running a Simple ISPC Program +------------------------------------------- + +The directory ``examples/simple`` in the ``ispc`` distribution includes a +simple example of how to use ``ispc`` with a short C++ program. See the +file ``simple.ispc`` in that directory (also reproduced here.) + +:: + + export void simple(uniform float vin[], uniform float vout[], + uniform int count) { + for (uniform int i = 0; i < count; i += programCount) { + int index = i + programIndex; + float v = vin[index]; + if (v < 3.) + v = v * v; + else + v = sqrt(v); + vout[index] = v; + } + } + +This program loops over an array of values in ``vin`` and computes an +output value for each one. For each value in ``vin``, if its value is less +than three, the output is the value squared, otherwise it's the square root +of the value. + +The first thing to notice in this program is the presence of the ``export`` +keyword in the function definition; this indicates that the function should +be made available to be called from application code. The ``uniform`` +qualifiers on the parameters to ``simple`` as well as for the variable +``i`` indicate that the corresponding variables are non-vector +quantities--they are discussed in detail in the `Uniform and Varying +Qualifiers`_ section. + +Each iteration of the for loop works on a number of input values in +parallel. The built-in ``programCount`` variable indicates how many +program instances are running in parallel; it is equal to the SIMD width of +the machine. (For example, the value is four on Intel® SSE, eight on +Intel® AVX, etc.) Thus, we can see that each execution of the loop will +work on that many output values in parallel. 
There is an implicit +assumption that ``programCount`` divides the ``count`` parameter without +remainder; the more general case can be handled with a small amount of +additional code. + +To load the ``programCount``-worth of values, the program computes an index +using the sum of ``i``, which gives the first value to work on in this +iteration, and ``programIndex``, which gives a unique integer identifier +for each running program instance, counting from zero. Thus, the load from +``vin`` loads the values at offset ``i+0``, ``i+1``, ``i+2``, ..., from the +``vin`` array into the vector variable ``v``. This general idiom should be +familiar to CUDA\* or OpenCL\* programmers, where thread ids serve a +similar role to ``programIndex`` in ``ispc``. See the section `Mapping +Data to Program Instances`_ for more detail. + +The program can then proceed, doing computation and control flow based on +the values loaded. The result from the running program instances is +written to the ``vout`` array before the next loop iteration runs. + +For a simple program like this one, the performance difference versus a +regular scalar C/C++ implementation is minimal. For more +complex programs that do more substantial amounts of computation, doing +that computation in parallel across the machine's SIMD lanes can have a +substantial performance benefit. + +On Linux\* and Mac OS\*, the makefile in that directory compiles this program. +For Windows\*, open the ``examples/examples.sln`` file in Microsoft Visual +C++ 2010\* to build this (and the other) examples. In either case, +build it now! We'll walk through the details of the compilation steps in +the following section, `Using The ISPC Compiler`_. In addition to +compiling the ``ispc`` program, in this case the ``ispc`` compiler also +generates a small header file, ``simple.h``. This header file includes the +declaration for the C-callable function that the above ``ispc`` program is +compiled to. 
The relevant parts of this file are: + +:: + + #ifdef __cplusplus + extern "C" { + #endif // __cplusplus + extern void simple(float vin[], float vout[], int32_t count); + #ifdef __cplusplus + } + #endif // __cplusplus + +It's not mandatory to ``#include`` the generated header file in your C/C++ +code (you can alternatively use a manually-written ``extern`` declaration +of the ``ispc`` functions you use), but it's a helpful check to ensure that +the function signatures are as expected on both sides. + +Here is the main program, ``simple.cpp``, which calls the ``ispc`` function +above. + +:: + + #include + #include "simple.h" + + int main() { + float vin[16], vout[16]; + for (int i = 0; i < 16; ++i) + vin[i] = i; + + simple(vin, vout, 16); + + for (int i = 0; i < 16; ++i) + printf("%d: simple(%f) = %f\n", i, vin[i], vout[i]); + } + +Note that the call to the ``ispc`` function in the middle of ``main()`` is +a regular function call. (And it has the same overhead as a C/C++ function +call, for that matter.) + +When the executable ``simple`` runs, it generates the expected output: + +:: + + 0: simple(0.000000) = 0.000000 + 1: simple(1.000000) = 1.000000 + 2: simple(2.000000) = 4.000000 + 3: simple(3.000000) = 1.732051 + ... + +There is also a small example of using ``ispc`` to compute the Mandelbrot +set; see the `Mandelbrot set example`_ page on the ``ispc`` website for a +walkthrough of it. + +.. _Mandelbrot set example: http://ispc.github.com/example.html + +Using The ISPC Compiler +======================= + +To go from a ``ispc`` source file to an object file that can be linked +with application code, enter the following command + +:: + + ispc foo.ispc -o foo.o + +On Linux\* and Mac OS\*, ``ispc`` automatically runs the C preprocessor on +your input program; under Windows\*, this must be done manually. 
With +Microsoft Visual C++ 2010\*, the following custom build step for +``ispc`` source files takes care of this job: + +:: + + cl /E /TP %(Filename).ispc | ispc - -o %(Filename).obj -h %(Filename).h + +The ``cl`` call runs the C preprocessor on the ``ispc`` file; the result is +piped to ``ispc`` to generate an object file and a header. As an example, +see the file ``simple.vcxproj`` in the ``examples/simple`` directory of the +``ispc`` distribution. + +Command-line Options +-------------------- + +The ``ispc`` executable can be run with ``--help`` to print a list of +accepted command-line arguments. By default, the compiler compiles the +provided program (and issues warnings and errors), but doesn't +generate any output. + +If the ``-o`` flag is given, it will generate an output file (a native +object file by default). To generate a text assembly file, pass +``--emit-asm``: + +:: + + ispc foo.ispc -o foo.s --emit-asm + +To generate LLVM bitcode, use the ``--emit-llvm`` flag. + +By default, an optimized x86-64 object file tuned for Intel® Core +CPUs is built. You can use the ``--arch`` command line flag to +specify a 32-bit x86 target: + +:: + + ispc foo.ispc -o foo.obj --arch=x86 + +Optimizations can be turned off with ``-O0``: + +:: + + ispc foo.ispc -o foo.obj -O0 + +On Mac\* and Linux\*, there is early support for generating debugging +symbols; this is enabled with the ``-g`` command-line flag. + +The ``-h`` flag can also be used to direct ``ispc`` to generate a C/C++ +header file that includes C/C++ declarations of the C-callable ``ispc`` +functions and the types passed to it. + +On Linux\* and Mac OS\*, ``-D`` can be used to specify definitions to be +passed along to the C pre-processor, which runs over the program input +before it's compiled. On Windows®, pre-processor definitions should be +provided to the ``cl`` call. + +By default, the compiler generates x86-64 Intel® SSE4 code. 
To generate +32-bit code, you can use the ``--arch=x86`` command-line flag. To +select Intel® SSE2, use ``--target=sse2``. + +``ispc`` supports an alternative method for generating Intel® SSE4 code, +where the program is "doubled up" and eight instances of it run in +parallel, rather than just four. For workloads that don't require large +numbers of registers, this method can lead to significantly more efficient +execution thanks to greater instruction level parallelism. This option is +selected with ``--target=sse4x2``. + +The compiler issues a number of performance warnings for code constructs +that compile to relatively inefficient code. These warnings can be +silenced with the ``--wno-perf`` flag (or by using ``--woff``, which turns +off all warnings.) + + +The ISPC Language +================= + +``ispc``'s syntax is based on C and is designed to be as similar to C +as much as possible. Between syntactic differences and the fundamentally +parallel execution model (versus C's serial model), C code is not directly +portable to ``ispc``, although starting with working C code and porting it +to ``ispc`` can be an efficient way to write ``ispc`` programs. + +Lexical Structure +----------------- + +Tokens in ``ispc`` are delimited by white-space and comments. The +white-space characters are the usual set of spaces, tabs, and carriage +returns/line feeds. Comments can be delineated with ``//``, which starts a +comment that continues to the end of the line, or the start of a comment +can be delineated with ``/*`` and the end with ``*/``. Like in C/C++, +comments can't be nested. + +Identifiers in ``ispc`` are sequences of characters that start with an +underscore or an upper-case or lower-case letter, and then followed by +zero or more letters, numbers, or underscores. + +Integer numeric constants can be specified in base 10 or in hexadecimal. +Base 10 constants are given by a sequence of one or more digits from 0 to +9. 
Hexadecimal constants are denoted by a leading ``0x`` and then one or +more digits from 0-9, a-f, or A-F. + +Floating-point constants can be specified in one of three ways. First, +they may be a sequence of zero or more digits from 0 to 9, followed by a +period, followed by zero or more digits from 0 to 9. (There must be at +least one digit before or after the period). + +The second option is scientific notation, where a base value is specified +as the first form of a floating-point constant but is then followed by an +"e" or "E", then a plus sign or a minus sign, and then an exponent. + +Finally, floating-point constants may be specified as hexadecimal +constants; this form can ensure a perfectly bit-accurate representation of +a particular floating-point number. These are specified with a "0x" +prefix, followed by a zero or a one, a period, and then the remainder of +the mantissa in hexadecimal form, with digits from 0-9, a-f, or A-F. The +start of the exponent is denoted by a "p", which is then followed by an +optional plus or minus sign and then digits from 0 to 9. For example: + +:: + + float two = 0x1p+1; // 2.0 + float pi = 0x1.921fb54442d18p+1; // 3.1415926535... + float neg = -0x1.ffep+11; // -4095. + +Floating-point constants can optionally have a "f" or "F" suffix (``ispc`` +currently treats all floating-point constants as having 32-bit precision, +making this suffix unnecessary.) + +String constants in ``ispc`` are denoted by an opening double quote ``"`` +followed by any character other than a newline, up to a closing double +quote. Within the string, a number of special escape sequences can be used +to specify special characters. These sequences all start with an initial +``\`` and are listed below: + +.. 
list-table:: Escape sequences in strings + + * - ``\\`` + - backslash: ``\`` + * - ``\"`` + - double quotation mark: ``"`` + * - ``\'`` + - single quotation mark: ``'`` + * - ``\a`` + - bell (alert) + * - ``\b`` + - backspace character + * - ``\f`` + - formfeed character + * - ``\n`` + - newline + * - ``\r`` + - carriage return + * - ``\t`` + - horizontal tab + * - ``\v`` + - vertical tab + * - ``\`` followed by one or more digits from 0-7 + - ASCII character in octal notation + * - ``\x``, followed by one or more digits from 0-9, a-f, A-F + - ASCII character in hexadecimal notation + +``ispc`` doesn't support a string data type; string constants can be passed +as the first argument to the ``print()`` statement, however. ``ispc`` also +doesn't support character constants. + +The following identifiers are reserved as language keywords: ``bool``, +``break``, ``case``, ``cbreak``, ``ccontinue``, ``cdo``, ``cfor``, +``char``, ``cif``, ``cwhile``, ``const``, ``continue``, ``creturn``, +``default``, ``do``, ``double``, ``else``, ``enum``, ``export``, +``extern``, ``false``, ``float``, ``for``, ``goto``, ``if``, ``inline``, ``int``, +``int32``, ``int64``, ``launch``, ``print``, ``reference``, ``return``, +``signed``, ``sizeof``, ``soa``, ``static``, ``struct``, ``switch``, +``sync``, ``task``, ``true``, ``typedef``, ``uniform``, ``union``, +``unsigned``, ``varying``, ``void``, ``volatile``, ``while``. + +``ispc`` defines the following operators and punctuation: + +.. 
list-table:: Operators + + * - Symbols + - Use + * - ``=`` + - Assignment + * - ``+``, ``-``, \*, ``/``, ``%`` + - Arithmetic operators + * - ``&``, ``|``, ``^``, ``!``, ``~``, ``&&``, ``||``, ``<<``, ``>>`` + - Logical and bitwise operators + * - ``++``, ``--`` + - Pre/post increment/decrement + * - ``<``, ``<=``, ``>``, ``>=``, ``==``, ``!=`` + - Relational operators + * - ``*=``, ``/=``, ``+=``, ``-=``, ``<<=``, ``>>=``, ``&=``, ``|=`` + - Compound assignment operators + * - ``?``, ``:`` + - Selection operators + * - ``;`` + - Statement separator + * - ``,`` + - Expression separator + * - ``.`` + - Member access + +A number of tokens are used for grouping in ``ispc``: + +.. list-table:: Grouping Tokens + + * - ``(``, ``)`` + - Parenthesization of expressions, function calls, delimiting specifiers + for control flow constructs. + * - ``[``, ``]`` + - Array and short-vector indexing + * - ``{``, ``}`` + - Compound statements + + +Basic Types and Type Qualifiers +------------------------------- + +``ispc`` is a statically-typed language. It supports a variety of basic +types. + +* ``void``: "empty" type representing no value. +* ``bool``: boolean value; may be assigned ``true``, ``false``, or the + value of a boolean expression. +* ``int``: 32-bit signed integer; may also be specified as ``int32``. +* ``unsigned int``: 32-bit unsigned integer; may also be specified as + ``unsigned int32``. +* ``float``: 32-bit floating point value +* ``int64``: 64-bit signed integer. +* ``unsigned int64``: 64-bit unsigned integer. +* ``double``: 64-bit double-precision floating point value. + +Implicit type conversion between values of different types is done +automatically by the ``ispc`` compiler. Thus, a value of ``float`` type +can be assigned to a variable of ``int`` type directly. 
In binary +arithmetic expressions with mixed types, types are promoted to the "more +general" of the two types, with the following precedence: + +:: + + double > uint64 > int64 > float > uint32 > int32 > bool + +In other words, adding an ``int64`` to a ``double`` causes the ``int64`` to +be converted to a ``double``, the addition to be performed, and a +``double`` value to be returned. If a different conversion behavior is +desired, then explicit type-casts can be used, where the destination type +is provided in parenthesis around the expression: + +:: + + double foo = 1. / 3.; + int bar = (float)bar + (float)bar; // 32-bit float addition + +Note: if a ``bool`` is converted to an integer numeric type (``int``, +``int64``, etc.), then the conversion is done with sign extension, not zero +extension. Thus, the resulting value has all bits set if the ``bool`` is +``true``; for example, ``0xffffffff`` for ``int32``. This differs from C +and C++, where a ``true`` bool is converted to the integer value one. + +Variables can be declared with the ``const`` qualifier, which prohibits +their modification. + +:: + + const float PI = 3.1415926535; + +As in C, the ``extern`` qualifier can be used to declare a function or +global variable defined in another source file, and the ``static`` +qualifier can be used to define a variable or function that is only visible +in the current scope. The values of ``static`` variables declared in +functions are preserved across function calls. + +The ``typedef`` keyword can be used to name types: + +:: + + typedef Float3 float[3]; + +``typedef`` doesn't create a new type: it just provides an alternative name +for an existing type. Thus, in the above example, it is legal to pass a +value with ``float[3]`` type to a function that has been declared to take a +``Float3`` parameter. 
+ +``ispc`` provides a ``reference`` qualifier that can be used for passing +values to functions by reference so that functions can return multiple +results or modify existing variables. + +:: + + void increment(reference float f) { + ++f; + } + +``ispc`` doesn't currently support pointer types. + + +Short Vector Types +------------------ + +``ispc`` supports a parameterized type to define short vectors. These +short vectors can only be used with basic types like ``float`` and ``int``; +they can't be applied to arrays or structures. Note: ``ispc`` does *not* +use these short vectors to facilitate program vectorization; they are +purely a syntactic convenience. Using them or writing the corresponding +code without them shouldn't lead to any noticeable performance differences +between the two approaches. + +Syntax similar to C++ templates is used to declare these types: + +:: + + float<3> foo; // vector of three floats + double<6> bar; + +The length of these vectors can be arbitrarily long, though the expected +usage model is relatively short vectors. + +You can use ``typedef`` to create types that don't carry around +the brackets around the vector length: + +:: + + typedef float<3> float3; + +``ispc`` doesn't support templates in general. In particular, +not only must the vector length be a compile-time constant, but it's +also not possible to write functions that are parameterized by vector +length. + +:: + + uniform int i = foo(); + // ERROR: length must be compile-time constant + float vec; + // ERROR: can't write functions parameterized by vector length + float func(float val); + +Arithmetic on these short vector types works as one would expect; the +operation is applied component-wise to the values in the vector. Here is a +short example: + +:: + + float<3> func(float<3> a, float<3> b) { + a += b; // add individual elements of a and b + a *= 2.; // multiply all elements of a by 2 + bool<3> test = a < b; // component-wise comparison + return test ? 
a : b; // return each minimum component + } + +As shown by the above code, scalar types automatically convert to +corresponding vector types when used in vector expressions. In this +example, the constant ``2.`` above is converted to a three-vector of 2s for +the multiply in the second line of the function implementation. + +Type conversion between other short vector types also works as one would +expect, though the two vector types must have the same length: + +:: + + float<3> foo = ...; + int<3> bar = foo; // ok, cast elements to ints + int<4> bat = foo; // ERROR: different vector lengths + float<4> bing = foo; // ERROR: different vector lengths + +There are two mechanisms to access the individual elements of these short +vector data types. The first is with the array indexing operator: + +:: + + float<4> foo; + for (uniform int i = 0; i < 4; ++i) + foo[i] = i; + +``ispc`` also provides a specialized mechanism for naming and accessing +the first few elements of short vectors based on an overloading of +the structure member access operator. The syntax is similar to that used +in HLSL, for example. + +:: + + float<3> position; + position.x = ...; + position.y = ...; + position.z = ...; + +More specifically, the first element of any short vector type can be +accessed with ``.x`` or ``.r``, the second with ``.y`` or ``.g``, the third +with ``.z`` or ``.b``, and the fourth with ``.w`` or ``.a``. Just like +using the array indexing operator with an index that is greater than the +vector size, accessing an element that is beyond the vector's size is +undefined behavior and may cause your program to crash. + +Note: ``ispc`` doesn't support the "swizzling" operations that languages +like HLSL do. Only a single element of the vector can be accessed at a +time with these member operators. 
+ +:: + + float<3> foo = ...; + float<2> bar = foo.xy; // ERROR + foo.xz = ...; // ERROR + func(foo.xyx); // ERROR + +For convenience, short vectors can be initialized with a list of individual +element values: + +:: + + float x = ..., y = ..., z = ...; + float<3> pos = { x, y, z }; + + +Struct and Array Types +---------------------- + +More complex data structures can be built using ``struct`` and arrays. + +:: + + struct Foo { + float time; + int flags[10]; + }; + +The size of arrays must be a compile-time constant, though functions can be +declared to take "unsized arrays" as parameters so that arrays of any size +may be passed: + +:: + + void foo(float array[], int length); + +As in C++, after a ``struct`` is declared, an instance can be created using +the ``struct``'s name: + +:: + + Foo f; + +Alternatively, ``struct`` can be used before the structure name: + +:: + + struct Foo f; + + +Declarations and Initializers +----------------------------- + +Variables are declared and assigned just as in C: + +:: + + float foo = 0, bar[5]; + float bat = func(foo); + +If a variable is declared without an initializer expression, then its value +is undefined until a value is assigned to it. Reading an undefined +variable may lead to unexpected program behavior. + +Any variable that is declared at file scope (i.e. outside a function) is a +global variable. If a global variable is qualified with the ``static`` +keyword, then its only visible within the compilation unit in which it was +defined. As in C/C++, a variable with a ``static`` qualifier inside a +functions maintains its value across function invocations. + +Like C++, variables don't need to be declared at the start of a basic +block: + +:: + + int foo = ...; + if (foo < 2) { ... } + int bar = ...; + +Variables can also be declared in ``for`` statement initializers: + +:: + + for (int i = 0; ...) 
+ +Arrays can be initialized with either a scalar value or with individual +element values in braces: + +:: + + int foo[10] = x; // all ten elements take the value of x + int bar[2][4] = { { 1, 2, 3, 4 }, { 5, 6, 7, 8 } }; + +Structures can also be initialized both with scalar values or with element +values in braces: + +:: + + struct Color { float r, g, b; }; + .... + Color c = 1; // all are one + Color d = { 0.5, .75, 1.0 }; // r = 0.5, ... + + +Function Declarations +--------------------- + +Functions can be declared with a number of qualifiers that affect their +visibility and capabilities. As in C/C++, functions have global visibility +by default. If a function is declared with a ``static`` qualifier, then it +is only visible in the file in which it was declared. + +Any function that can be launched with the ``launch`` construct in ``ispc`` +must have a ``task`` qualifier; see `Task Parallelism in ISPC`_ for more +discussion of launching tasks in ``ispc``. + +Functions that are intended to be called from C/C++ application code must +have the ``export`` qualifier. This causes them to have regular C linkage +and to have their declarations included in header files, if the ``ispc`` +compiler is directed to generated a C/C++ header file for the file it +compiled. + +Finally, any function defined with an ``inline`` qualifier will always be +inlined by ``ispc``; ``inline`` is not a hint, but forces inlining. The +compiler will opportunistically inline short functions depending on their +complexity, but any function that should always be inlined should have the +``inline`` qualifier. + + +Expressions +----------- + +All of the operators from C that you'd expect for writing expressions are +present. Rather than enumerating all of them, here is a short summary of +the range of them available in action. + +:: + + unsigned int i = 0x1234feed; + unsigned int j = (i << 3) ^ ~(i - 3); + i += j / 6; + float f = 1.234e+23; + float g = j * f / (2.f * i); + double h = (g < 2) ? 
f : g/5; + +Structure member access and array indexing also work as in C. + +:: + + struct Foo { float f[5]; int i; }; + Foo foo = { { 1,2,3,4,5 }, 2 }; + return foo.f[4] - foo.i; + + +Control Flow +------------ + +``ispc`` supports most of C's control flow constructs, including ``if``, +``for``, ``while``, ``do``. You can use ``break`` and ``continue`` +statements in ``for``, ``while``, and ``do`` loops. + +There are variants of the ``if``, ``do``, ``while``, ``for``, ``break``, +``continue``, and ``return`` statements (``cif``, ``cdo``, ``cwhile``, +``cfor``, ``cbreak``, ``ccontinue``, and ``creturn``, respectively) that +provide the compiler a hint that the control flow is expected to be +coherent at that particular point, thus allowing the compiler to do +additional optimizations for that case. These are described in the +`"Coherent" Control Flow Statements`_ section. + +``ispc`` does not support ``switch`` statements or ``goto``. + +Functions +--------- + +Like C, functions must be declared before they are called, though a forward +declaration can be used before the actual function definition. Functions +can be overloaded by parameter type. Given multiple definitions of a +function, ``ispc`` uses the following methods to try to find a match. If +a single match of a given type is found, it is used; if multiple matches of +a given type are found, an error is issued. + +* All parameter types match exactly. +* All parameter types match exactly, where any ``reference``-qualified + parameters are considered equivalent to their underlying type. +* Parameters match with only promotions from ``uniform`` to ``varying`` + type. +* Parameters match using standard type conversion (``int`` to ``float``, + ``float`` to ``int``.) + +Also like C, arrays are passed to functions by reference. + + +C Constructs not in ISPC +------------------------- + +The following C features are not available in ``ispc``. 
+
+* ``enum`` s
+* Pointers and function pointers
+* ``char`` and ``short`` types
+* ``switch`` statements
+* bitfield members in structures
+* ``union``
+* ``goto``
+
+
+Parallel Execution Model in ISPC
+================================
+
+Though ``ispc`` has C-based syntax, it is inherently a language for
+parallel computation. Understanding the details of ``ispc``'s parallel
+execution model is critical for writing efficient and correct programs in
+``ispc``.
+
+``ispc`` supports both task parallelism to parallelize across multiple
+cores and SPMD parallelism to parallelize across the SIMD vector lanes on a
+single core. This section focuses on SPMD parallelism. See the section
+`Task Parallelism in ISPC`_ for discussion of task parallelism in ``ispc``.
+
+The SPMD-on-SIMD Execution Model
+--------------------------------
+
+In the SPMD model as implemented in ``ispc``, you write programs that
+compute a set of outputs based on a set of inputs. You must write these
+programs so that it is safe to run multiple instances of them in
+parallel--i.e. given a program and a set of inputs, the programs shouldn't
+have any assumptions about the order in which they will be run over the
+inputs, or about whether one program instance will have completed before
+another runs. [#]_
+
+.. [#] This is essentially the same requirement that languages like CUDA\*
+   and OpenCL\* place on the programmer.
+
+Given this guarantee, the ``ispc`` compiler can safely execute multiple
+program instances in parallel, across the SIMD lanes of a single CPU. In
+many cases, this execution approach can achieve higher overall performance
+than if the program instances had been run serially.
+
+Upon entry to an ``ispc`` function, the execution model switches from
+the application's serial model to SPMD. Conceptually, a number of
+``ispc`` program instances will start running in parallel. This
+parallelism doesn't involve launching hardware threads.
Rather, one +program instance is mapped to each of the SIMD lanes of the CPU's vector +unit (Intel® SSE or Intel® AVX). + +If a ``ispc`` program is written to do a the following computation: + +:: + + float x = ..., y = ...; + return x+y; + +and if the ``ispc`` program is running four-wide on a CPU that supports the +Intel® SSE instructions, then four program instances are running in +parallel, each adding a pair of scalar values. However, these four program +instances store their individual scalar values for ``x`` and ``y`` in the +lanes of an Intel® SSE vector register, so the addition operation for all +four program instances can be done in parallel with a single ``addps`` +instruction. + +Program execution is more complicated in the presence of control flow. The +details are handled by the ``ispc`` compiler, but you may find it helpful +to understand what is going on in order to be a more effective ``ispc`` +programmer. In particular, the mapping of SPMD to SIMD lanes can lead to +reductions in this SIMD efficiency as different program instances want to +perform different computations. For example, consider a simple ``if`` +statement: + +:: + + float x = ..., y = ...; + if (x < y) { + ... + } else { + ... + } + +In general, the test ``x + +Note the ``launch`` keyword and the brackets around the function call. +This code launches 100 tasks, each of which presumably does some +computation keyed off of given the value ``i``. In general, one should +launch many more tasks than there are processors in the system to +ensure good load-balancing, but not so many that the overhead of scheduling +and running tasks dominates the computation. + +Program execution continues asynchronously after task launch; thus, the +function shouldn't access values being generated by the tasks without +synchronization. 
A function uses a ``sync`` statement to wait for all
+launched tasks to finish:
+
+::
+
+    for (uniform int i = 0; i < 100; ++i)
+        launch < func(a, i); >
+    sync;
+    // now safe to use computed values in a[]...
+
+Alternatively, any function that launches tasks has an implicit ``sync``
+before it returns, so that functions that call a function that launches
+tasks don't have to worry about outstanding asynchronous computation.
+
+Inside functions with the ``task`` qualifier, two additional built-in
+variables are provided: ``threadIndex`` and ``threadCount``.
+``threadCount`` gives the total number of hardware threads that have been
+launched by the task system. ``threadIndex`` provides an index between
+zero and ``threadCount-1`` that gives a unique index that corresponds to
+the hardware thread that is executing the current task. The
+``threadIndex`` can be used for accessing data that is private to the
+current thread and thus doesn't require synchronization to access under
+parallel execution.
+
+If you use the task launch feature in ``ispc``, you must provide C/C++
+implementations of two functions and link them into your final executable
+file:
+
+::
+
+    void ISPCLaunch(void *funcptr, void *data);
+    void ISPCSync();
+
+These are called by the task launch code generated by the ``ispc``
+compiler; the first is called to launch a task and the second is called to
+wait for launched tasks to complete. (Factoring them out in this way
+allows ``ispc`` to inter-operate with the application's task system, if
+any, rather than having a separate one of its own.)
To run a particular +task, the task system should cast the function pointer to a ``void (*)(void +*, int, int)`` function pointer and then call it with the provided ``void +*`` data and then an index for the current hardware thread and the total +number of hardware threads the task system has launched--in other words: + +:: + + typedef void (*TaskFuncType)(void *, int, int); + TaskFuncType tft = (TaskFuncType)(funcptr); + tft(data, threadIndex, threadCount); + +A number of sample task system implementations are provided with ``ispc``; +see the files ``tasks_concrt.cpp``, ``tasks_gcd.cpp`` and +``tasks_pthreads.cpp`` in the ``examples/mandelbrot_tasks`` directory of +the ``ispc`` distribution. + + +The ISPC Standard Library +========================= + +``ispc`` has a standard library that is automatically available when +compiling ``ispc`` programs. (To disable the standard library, pass the +``--nostdlib`` command-line flag to the compiler.) + +Math Functions +-------------- + +The math functions in the standard library provide a relatively standard +range of mathematical functionality. + +A number of different implementations of the transcendental math functions +are available; the math library to use can be selected with the +``--math-lib=`` command line argument. The following values can be provided +for this argument. + +* ``default``: ``ispc``'s default built-in math functions. These have + reasonably high precision. (e.g. ``sin`` has a maximum absolute error of + approximately 1.45e-6 over the range -10pi to 10pi.) +* ``fast``: more efficient but lower accuracy versions of the default ``ispc`` + implementations. +* ``svml``: use Intel "Short Vector Math Library". Use + ``icc`` to link your final executable so that the appropriate libraries + are linked. +* ``system``: use the system's math library. On many systems, these + functions are more accurate than both of ``ispc``'s implementations. 
+ Using these functions may be quite + inefficient; the system math functions only compute one result at a time + (i.e. they aren't vectorized), so ``ispc`` has to call them once per + active program instance. (This is not the case for the other three + options.) + +In addition to an absolute value call, ``abs()``, ``signbits()`` extracts +the sign bit of the given value, returning ``0x80000000`` if the sign bit +is on (i.e. the value is negative) and zero if it is off. + +:: + + float abs(float a) + uniform float abs(uniform float a) + unsigned int signbits(float x) + +Standard rounding functions are provided. (On machines that support Intel® +SSE or Intel® AVX, these functions all map to variants of the ``roundss`` and +``roundps`` instructions, respectively.) + +:: + + float round(float x) + uniform float round(uniform float x) + float floor(float x) + uniform float floor(uniform float x) + float ceil(float x) + uniform float ceil(uniform float x) + +``rcp()`` computes an approximation to ``1/v``. The amount of error is +different on different architectures. + +:: + + float rcp(float v) + uniform float rcp(uniform float v) + +The square root of a given value can be computed with ``sqrt()``, which +maps to hardware square root intrinsics when available. An approximate +reciprocal square root, ``1/sqrt(v)`` is computed by ``rsqrt()``. Like +``rcp()``, the error from this call is different on different +architectures. + +:: + + float sqrt(float v) + uniform float sqrt(uniform float v) + float rsqrt(float v) + uniform float rsqrt(uniform float v) + +A standard set of minimum and maximum functions is available. These +functions also map to corresponding intrinsic functions. 
+ +:: + + float min(float a, float b) + uniform float min(uniform float a, uniform float b) + float max(float a, float b) + uniform float max(uniform float a, uniform float b) + unsigned int min(unsigned int a, unsigned int b) + uniform unsigned int min(uniform unsigned int a, + uniform unsigned int b) + unsigned int max(unsigned int a, unsigned int b) + uniform unsigned int max(uniform unsigned int a, + uniform unsigned int b) + +The ``clamp()`` functions clamp the provided value to the given range. +(Their implementations are based on ``min()`` and ``max()`` and are thus +quite efficient.) + +:: + + float clamp(float v, float low, float high) + uniform float clamp(uniform float v, uniform float low, + uniform float high) + unsigned int clamp(unsigned int v, unsigned int low, + unsigned int high) + uniform unsigned int clamp(uniform unsigned int v, + uniform unsigned int low, + uniform unsigned int high) + +``ispc`` provides a standard variety of calls for trigonometric functions: + +:: + + float sin(float x) + uniform float sin(uniform float x) + float cos(float x) + uniform float cos(uniform float x) + float tan(float x) + uniform float tan(uniform float x) + +Arctangent functions are also available: + +:: + + float atan(float x) + float atan2(float x, float y) + uniform float atan(uniform float x) + uniform float atan2(uniform float x, uniform float y) + +If both sine and cosine are needed, then the ``sincos()`` call computes +both more efficiently than two calls to the respective individual +functions: + +:: + + void sincos(float x, reference float s, reference float c) + void sincos(uniform float x, uniform reference float s, + uniform reference float c) + + +The usual exponential and logarithmic functions are provided. 
+
+::
+
+    float exp(float x)
+    uniform float exp(uniform float x)
+    float log(float x)
+    uniform float log(uniform float x)
+    float pow(float a, float b)
+    uniform float pow(uniform float a, uniform float b)
+
+Some functions that end up doing low-level manipulation of the
+floating-point representation in memory are available. As in the standard
+math library, ``ldexp()`` multiplies the value ``x`` by 2^n, and
+``frexp()`` directly returns the normalized mantissa and stores the
+normalized exponent as a power of two in the ``pw2`` parameter.
+
+::
+
+    float ldexp(float x, int n)
+    uniform float ldexp(uniform float x, uniform int n)
+    float frexp(float x, reference int pw2)
+    uniform float frexp(uniform float x,
+                        reference uniform int pw2)
+
+
+A simple random number generator is provided. State for the RNG
+is maintained in an instance of the ``RNGState`` structure, which is seeded
+with ``seed_rng()``.
+
+::
+
+    struct RNGState;
+    unsigned int random(reference uniform RNGState state)
+    float frandom(reference uniform RNGState state)
+    void seed_rng(reference uniform RNGState state,
+                  uniform int seed)
+
+Output Functions
+----------------
+
+``ispc`` has a simple ``print`` statement for printing values during
+program execution. In the following short ``ispc`` program, there are
+three uses of the ``print`` statement:
+
+::
+
+    export void foo(uniform float f[4], uniform int i) {
+        float x = f[programIndex];
+        print("i = %, x = %\n", i, x);
+        if (x < 2) {
+            ++x;
+            print("added to x = %\n", x);
+        }
+        print("last print of x = %\n", x);
+    }
+
+There are a few things to note. First, the function is called ``print``,
+not ``printf`` (unlike C). Second, the formatting string passed to this
+function only uses a single percent sign to denote where the corresponding
+value should be printed. You don't need to match the types of formatting
+operators with the types being passed.
However, you can't currently use
+the rich data formatting options that ``printf`` provides (e.g. constructs
+like ``%.10f``.).
+
+If this function is called with the array of floats (0,1,2,3) passed in for
+the ``f`` parameter and the value ``10`` for the ``i`` parameter, it
+generates the following output on a four-wide compilation target:
+
+::
+
+    i = 10, x = [0.000000,1.000000,2.000000,3.000000]
+    added to x = [1.000000,2.000000,_________,_________]
+    last print of x = [1.000000,2.000000,2.000000,3.000000]
+
+The values of a "varying" variable for all executing program instances are
+printed when a "varying" variable is printed. The result from the second
+print statement, which was called under control flow in the function
+``foo()`` above, and given the input array (0,1,2,3), only includes the
+first two program instances, since only they entered the ``if`` block.
+Therefore, the values for the inactive program instances aren't printed.
+(In other cases, they may have garbage values or be otherwise undefined.)
+
+
+Cross-Lane Operations
+---------------------
+
+Usually, ``ispc`` code expresses independent computation on separate data
+elements. There are, however, a number of cases where it's useful for the
+program instances to be able to cooperate in computing results. The
+cross-lane operations described in this section provide primitives for
+communication between the running program instances.
+
+A few routines evaluate conditions across the running program
+instances. For example, ``any()`` returns ``true`` if the given value
+``v`` is ``true`` for any of the SPMD program instances currently running,
+and ``all()`` returns ``true`` if it is true for all of them.
+
+::
+
+    uniform bool any(bool v)
+    uniform bool all(bool v)
+
+The various variants of ``popcnt()`` return the population count--the
+number of bits set in the given value.
+ +:: + + uniform int popcnt(uniform int v) + int popcnt(int v) + uniform int popcnt(bool v) + +The ``lanemask()`` function returns an integer that encodes which of the +current SPMD program instances are currently executing. The i'th bit is +set if the i'th SIMD lane is currently active. + +:: + + uniform int lanemask() + +You can compute reductions across the program instances. For example, the +values in each of the SIMD lanes ``x`` are added together by +``reduce_add()``. If this function is called under control flow, it only +adds the values for the currently active program instances. + +:: + + uniform float reduce_add(float x) + uniform int reduce_add(int x) + uniform unsigned int reduce_add(unsigned int x) + +You can also use functions to compute the minimum and maximum value of the +given value across all of the currently-executing vector lanes. + +:: + + uniform float reduce_min(float a, float b) + uniform int reduce_min(int a, int b) + uniform unsigned int reduce_min(unsigned int a, unsigned int b) + uniform float reduce_max(float a, float b) + uniform int reduce_max(int a, int b) + uniform unsigned int reduce_max(unsigned int a, unsigned int b) + + +Finally, there are routines for writing out and reading in values from +linear memory locations for the active program instances. +``packed_load_active()`` loads consecutive values from the given array, +starting at ``a[offset]``, loading one value for each currently-executing +program instance and storing it into that program instance's ``val`` +variable. It returns the total number of values loaded. Similarly, +``packed_store_active()`` stores the ``val`` values for each program +instances that executed the ``packed_store_active()`` call, storing the +results into the given array starting at the given offset. It returns the +total number of values stored. 
+ +:: + + uniform unsigned int packed_load_active(uniform int a[], + uniform int offset, + reference int val) + uniform unsigned int packed_store_active(uniform int a[], + uniform int offset, + int val) + + +As an example of how these functions can be used, the following code shows +the use of ``packed_store_active()``. The program instances that are +executing each compute some value ``x``; we'd like to record the program +index values of the program instances for which ``x`` is less than zero, if +any. In following the code, the ``programIndex`` value for each program +instance is written into the ``ids`` array only if ``x < 0`` for that +program instance. The total number of values written into ``ids`` is +returned from ``packed_store_active()``. + +:: + + uniform int ids[100]; + uniform int offset = 0; + float x = ...; + if (x < 0) + offset += packed_store_active(ids, offset, programIndex); + + +Finally, there are primitive operations that extract and set values in the +SIMD lanes. You can implement all of the operations described +above in this section from these routines, though in general, not as +efficiently. These routines are useful for implementing other reductions +and cross-lane communication that isn't included in the above, though. +Given a ``varying`` value, ``extract()`` returns the i'th element of it as +a single ``uniform`` value. Similarly, ``insert`` returns a new value +where the ``i`` th element of ``x`` has been replaced with the value ``v`` +. + +:: + + uniform float extract(float x, uniform int i) + uniform int extract(int x, uniform int i) + float insert(float x, uniform int i, uniform float v) + int insert(int x, uniform int i, uniform int v) + + +Low-Level Bits +-------------- + +``ispc`` provides a number of bit/memory-level utility routines in its +standard library as well. 
It has routines that load from and store
+to 8-bit and 16-bit integer values stored in memory, converting to and from
+32-bit integers for use in computation in ``ispc`` code. (These functions
+and this conversion step are necessary because ``ispc`` doesn't have native
+8-bit or 16-bit types in the language.)
+
+::
+
+    unsigned int load_from_int8(uniform int a[],
+                                uniform int offset)
+    void store_to_int8(uniform int a[], uniform int offset,
+                       unsigned int val)
+    unsigned int load_from_int16(uniform int a[],
+                                 uniform int offset)
+    void store_to_int16(uniform int a[], uniform int offset,
+                        unsigned int val)
+
+There are two things to note in these functions. First, note that these
+functions take ``unsigned int`` arrays as parameters; you need
+to cast the ``int8_t`` and ``int16_t`` pointers from the C/C++ side to
+``unsigned int`` when passing them to ``ispc`` code. Second, although the
+arrays are passed as ``unsigned int``, in the array indexing calculation,
+with the ``offset`` parameter, they are treated as if they were ``int8`` or
+``int16`` types. (i.e. the offset is treated as being in terms of number of
+8 or 16-bit elements.)
+
+The ``intbits()`` and ``floatbits()`` functions can be used to implement
+low-level floating-point bit twiddling. For example, ``intbits()`` returns
+an ``unsigned int`` that is a bit-for-bit copy of the given ``float``
+value. (Note: it is **not** the same as ``(int)a``, but corresponds to
+something like ``*((int *)&a)`` in C.)
+
+::
+
+    float floatbits(unsigned int a);
+    uniform float floatbits(uniform unsigned int a);
+    unsigned int intbits(float a);
+    uniform unsigned int intbits(uniform float a);
+
+
+The ``intbits()`` and ``floatbits()`` functions have no cost at runtime;
+they just let the compiler know how to interpret the bits of the given
+value. They make it possible to efficiently write functions that take
+advantage of the low-level bit representation of floating-point values.
+
+For example, the ``abs()`` function in the standard library is implemented
+as follows:
+
+::
+
+    float abs(float a) {
+        unsigned int i = intbits(a);
+        i &= 0x7fffffff;
+        return floatbits(i);
+    }
+
+That is, it clears the high-order bit to ensure that the given
+floating-point value is positive. This compiles down to a single ``andps``
+instruction when used with an Intel® SSE target, for example.
+
+
+Interoperability with the Application
+=====================================
+
+One of ``ispc``'s key goals is to make it easy to interoperate between the
+C/C++ application code and parallel code written in ``ispc``. This
+section describes the details of how this works and describes a number of
+the pitfalls.
+
+Interoperability Overview
+-------------------------
+
+As described in `Compiling and Running a Simple ISPC Program`_ it's
+relatively straightforward to call ``ispc`` code from C/C++. First, any
+``ispc`` functions to be called should be defined with the ``export``
+keyword:
+
+::
+
+    export void foo(uniform float a[]) {
+        ...
+    }
+
+
+This function corresponds to the following C-callable function:
+
+::
+
+    void foo(float a[]);
+
+
+(Recall from the `Uniform and Varying Qualifiers`_ section
+that ``uniform`` types correspond to a single instance of the
+corresponding type in C/C++.)
+
+In addition to variables passed from the application to ``ispc`` in the
+function call, you can also share global variables between the application
+and ``ispc``. To do so, just declare the global variable as usual (in
+either ``ispc`` or application code), and add an ``extern`` declaration on
+the other side.
+
+For example, given this ``ispc`` code:
+
+::
+
+    // ispc code
+    uniform float foo;
+    extern uniform float bar[10];
+
+And this C++ code:
+
+::
+
+    // C++ code
+    extern float foo;
+    float bar[10];
+
+Both the ``foo`` and ``bar`` global variables can be accessed on each
+side.
+
+``ispc`` code can also call back to C/C++.
On the ``ispc`` side, any
+application functions to be called must be declared with the ``extern "C"``
+qualifier.
+
+::
+
+    extern "C" void foo(uniform float f, uniform float g);
+
+Unlike in C++, ``extern "C"`` doesn't take braces to delineate
+multiple functions to be declared; thus, multiple C functions to be called
+from ``ispc`` must be declared as follows:
+
+::
+
+    extern "C" void foo(uniform float f, uniform float g);
+    extern "C" uniform int bar(uniform int a);
+
+It is illegal to overload functions declared with ``extern "C"`` linkage;
+``ispc`` issues an error in this case.
+
+Function calls back to C/C++ are not made if none of the program instances
+want to make the call. For example, given code like:
+
+::
+
+    uniform float foo = ...;
+    float x = ...;
+    if (x != 0)
+        foo = appFunc(foo);
+
+
+``appFunc()`` will only be called if one or more of the running program
+instances evaluates ``true`` for ``x != 0``. If the application code would
+like to determine which of the running program instances want to make the
+call, a mask representing the active SIMD lanes can be passed to the
+function.
+
+::
+
+    extern "C" float appFunc(uniform float x,
+                             uniform int activeLanes);
+
+If the function is then called as:
+
+::
+
+    ...
+    x = appFunc(x, lanemask());
+
+The ``activeLanes`` parameter will have the value one in the 0th bit if the
+first program instance is running at this point in the code, one in the
+first bit for the second instance, and so forth. (The ``lanemask()``
+function is documented in `Cross-Lane Operations`_.) Application code can
+thus be written as:
+
+::
+
+    float appFunc(float x, int activeLanes) {
+        for (int i = 0; i < programCount; ++i)
+            if ((activeLanes & (1 << i)) != 0) {
+                // do computation for i'th SIMD lane
+            }
+    }
+
+
+Data Layout
+-----------
+
+In general, ``ispc`` tries to ensure that ``struct`` s and other complex
+datatypes are laid out in the same way in memory as they are in C/C++.
+
+Matching alignment is important for easy interoperability between C/C++
+code and ``ispc`` code.
+
+The main complexity in sharing data between ``ispc`` and C/C++ often comes
+from reconciling data structures between ``ispc`` code and application
+code; it can be useful to declare the shared structures in ``ispc`` code
+and then examine the generated header file (which will have the C/C++
+equivalents of them.) For example, given a structure in ``ispc``:
+
+::
+
+    // ispc code
+    struct Node {
+        uniform int count;
+        uniform float pos[3];
+    };
+
+If the ``Node`` structure is used in the parameters to an ``export`` ed
+function, then the header file generated by the ``ispc`` compiler will
+have a declaration like:
+
+::
+
+    // C/C++ code
+    struct Node {
+        int count;
+        float pos[3];
+    };
+
+Because ``varying`` types have different sizes on different processor
+architectures, ``ispc`` prohibits any varying types from being used in
+parameters to functions with the ``export`` qualifier. (``ispc`` also
+prohibits passing structures that themselves have varying types as members,
+etc.) Thus, all datatypes that are shared with the application must have
+the ``uniform`` qualifier applied to them. (See `Understanding How to
+Interoperate With the Application's Data`_ for more discussion of how to
+load vectors of SoA or AoSoA data from the application.)
+
+While ``ispc`` doesn't support pointers, there are two mechanisms to work
+with pointers to arrays from the application.
First, ``ispc`` passes +arrays by reference (like C), if the application has allocated an array by: + +:: + + // C++ code + float *array = new float[count]; + +It can pass ``array`` to a ``ispc`` function defined as: + +:: + + export void foo(uniform float array[], uniform int count) + +(Though the pointer must be aligned to the compilation target's natural +vector width; see the discussion of alignment restrictions in `Data +Alignment and Aliasing`_ and the aligned allocation routines in +``examples/options/options.cpp`` for example.) + +Similarly, ``struct`` s from the application can have embedded pointers. +This is handled with similar ``[]`` syntax: + +:: + + // C code + struct Foo { + float *foo, *bar; + }; + +On the ``ispc`` side, the corresponding ``struct`` declaration is: + +:: + + // ispc + struct Foo { + uniform float foo[], bar[]; + }; + +There are two subtleties related to data layout to be aware of. First, the +C++ specification doesn't define the size or memory layout of ``bool`` s. +Therefore, it's dangerous to share ``bool`` values in memory between +``ispc`` code and C/C++ code. + +Second, ``ispc`` stores ``uniform`` short-vector types in memory with their +first element at the machine's natural vector alignment (i.e. 16 bytes for +a target that is using Intel® SSE, and so forth.) This implies that these +types will have different layout on different compilation targets. As +such, applications should in general avoid accessing ``uniform`` short +vector types from C/C++ application code if possible. + +Data Alignment and Aliasing +--------------------------- + +There are two important constraints that must be adhered to when passing +pointers from the application to ``ispc`` programs. + +The first constraint is alignment: any pointers from the host program that +are passed to ``ispc`` must be aligned to natural vector alignment of +system--for example, 16 byte alignment on a target that supports Intel® +SSE, 32-byte on an Intel® AVX target. 
If this constraint isn't met, the +program may abort at runtime with an unaligned memory access error. + +For example, in a ``ispc`` function with the following declaration: + +:: + + export void foo(uniform float in[], uniform float out[], + int count); + +If the application is passing stack-allocated arrays for ``in`` and +``out``, these C/C++ compiler must be told to align these arrays. + +:: + + // MSVC, SSE target + __declspec(align(16)) float in[16], out[16]; + foo(in, out, 16); + +With the gcc/clang compilers, the syntax for providing alignment is +slightly different: + +:: + + float x[16] __attribute__ ((__align__(16))); + foo(in, out, 16); + +If the data being passed is dynamically allocated, the appropriate system +aligned memory allocation routine should be used to allocate it (for +example, ``_aligned_malloc()`` with Windows\*, ``memalign()`` with +Linux\*; see the ``AllocAligned()`` function in ``examples/rt/rt.cpp`` for +an example.) + +It is also required that it be valid to read memory at the first element of +any array that is passed to ``ispc``. In practice, this should just +happen naturally, but it does mean that it is illegal to pass a ``NULL`` +pointer as a parameter to a ``ispc`` function called from the application. + +The second key constraint is that pointers and references in ``ispc`` +programs must not alias. The ``ispc`` compiler assumes that different +pointers can't end up pointing to the same memory location, either due to +having the same initial value, or through array indexing in the program as +it executed. + +This aliasing constraint also applies to ``reference`` parameters to +functions. Given a function like: + +:: + + void func(reference int a, reference int b) { + a = 0; + if (b == 0) { ... } + } + +Then if the same variable must not be passed to ``func()``. 
This is
+another case of aliasing, and if the caller calls the function as ``func(x,
+x)``, it's not guaranteed that the ``if`` test will evaluate to true, due
+to the compiler's requirement of no aliasing.
+
+(In the future, ``ispc`` will have the ability to work with unaligned
+memory as well as have a mechanism to indicate that pointers may alias.)
+
+Using ISPC Effectively
+======================
+
+Restructuring Existing Programs to Use ISPC
+-------------------------------------------
+
+``ispc`` is designed to enable you to incorporate
+SPMD parallelism into existing code with minimal modification; features
+like the ability to share memory and data structures between C/C++ and
+``ispc`` code and the ability to directly call back and forth between
+``ispc`` and C/C++ are motivated by this. These features also make it
+easy to incrementally transform a program to use ``ispc``; the most
+computationally-intensive localized parts of the computation can be
+transformed into ``ispc`` code while the remainder of the system is left
+as is.
+
+For a given section of code to be transitioned to run in ``ispc``, the
+next question is how to parallelize the computation. Generally, there will
+be obvious loops inside which a large amount of computation is done ("for
+each ray", "for each pixel", etc.) Mapping these to the SPMD computational
+style is often effective.
+
+Carefully choose how to do the exact mapping of computation to SPMD program
+instances. This choice can impact the mix of gather/scatter memory access
+versus coherent memory access, for example. (See more on this in the
+section `Gather and Scatter`_ below.) This decision can also impact the
+coherence of control flow across the running SPMD program instances, which
+can also have a significant effect on performance; in general, creating
+groups of work that will tend to do similar computation across the SPMD
+program instances improves performance.
+ +Understanding How to Interoperate With the Application's Data +------------------------------------------------------------- + +One of ``ispc``'s key goals is to be able to interoperate with the +application's data, in whatever layout it is stored in. You don't need to +worry about reformatting of data or the overhead of a driver model that +abstracts the data layout. This section illustrates some of the +alternatives with a simple example of computing the length of a large +number of vectors. + +Consider for starters a ``Vector`` data-type, defined in C as: + +:: + + struct Vector { float x, y, z; }; + +We might have (still in C) an array of ``Vector`` s defined like this: + +:: + + Vector vectors[1024]; + +This is called an "array of structures" (AoS) layout. To compute the +lengths of these vectors in parallel, you can write ``ispc`` code like +this: + +:: + + export void length(Vector vectors[1024], uniform float len[]) { + for (uniform int i = 0; i < 1024; i += programCount) { + int index = i+programIndex; + float x = vectors[index].x; + float y = vectors[index].y; + float z = vectors[index].z; + float l = sqrt(x*x + y*y + z*z); + len[index] = l; + } + } + +The ``vectors`` array has been indexed using ``programIndex`` in +order to "peel off" ``programCount`` worth of values to compute the length +of each time through the loop. + +The problem with this implementation is that the indexing into the array of +structures, ``vectors[index].x`` is relatively expensive. On a target +machine that supports four-wide Intel® SSE, this turns into four loads of +single ``float`` values from non-contiguous memory locations, which are +then packed into a four-wide register corresponding to ``float x``. Once the +values are loaded into the local ``x``, ``y``, and ``z`` variables, +SIMD-efficient computation can proceed; getting to that point is +relatively inefficient. + +An alternative would be the "structure of arrays" (SoA) layout. 
In C, the +data would be declared as: + +:: + + float x[1024], y[1024], z[1024]; + +The ``ispc`` code might be: + +:: + + export void length(uniform float x[1024], uniform float y[1024], + uniform float z[1024], uniform float len[]) { + for (uniform int i = 0; i < 1024; i += programCount) { + int index = i+programIndex; + float xx = x[index]; + float yy = y[index]; + float zz = z[index]; + float l = sqrt(xx*xx + yy*yy + zz*zz); + len[index] = l; + } + } + +In this example, the loads into ``xx``, ``yy``, and ``zz`` are single +vector loads of ``programCount`` values into the corresponding registers. +This processing is more efficient than the multiple scalar loads that are +required with the AoS layout above. + +A final alternative is "array of structures of arrays" (AoSoA), a hybrid +between these two. A structure is declared that stores a small number of +``x``, ``y``, and ``z`` values in contiguous memory locations: + +:: + + struct Vector16 { + float x[16], y[16], z[16]; + }; + + +The ``ispc`` code has an outer loop over ``Vector16`` elements and +then an inner loop that peels off values from the element members: + +:: + + #define N_VEC (1024/16) + export void length(Vector16 v[N_VEC], uniform float len[]) { + for (uniform int i = 0; i < N_VEC; ++i) { + for (uniform int j = 0; j < 16; j += programCount) { + int index = j + programIndex; + float x = v[i].x[index]; + float y = v[i].y[index]; + float z = v[i].z[index]; + float l = sqrt(x*x + y*y + z*z); + len[index] = l; + } + } + } + +(This code assumes that ``programCount`` divides 16 equally. See below for +discussion of the more general case.) One advantage of the AoSoA layout is +that the memory accesses to load values are to nearby memory locations, +whereas with SoA, each of the three loads above is to locations separated +by a few thousand bytes. Thus, AoSoA can be more cache friendly. For +structures with many members, this difference can lead to a substantial +improvement. 
+ +``ispc`` can also efficiently process data in AoSoA layout where the inner +array length is less than the machine vector width. For example, consider +doing computation with this AoSoA structure definition on a machine with an +8-wide vector unit (for example, an Intel® AVX target): + +:: + + struct Vector4 { + float x[4], y[4], z[4]; + }; + + +The ``ispc`` code to process this loads elements four at a time from +``Vector4`` instances until it has a full ``programCount`` number of +elements to work with and then proceeds with the computation. + +:: + + #define N_VEC (1024/4) + export void length(Vector4 v[N_VEC], uniform float len[]) { + for (uniform int i = 0; i < N_VEC; i += programCount / 4) { + float x, y, z; + for (uniform int j = 0; j < programCount / 4; ++j) { + if (programIndex >= 4 * j && + programIndex < 4 * (j+1)) { + int index = (programIndex & 0x3); + x = v[i+j].x[index]; + y = v[i+j].y[index]; + z = v[i+j].z[index]; + } + } + float l = sqrt(x*x + y*y + z*z); + len[4*i + programIndex] = l; + } + } + + +Communicating Between SPMD Program Instances +-------------------------------------------- + +The ``programIndex`` built-in variable (see `Mapping Data To Program +Instances`_) can be used to communicate between the set of executing +program instances. Consider the following code, which shows all of the +program instances writing into unique locations in an array. + +:: + + float x = ...; + uniform float allX[programCount]; + allX[programIndex] = x; + +In this code, a program instance that reads ``allX[0]`` finds the value of +``x`` that was computed by the first of the running program instances, and +so forth. Program instances can communicate with their neighbor instances +with indexing like ``allX[(programIndex+1)%programCount]``. + + +Gather and Scatter +------------------ + +The CPU is a poor fit for SPMD execution in some ways, the worst of which +is handling of general memory reads and writes from SPMD program instances. 
+For example, in a "simple" array index: + +:: + + int i = ....; + uniform float x[10] = { ... }; + float f = x[i]; + +Since the index ``i`` is a varying value, the various SPMD program +instances will in general be reading different locations in the array +``x``. Because the CPU doesn't have a gather instruction, the ``ispc`` +compiler has to serialize these memory reads, performing a separate memory +load for each running program instance, packing the result into ``f``. +(And the analogous case would happen for a write into ``x[i]``.) + +In many cases, gathers like these are unavoidable; the running program +instances just need to access incoherent memory locations. However, if the +array index ``i`` could actually be declared and used as a ``uniform`` +variable, the resulting array index is substantially more +efficient. This is another case where using ``uniform`` whenever applicable +is of benefit. + +In some cases, the ``ispc`` compiler is able to deduce that the memory +locations accessed are either all the same or are uniform. For example, +given: + +:: + + uniform int x = ...; + int y = x; + return array[y]; + +The compiler is able to determine that all of the program instances are +loading from the same location, even though ``y`` is not a ``uniform`` +variable. In this case, the compiler will transform this load to a regular vector +load, rather than a general gather. + +Sometimes the running program instances will access a +linear sequence of memory locations; this happens most frequently when +array indexing is done based on the built-in ``programIndex`` variable. In +many of these cases, the compiler is also able to detect this case and then +do a vector load. For example, given: + +:: + + uniform int x = ...; + return array[2*x + programIndex]; + +A regular vector load is done from array, starting at offset ``2*x``. + +Low-level Vector Tricks +----------------------- + +Many low-level Intel® SSE coding constructs can be implemented in ``ispc`` +code. 
For example, the following code efficiently reverses the sign of the +given values. + +:: + + float flipsign(float a) { + unsigned int i = intbits(a); + i ^= 0x80000000; + return floatbits(i); + } + +This code compiles down to a single XOR instruction. + +Debugging +--------- + +Support for debugging in ``ispc`` is in progress. On Linux\* and Mac +OS\*, the ``-g`` command-line flag can be supplied to the compiler, +which causes it to generate debugging symbols. Running ``ispc`` programs +in the debugger, setting breakpoints, printing out variables and the like +all generally works, though there is occasional unexpected behavior. + +Another option for debugging (the only current option on Windows\*) is +to use the ``print`` statement for ``printf()`` +style debugging. You can also use the ability to call back to +application code at particular points in the program, passing a set of +variable values to be logged or otherwise analyzed from there. + +The "Fast math" Option +---------------------- + +``ispc`` has a ``--fast-math`` command-line flag that enables a number of +optimizations that may be undesirable in code where numerical precision is +critically important. For many graphics applications, the +approximations may be acceptable. The following two optimizations are +performed when ``--fast-math`` is used. By default, the ``--fast-math`` +flag is off. + +* Expressions like ``x / y``, where ``y`` is a compile-time constant, are + transformed to ``x * (1./y)``, where the inverse value of ``y`` is + precomputed at compile time. + +* Expressions like ``x / y``, where ``y`` is not a compile-time constant, + are transformed to ``x * rcp(y)``, where ``rcp()`` maps to the + approximate reciprocal instruction from the standard library. + + +"Inline" Aggressively +--------------------- + +Inlining functions aggressively is generally beneficial for performance +with ``ispc``. 
Definitely use the ``inline`` qualifier for any short +functions (a few lines long), and experiment with it for longer functions. + +Small Performance Tricks +------------------------ + +Performance is slightly improved by declaring variables at the same block +scope where they are first used. For example, in code like the +following, if the lifetime of ``foo`` is only within the scope of the +``if`` clause, write the code like this: + +:: + + float func() { + .... + if (x < y) { + float foo; + ... use foo ... + } + } + +Try not to write code as: + +:: + + float func() { + float foo; + .... + if (x < y) { + ... use foo ... + } + } + +Doing so can reduce the number of masked store instructions that the +compiler needs to generate. + +Instrumenting Your ISPC Programs +-------------------------------- + +``ispc`` has an optional instrumentation feature that can help you +understand performance issues. If a program is compiled using the +``--instrument`` flag, the compiler emits calls to a function with the +following signature at various points in the program (for +example, at interesting points in the control flow, when scatters or +gathers happen.) + +:: + + extern "C" { + void ISPCInstrument(const char *fn, const char *note, + int line, int mask); + } + +This function is passed the file name of the ``ispc`` file running, a short +note indicating what is happening, the line number in the source file, and +the current mask of active SPMD program lanes. You must provide an +implementation of this function and link it in with your application. + +For example, when the ``ispc`` program runs, this function might be called +as follows: + +:: + + ISPCInstrument("foo.ispc", "function entry", 55, 0xf); + +This call indicates that the currently executing program has just +entered the function defined at line 55 of the file ``foo.ispc``, with a +mask of all lanes currently executing (assuming a four-wide Intel® SSE +target machine). 
+ +For a fuller example of the utility of this functionality, see +``examples/aobench_instrumented`` in the ``ispc`` distribution. This +example includes an implementation of the ``ISPCInstrument`` function that +collects aggregate data about the program's execution behavior. + +When running this example, you will want to direct the ``ao`` executable +to generate a low resolution image, because the instrumentation adds +substantial execution overhead. For example: + +:: + + % ./ao 1 32 32 + +After the ``ao`` program exits, a summary report along the following lines +will be printed. In the first few lines, you can see how many times a few +functions were called, and the average percentage of SIMD lanes that were +active upon function entry. + +:: + + ao.ispc(0067) - function entry: 342424 calls (0 / 0.00% all off!), 95.86% active lanes + ao.ispc(0067) - return: uniform control flow: 342424 calls (0 / 0.00% all off!), 95.86% active lanes + ao.ispc(0071) - function entry: 1122 calls (0 / 0.00% all off!), 97.33% active lanes + ao.ispc(0075) - return: uniform control flow: 1122 calls (0 / 0.00% all off!), 97.33% active lanes + ao.ispc(0079) - function entry: 10072 calls (0 / 0.00% all off!), 45.09% active lanes + ao.ispc(0088) - function entry: 36928 calls (0 / 0.00% all off!), 97.40% active lanes + ... + +Disclaimer and Legal Information +================================ + +INFORMATION IN THIS DOCUMENT IS PROVIDED IN CONNECTION WITH INTEL(R) PRODUCTS. +NO LICENSE, EXPRESS OR IMPLIED, BY ESTOPPEL OR OTHERWISE, TO ANY INTELLECTUAL +PROPERTY RIGHTS IS GRANTED BY THIS DOCUMENT. EXCEPT AS PROVIDED IN INTEL'S TERMS +AND CONDITIONS OF SALE FOR SUCH PRODUCTS, INTEL ASSUMES NO LIABILITY WHATSOEVER, +AND INTEL DISCLAIMS ANY EXPRESS OR IMPLIED WARRANTY, RELATING TO SALE AND/OR USE +OF INTEL PRODUCTS INCLUDING LIABILITY OR WARRANTIES RELATING TO FITNESS FOR A +PARTICULAR PURPOSE, MERCHANTABILITY, OR INFRINGEMENT OF ANY PATENT, COPYRIGHT +OR OTHER INTELLECTUAL PROPERTY RIGHT. 
+ +UNLESS OTHERWISE AGREED IN WRITING BY INTEL, THE INTEL PRODUCTS ARE NOT DESIGNED +NOR INTENDED FOR ANY APPLICATION IN WHICH THE FAILURE OF THE INTEL PRODUCT COULD +CREATE A SITUATION WHERE PERSONAL INJURY OR DEATH MAY OCCUR. + +Intel may make changes to specifications and product descriptions at any time, +without notice. Designers must not rely on the absence or characteristics of any +features or instructions marked "reserved" or "undefined." Intel reserves these +for future definition and shall have no responsibility whatsoever for conflicts +or incompatibilities arising from future changes to them. The information here +is subject to change without notice. Do not finalize a design with this +information. + +The products described in this document may contain design defects or errors +known as errata which may cause the product to deviate from published +specifications. Current characterized errata are available on request. + +Contact your local Intel sales office or your distributor to obtain the latest +specifications and before placing your product order. + +Copies of documents which have an order number and are referenced in this +document, or other Intel literature, may be obtained by calling 1-800-548-4725, +or by visiting Intel's Web Site. + +Intel processor numbers are not a measure of performance. Processor numbers +differentiate features within each processor family, not across different +processor families. See http://www.intel.com/products/processor_number for +details. + +BunnyPeople, Celeron, Celeron Inside, Centrino, Centrino Atom, +Centrino Atom Inside, Centrino Inside, Centrino logo, Core Inside, FlashFile, +i960, InstantIP, Intel, Intel logo, Intel386, Intel486, IntelDX2, IntelDX4, +IntelSX2, Intel Atom, Intel Atom Inside, Intel Core, Intel Inside, +Intel Inside logo, Intel. Leap ahead., Intel. Leap ahead. 
logo, Intel NetBurst, +Intel NetMerge, Intel NetStructure, Intel SingleDriver, Intel SpeedStep, +Intel StrataFlash, Intel Viiv, Intel vPro, Intel XScale, Itanium, +Itanium Inside, MCS, MMX, Oplus, OverDrive, PDCharm, Pentium, Pentium Inside, +skoool, Sound Mark, The Journey Inside, Viiv Inside, vPro Inside, VTune, Xeon, +and Xeon Inside are trademarks of Intel Corporation in the U.S. and other +countries. + +* Other names and brands may be claimed as the property of others. + +Copyright(C) 2011, Intel Corporation. All rights reserved. + + +Optimization Notice +=================== + +Intel compilers, associated libraries and associated development tools may +include or utilize options that optimize for instruction sets that are +available in both Intel and non-Intel microprocessors (for example SIMD +instruction sets), but do not optimize equally for non-Intel +microprocessors. In addition, certain compiler options for Intel +compilers, including some that are not specific to Intel +micro-architecture, are reserved for Intel microprocessors. For a detailed +description of Intel compiler options, including the instruction sets and +specific microprocessors they implicate, please refer to the "Intel +Compiler User and Reference Guides" under "Compiler Options." Many library +routines that are part of Intel compiler products are more highly optimized +for Intel microprocessors than for other microprocessors. While the +compilers and libraries in Intel compiler products offer optimizations for +both Intel and Intel-compatible microprocessors, depending on the options +you select, your code and other factors, you likely will get extra +performance on Intel microprocessors. + +Intel compilers, associated libraries and associated development tools may +or may not optimize to the same degree for non-Intel microprocessors for +optimizations that are not unique to Intel microprocessors. 
These +optimizations include Intel® Streaming SIMD Extensions 2 (Intel® SSE2), +Intel® Streaming SIMD Extensions 3 (Intel® SSE3), and Supplemental +Streaming SIMD Extensions 3 (Intel SSSE3) instruction sets and other +optimizations. Intel does not guarantee the availability, functionality, +or effectiveness of any optimization on microprocessors not manufactured by +Intel. Microprocessor-dependent optimizations in this product are intended +for use with Intel microprocessors. + +While Intel believes our compilers and libraries are excellent choices to +assist in obtaining the best performance on Intel and non-Intel +microprocessors, Intel recommends that you evaluate other compilers and +libraries to determine which best meet your requirements. We hope to win +your business by striving to offer the best performance of any compiler or +library; please let us know if you find we do not. diff --git a/doxygen.cfg b/doxygen.cfg new file mode 100644 index 00000000..9f79b909 --- /dev/null +++ b/doxygen.cfg @@ -0,0 +1,1685 @@ +# Doxyfile 1.7.2 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (" "). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. 
See +# http://www.gnu.org/software/libiconv for the list of possible encodings. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded +# by quotes) that should identify the project. + +PROJECT_NAME = "Intel SPMD Program Compiler" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. +# This could be handy for archiving the generated documentation or +# if some version control system is used. + +PROJECT_NUMBER = 1.0 + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) +# base path where the generated documentation will be put. +# If a relative path is entered, it will be relative to the location +# where doxygen was started. If left blank the current directory will be used. + +OUTPUT_DIRECTORY = docs/doxygen + +# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create +# 4096 sub-directories (in 2 levels) under the output directory of each output +# format and will distribute the generated files over these directories. +# Enabling this option can be useful when feeding doxygen a huge amount of +# source files, where putting all generated files in the same directory would +# otherwise cause performance problems for the file system. + +CREATE_SUBDIRS = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. 
+# The default language is English, other supported languages are: +# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, +# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, +# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English +# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, +# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrilic, Slovak, +# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will +# include brief member descriptions after the members that are listed in +# the file and class documentation (similar to JavaDoc). +# Set to NO to disable this. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend +# the brief description of a member or function before the detailed description. +# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator +# that is used to form the text in various listings. Each string +# in this list, if found as the leading text of the brief description, will be +# stripped from the text and the result after processing the whole list, is +# used as the annotated text. Otherwise, the brief description is used as-is. +# If left blank, the following values are used ("$name" is automatically +# replaced with the name of the entity): "The $name class" "The $name widget" +# "The $name file" "is" "provides" "specifies" "contains" +# "represents" "a" "an" "the" + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# Doxygen will generate a detailed section even if there is only a brief +# description. 
+ +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full +# path before files name in the file list and in the header files. If set +# to NO the shortest path that makes the file name unique will be used. + +FULL_PATH_NAMES = NO + +# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag +# can be used to strip a user-defined part of the path. Stripping is +# only done if one of the specified strings matches the left-hand part of +# the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the +# path to strip. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of +# the path mentioned in the documentation of a class, which tells +# the reader which header file to include in order to use a class. +# If left blank only the name of the header file containing the class +# definition is used. Otherwise one should specify the include paths that +# are normally passed to the compiler using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter +# (but less readable) file names. This can be useful if your file system +# doesn't support long names like on DOS, Mac, or CD-ROM. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen +# will interpret the first line (until the first dot) of a JavaDoc-style +# comment as the brief description. 
If set to NO, the JavaDoc +# comments will behave just like regular Qt-style comments +# (thus requiring an explicit @brief command for a brief description.) + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then Doxygen will +# interpret the first line (until the first dot) of a Qt-style +# comment as the brief description. If set to NO, the comments +# will behave just like regular Qt-style comments (thus requiring +# an explicit \brief command for a brief description.) + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen +# treat a multi-line C++ special comment block (i.e. a block of //! or /// +# comments) as a brief description. This used to be the default behaviour. +# The new default is to treat a multi-line C++ comment block as a detailed +# description. Set this tag to YES if you prefer the old behaviour instead. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented +# member inherits the documentation from any documented member that it +# re-implements. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce +# a new page for each member. If set to NO, the documentation of a member will +# be part of the file/class/namespace that contains it. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. +# Doxygen uses this value to replace tabs by spaces in code fragments. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that acts +# as commands in the documentation. An alias has the form "name=value". +# For example adding "sideeffect=\par Side Effects:\n" will allow you to +# put the command \sideeffect (or @sideeffect) in the documentation, which +# will result in a user-defined paragraph with heading "Side Effects:". +# You can put \n's in the value part of an alias to insert newlines. 
+ +ALIASES = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C +# sources only. Doxygen will then generate output that is more tailored for C. +# For instance, some of the names that are used will be different. The list +# of all members will be omitted, etc. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java +# sources only. Doxygen will then generate output that is more tailored for +# Java. For instance, namespaces will be presented as packages, qualified +# scopes will look different, etc. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources only. Doxygen will then generate output that is more tailored for +# Fortran. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for +# VHDL. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given extension. +# Doxygen has a built-in mapping, but you can override or extend it using this +# tag. The format is ext=language, where ext is a file extension, and language +# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C, +# C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, C++. For instance to make +# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C +# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions +# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If you use STL classes (i.e. std::string, std::vector, etc.) 
but do not want +# to include (a tag file for) the STL sources as input, then you should +# set this tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. +# func(std::string) {}). This also makes the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. +# Doxygen will parse them like normal C++ but will assume all classes use public +# instead of private inheritance when no explicit protection keyword is present. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate getter +# and setter methods for a property. Setting this option to YES (the default) +# will make doxygen replace the get and set methods by a property in the +# documentation. This will only work if the methods are indeed getting or +# setting a simple type. If this is not the case, or you want to show the +# methods anyway, you should set this option to NO. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES, then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. + +DISTRIBUTE_GROUP_DOC = NO + +# Set the SUBGROUPING tag to YES (the default) to allow class member groups of +# the same type (for instance a group of public functions) to be put as a +# subgroup of that type (e.g. under the Public Functions section). Set it to +# NO to prevent subgrouping. Alternatively, this can be done per class using +# the \nosubgrouping command. 
+ +SUBGROUPING = YES + +# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum +# is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically +# be useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. + +TYPEDEF_HIDES_STRUCT = NO + +# The SYMBOL_CACHE_SIZE determines the size of the internal cache use to +# determine which symbols to keep in memory and which to flush to disk. +# When the cache is full, less often used symbols will be written to disk. +# For small to medium size projects (<1000 input files) the default value is +# probably good enough. For larger projects a too small cache size can cause +# doxygen to be busy swapping symbols to and from disk most of the time +# causing a significant performance penality. +# If the system has enough physical memory increasing the cache will improve the +# performance by keeping more symbols in memory. Note that the value works on +# a logarithmic scale so increasing the size by one will roughly double the +# memory usage. The cache size is given by this formula: +# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, +# corresponding to a cache size of 2^16 = 65536 symbols + +SYMBOL_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in +# documentation are documented, even if no documentation was available. 
+# Private class members and static file members will be hidden unless +# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES + +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES all private members of a class +# will be included in the documentation. + +EXTRACT_PRIVATE = YES + +# If the EXTRACT_STATIC tag is set to YES all static members of a file +# will be included in the documentation. + +EXTRACT_STATIC = YES + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) +# defined locally in source files will be included in the documentation. +# If set to NO only classes defined in header files are included. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. When set to YES local +# methods, which are defined in the implementation section but not in +# the interface are included in the documentation. +# If set to NO (the default) only methods in the interface are included. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base +# name of the file that contains the anonymous namespace. By default +# anonymous namespaces are hidden. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all +# undocumented members of documented classes, files or namespaces. +# If set to NO (the default) these members will be included in the +# various overviews, but no documentation section is generated. +# This option has no effect if EXTRACT_ALL is enabled. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. +# If set to NO (the default) these classes will be included in the various +# overviews. This option has no effect if EXTRACT_ALL is enabled. 
+ +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all +# friend (class|struct|union) declarations. +# If set to NO (the default) these declarations will be included in the +# documentation. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any +# documentation blocks found inside the body of a function. +# If set to NO (the default) these blocks will be appended to the +# function's detailed documentation block. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation +# that is typed after a \internal command is included. If the tag is set +# to NO (the default) then the documentation will be excluded. +# Set it to YES to include the internal documentation. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate +# file names in lower-case letters. If set to YES upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen +# will show members with their full class and namespace scopes in the +# documentation. If set to YES the scope will be hidden. + +HIDE_SCOPE_NAMES = NO + +# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen +# will put a list of the files that are included by a file in the documentation +# of that file. + +SHOW_INCLUDE_FILES = YES + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen +# will list include files with double quotes in the documentation +# rather than with sharp brackets. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] +# is inserted in the documentation for inline members. 
+ +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen +# will sort the (detailed) documentation of file and class members +# alphabetically by member name. If set to NO the members will appear in +# declaration order. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the +# brief documentation of file, namespace and class members alphabetically +# by member name. If set to NO (the default) the members will appear in +# declaration order. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen +# will sort the (brief and detailed) documentation of class members so that +# constructors and destructors are listed first. If set to NO (the default) +# the constructors will appear in the respective orders defined by +# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. +# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO +# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the +# hierarchy of group names into alphabetical order. If set to NO (the default) +# the group names will appear in their defined order. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be +# sorted by fully-qualified names, including namespaces. If set to +# NO (the default), the class list will be sorted only by class name, +# not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the +# alphabetical list. + +SORT_BY_SCOPE_NAME = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or +# disable (NO) the todo list. This list is created by putting \todo +# commands in the documentation. 
+ +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or +# disable (NO) the test list. This list is created by putting \test +# commands in the documentation. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or +# disable (NO) the bug list. This list is created by putting \bug +# commands in the documentation. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or +# disable (NO) the deprecated list. This list is created by putting +# \deprecated commands in the documentation. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional +# documentation sections, marked by \if sectionname ... \endif. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines +# the initial value of a variable or macro consists of for it to appear in +# the documentation. If the initializer consists of more lines than specified +# here it will be hidden. Use a value of 0 to hide initializers completely. +# The appearance of the initializer of individual variables and macros in the +# documentation can be controlled using \showinitializer or \hideinitializer +# command in the documentation regardless of this setting. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated +# at the bottom of the documentation of classes and structs. If set to YES the +# list will mention the files that were used to generate the documentation. + +SHOW_USED_FILES = YES + +# If the sources in your project are distributed over multiple directories +# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy +# in the documentation. The default is NO. + +SHOW_DIRECTORIES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. 
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option.
+# You can optionally specify a file name after the option, if omitted
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used. 
+ +WARNINGS = YES + +# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings +# for undocumented members. If EXTRACT_ALL is set to YES then this flag will +# automatically be disabled. + +WARN_IF_UNDOCUMENTED = NO + +# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some +# parameters in a documented function, or documenting parameters that +# don't exist or using markup commands wrongly. + +WARN_IF_DOC_ERROR = YES + +# The WARN_NO_PARAMDOC option can be enabled to get warnings for +# functions that are documented, but have no documentation for their parameters +# or return value. If set to NO (the default) doxygen will only warn about +# wrong or incomplete parameter documentation, but not about the absence of +# documentation. + +WARN_NO_PARAMDOC = NO + +# The WARN_FORMAT tag determines the format of the warning messages that +# doxygen can produce. The string should contain the $file, $line, and $text +# tags, which will be replaced by the file and line number from which the +# warning originated and the warning text. Optionally the format may contain +# $version, which will be replaced by the version of the file (if it could +# be obtained via FILE_VERSION_FILTER) + +WARN_FORMAT = + +# The WARN_LOGFILE tag can be used to specify a file to which warning +# and error messages should be written. If left blank the output is written +# to stderr. + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag can be used to specify the files and/or directories that contain +# documented source files. You may enter file names like "myfile.cpp" or +# directories like "/usr/src/myproject". Separate the files or directories +# with spaces. 
+
+INPUT = builtins.h \
+ ctx.h \
+ decl.h \
+ expr.h \
+ gatherbuf.h \
+ ispc.h \
+ llvmutil.h \
+ module.h \
+ opt.h \
+ stmt.h \
+ sym.h \
+ type.h \
+ util.h \
+ builtins.cpp \
+ ctx.cpp \
+ decl.cpp \
+ expr.cpp \
+ gatherbuf.cpp \
+ ispc.cpp \
+ llvmutil.cpp \
+ main.cpp \
+ module.cpp \
+ opt.cpp \
+ stmt.cpp \
+ sym.cpp \
+ type.cpp \
+ util.cpp \
+ parse.yy \
+ lex.ll \
+ stdlib-c.c
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
+# *.f90 *.f *.vhd *.vhdl
+
+FILE_PATTERNS =
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should
+# be excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix filesystem feature) are excluded
+# from the input. 
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH = ./examples
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE = YES
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. 
Doxygen will then use the output that the filter program writes +# to standard output. +# If FILTER_PATTERNS is specified, this tag will be +# ignored. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. +# Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. +# The filters are a list of the form: +# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further +# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER +# is applied to all files. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will be used to filter the input files when producing source +# files to browse (i.e. when SOURCE_BROWSER is set to YES). + +FILTER_SOURCE_FILES = NO + +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will +# be generated. Documented entities will be cross-referenced with these sources. +# Note: To get rid of all source code in the generated output, make sure also +# VERBATIM_HEADERS is set to NO. + +SOURCE_BROWSER = YES + +# Setting the INLINE_SOURCES tag to YES will include the body +# of functions and classes directly in the documentation. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct +# doxygen to hide any special comment blocks from generated source code +# fragments. Normal C and C++ comments will always remain visible. + +STRIP_CODE_COMMENTS = NO + +# If the REFERENCED_BY_RELATION tag is set to YES +# then for each documented function all documented +# functions referencing it will be listed. 
+ +REFERENCED_BY_RELATION = YES + +# If the REFERENCES_RELATION tag is set to YES +# then for each documented function all documented entities +# called/used by that function will be listed. + +REFERENCES_RELATION = YES + +# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) +# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from +# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will +# link to the source code. +# Otherwise they will link to the documentation. + +REFERENCES_LINK_SOURCE = YES + +# If the USE_HTAGS tag is set to YES then the references to source code +# will point to the HTML generated by the htags(1) tool instead of doxygen +# built-in source browser. The htags tool is part of GNU's global source +# tagging system (see http://www.gnu.org/software/global/global.html). You +# will need version 4.8.6 or higher. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen +# will generate a verbatim copy of the header file for each class for +# which an include is specified. Set to NO to disable this. + +VERBATIM_HEADERS = YES + +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index +# of all compounds will be generated. Enable this if the project +# contains a lot of classes, structs, unions or interfaces. + +ALPHABETICAL_INDEX = YES + +# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then +# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns +# in which this list will be split (can be a number in the range [1..20]) + +COLS_IN_ALPHA_INDEX = 4 + +# In case all classes in a project start with a common prefix, all +# classes will be put under the same header in the alphabetical index. 
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that +# should be ignored while generating the index headers. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES (the default) Doxygen will +# generate HTML output. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `html' will be used as the default path. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for +# each generated HTML page (for example: .htm,.php,.asp). If it is left blank +# doxygen will generate files with .html extension. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a personal HTML header for +# each generated HTML page. If it is left blank doxygen will generate a +# standard header. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a personal HTML footer for +# each generated HTML page. If it is left blank doxygen will generate a +# standard footer. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading +# style sheet that is used by each HTML page. It can be used to +# fine-tune the look of the HTML output. If the tag is left blank doxygen +# will generate a default style sheet. Note that doxygen will try to copy +# the style sheet file to the HTML output directory, so don't put your own +# stylesheet in the HTML output directory as well, or it will be erased! + +HTML_STYLESHEET = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. +# Doxygen will adjust the colors in the stylesheet and background images +# according to this color. 
Hue is specified as an angle on a colorwheel, +# see http://en.wikipedia.org/wiki/Hue for more information. +# For instance the value 0 represents red, 60 is yellow, 120 is green, +# 180 is cyan, 240 is blue, 300 purple, and 360 is red again. +# The allowed range is 0 to 359. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of +# the colors in the HTML output. For a value of 0 the output will use +# grayscales only. A value of 255 will produce the most vivid colors. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to +# the luminance component of the colors in the HTML output. Values below +# 100 gradually make the output lighter, whereas values above 100 make +# the output darker. The value divided by 100 is the actual gamma applied, +# so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2, +# and 100 does not change the gamma. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting +# this to NO can help when comparing the output of multiple runs. + +HTML_TIMESTAMP = YES + +# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, +# files or namespaces will be aligned in HTML using tables. If set to +# NO a bullet list will be used. + +HTML_ALIGN_MEMBERS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. For this to work a browser that supports +# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox +# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari). 
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+
+GENERATE_DOCSET = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+
+DOCSET_PUBLISHER_ID = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+
+DOCSET_PUBLISHER_NAME = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation. 
+ +GENERATE_HTMLHELP = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can +# be used to specify the file name of the resulting .chm file. You +# can add a path in front of the file if the result should not be +# written to the html output directory. + +CHM_FILE = + +# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can +# be used to specify the location (absolute path including file name) of +# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run +# the HTML help compiler on the generated index.hhp. + +HHC_LOCATION = + +# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag +# controls if a separate .chi index file is generated (YES) or that +# it should be included in the master .chm file (NO). + +GENERATE_CHI = NO + +# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING +# is used to encode HtmlHelp index (hhk), content (hhc) and project file +# content. + +CHM_INDEX_ENCODING = + +# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag +# controls whether a binary table of contents is generated (YES) or a +# normal table of contents (NO) in the .chm file. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members +# to the contents of the HTML help documentation and to the tree view. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated +# that can be used as input for Qt's qhelpgenerator to generate a +# Qt Compressed Help (.qch) of the generated HTML documentation. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can +# be used to specify the file name of the resulting .qch file. +# The path specified is relative to the HTML output folder. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating +# Qt Help Project output. 
For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
+# add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see
+#
+# Qt Help Project / Custom Filters.
+
+QHP_CUST_FILTER_ATTRS =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's
+# filter section matches.
+#
+# Qt Help Project / Filter Attributes.
+
+QHP_SECT_FILTER_ATTRS =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+# will be generated, which together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files needs to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name. 
+ +ECLIPSE_DOC_ID = org.doxygen.Project + +# The DISABLE_INDEX tag can be used to turn on/off the condensed index at +# top of each HTML page. The value NO (the default) enables the index and +# the value YES disables it. + +DISABLE_INDEX = NO + +# This tag can be used to set the number of enum values (range [0,1..20]) +# that doxygen will group on one line in the generated HTML documentation. +# Note that a value of 0 will completely suppress the enum values from appearing in the overview section. + +ENUM_VALUES_PER_LINE = 4 + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. +# If the tag value is set to YES, a side panel will be generated +# containing a tree-like index structure (just like the one that +# is generated for HTML Help). For this to work a browser that supports +# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). +# Windows users are probably better off using the HTML help feature. + +GENERATE_TREEVIEW = NO + +# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories, +# and Class Hierarchy pages using a tree view instead of an ordered list. + +USE_INLINE_TREES = NO + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be +# used to set the initial width (in pixels) of the frame in which the tree +# is shown. + +TREEVIEW_WIDTH = 250 + +# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open +# links to external symbols imported via tag files in a separate window. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of Latex formulas included +# as images in the HTML documentation. The default is 10. Note that +# when you change the font size after a successful doxygen run you need +# to manually remove any form_*.png images from the HTML output directory +# to force them to be regenerated. 
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are
+# not supported properly for IE 6.0, but are supported on all modern browsers.
+# Note that when changing this option you need to delete any form_*.png files
+# in the HTML output before the changes have effect.
+
+FORMULA_TRANSPARENT = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
+# (see http://www.mathjax.org) which uses client side Javascript for the
+# rendering instead of using prerendered bitmaps. Use this if you do not
+# have LaTeX installed or if you want the formulas to look prettier in the
+# HTML output. When enabled you also need to install MathJax separately and
+# configure the path to it using the MATHJAX_RELPATH option.
+
+USE_MATHJAX = NO
+
+# When MathJax is enabled you need to specify the location relative to the
+# HTML output directory using the MATHJAX_RELPATH option. The destination
+# directory should contain the MathJax.js script. For instance, if the mathjax
+# directory is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the
+# mathjax.org site, so you can quickly see the result without installing
+# MathJax, but it is strongly recommended to install a local copy of MathJax
+# before deployment.
+
+MATHJAX_RELPATH = http://www.mathjax.org/mathjax
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the javascript based search engine
+# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution. 
+ +SEARCHENGINE = NO + +# When the SERVER_BASED_SEARCH tag is enabled the search engine will be +# implemented using a PHP enabled web server instead of at the web client +# using Javascript. Doxygen will generate the search PHP script and index +# file to put on the web server. The advantage of the server +# based approach is that it scales better to large projects and allows +# full text search. The disadvantages are that it is more difficult to setup +# and does not have live searching capabilities. + +SERVER_BASED_SEARCH = NO + +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- + +# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will +# generate Latex output. + +GENERATE_LATEX = NO + +# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `latex' will be used as the default path. + +LATEX_OUTPUT = + +# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be +# invoked. If left blank `latex' will be used as the default command name. +# Note that when enabling USE_PDFLATEX this option is only used for +# generating bitmaps for formulas in the HTML output, but not in the +# Makefile that is written to the output directory. + +LATEX_CMD_NAME = latex + +# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to +# generate index for LaTeX. If left blank `makeindex' will be used as the +# default command name. + +MAKEINDEX_CMD_NAME = makeindex + +# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact +# LaTeX documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_LATEX = NO + +# The PAPER_TYPE tag can be used to set the paper type that is used +# by the printer. 
Possible values are: a4, letter, legal and +# executive. If left blank a4wide will be used. + +PAPER_TYPE = letter + +# The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX +# packages that should be included in the LaTeX output. + +EXTRA_PACKAGES = + +# The LATEX_HEADER tag can be used to specify a personal LaTeX header for +# the generated latex document. The header should contain everything until +# the first chapter. If it is left blank doxygen will generate a +# standard header. Notice: only use this tag if you know what you are doing! + +LATEX_HEADER = + +# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated +# is prepared for conversion to pdf (using ps2pdf). The pdf file will +# contain links (just like the HTML output) instead of page references +# This makes the output suitable for online browsing using a pdf viewer. + +PDF_HYPERLINKS = NO + +# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of +# plain latex in the generated Makefile. Set this option to YES to get a +# higher quality PDF documentation. + +USE_PDFLATEX = NO + +# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. +# command to the generated LaTeX files. This will instruct LaTeX to keep +# running if errors occur, instead of asking the user for help. +# This option is also used when generating formulas in HTML. + +LATEX_BATCHMODE = NO + +# If LATEX_HIDE_INDICES is set to YES then doxygen will not +# include the index chapters (such as File Index, Compound Index, etc.) +# in the output. + +LATEX_HIDE_INDICES = NO + +# If LATEX_SOURCE_CODE is set to YES then doxygen will include +# source code with syntax highlighting in the LaTeX output. +# Note that which sources are shown also depends on other settings +# such as SOURCE_BROWSER. 
+ +LATEX_SOURCE_CODE = NO + +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- + +# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output +# The RTF output is optimized for Word 97 and may not look very pretty with +# other RTF readers or editors. + +GENERATE_RTF = NO + +# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `rtf' will be used as the default path. + +RTF_OUTPUT = + +# If the COMPACT_RTF tag is set to YES Doxygen generates more compact +# RTF documents. This may be useful for small projects and may help to +# save some trees in general. + +COMPACT_RTF = NO + +# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated +# will contain hyperlink fields. The RTF file will +# contain links (just like the HTML output) instead of page references. +# This makes the output suitable for online browsing using WORD or other +# programs which support those fields. +# Note: wordpad (write) and others do not support links. + +RTF_HYPERLINKS = NO + +# Load stylesheet definitions from file. Syntax is similar to doxygen's +# config file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. + +RTF_STYLESHEET_FILE = + +# Set optional variables used in the generation of an rtf document. +# Syntax is similar to doxygen's config file. 
+ +RTF_EXTENSIONS_FILE = + +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- + +# If the GENERATE_MAN tag is set to YES (the default) Doxygen will +# generate man pages + +GENERATE_MAN = NO + +# The MAN_OUTPUT tag is used to specify where the man pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `man' will be used as the default path. + +MAN_OUTPUT = + +# The MAN_EXTENSION tag determines the extension that is added to +# the generated man pages (default is the subroutine's section .3) + +MAN_EXTENSION = + +# If the MAN_LINKS tag is set to YES and Doxygen generates man output, +# then it will generate one additional man file for each entity +# documented in the real man page(s). These additional files +# only source the real man page, but without them the man command +# would be unable to find the correct page. The default is NO. + +MAN_LINKS = NO + +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- + +# If the GENERATE_XML tag is set to YES Doxygen will +# generate an XML file that captures the structure of +# the code including all documentation. + +GENERATE_XML = NO + +# The XML_OUTPUT tag is used to specify where the XML pages will be put. +# If a relative path is entered the value of OUTPUT_DIRECTORY will be +# put in front of it. If left blank `xml' will be used as the default path. + +XML_OUTPUT = xml + +# The XML_SCHEMA tag can be used to specify an XML schema, +# which can be used by a validating XML parser to check the +# syntax of the XML files. 
+ +XML_SCHEMA = + +# The XML_DTD tag can be used to specify an XML DTD, +# which can be used by a validating XML parser to check the +# syntax of the XML files. + +XML_DTD = + +# If the XML_PROGRAMLISTING tag is set to YES Doxygen will +# dump the program listings (including syntax highlighting +# and cross-referencing information) to the XML output. Note that +# enabling this will significantly increase the size of the XML output. + +XML_PROGRAMLISTING = YES + +#--------------------------------------------------------------------------- +# configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- + +# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will +# generate an AutoGen Definitions (see autogen.sf.net) file +# that captures the structure of the code including all +# documentation. Note that this feature is still experimental +# and incomplete at the moment. + +GENERATE_AUTOGEN_DEF = NO + +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- + +# If the GENERATE_PERLMOD tag is set to YES Doxygen will +# generate a Perl module file that captures the structure of +# the code including all documentation. Note that this +# feature is still experimental and incomplete at the +# moment. + +GENERATE_PERLMOD = NO + +# If the PERLMOD_LATEX tag is set to YES Doxygen will generate +# the necessary Makefile rules, Perl scripts and LaTeX code to be able +# to generate PDF and DVI output from the Perl module output. + +PERLMOD_LATEX = NO + +# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be +# nicely formatted so it can be parsed by a human reader. +# This is useful +# if you want to understand what is going on. 
+# On the other hand, if this +# tag is set to NO the size of the Perl module output will be much smaller +# and Perl will parse it just the same. + +PERLMOD_PRETTY = YES + +# The names of the make variables in the generated doxyrules.make file +# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. +# This is useful so different doxyrules.make files included by the same +# Makefile don't overwrite each other's variables. + +PERLMOD_MAKEVAR_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- + +# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will +# evaluate all C-preprocessor directives found in the sources and include +# files. + +ENABLE_PREPROCESSING = YES + +# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro +# names in the source code. If set to NO (the default) only conditional +# compilation will be performed. Macro expansion can be done in a controlled +# way by setting EXPAND_ONLY_PREDEF to YES. + +MACRO_EXPANSION = NO + +# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES +# then the macro expansion is limited to the macros specified with the +# PREDEFINED and EXPAND_AS_DEFINED tags. + +EXPAND_ONLY_PREDEF = NO + +# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files +# in the INCLUDE_PATH (see below) will be search if a #include is found. + +SEARCH_INCLUDES = YES + +# The INCLUDE_PATH tag can be used to specify one or more directories that +# contain include files that are not input files but should be processed by +# the preprocessor. + +INCLUDE_PATH = /usr/include + +# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard +# patterns (like *.h and *.hpp) to filter out the header-files in the +# directories. 
If left blank, the patterns specified with FILE_PATTERNS will +# be used. + +INCLUDE_FILE_PATTERNS = + +# The PREDEFINED tag can be used to specify one or more macro names that +# are defined before the preprocessor is started (similar to the -D option of +# gcc). The argument of the tag is a list of macros of the form: name +# or name=definition (no spaces). If the definition and the = are +# omitted =1 is assumed. To prevent a macro definition from being +# undefined via #undef or recursively expanded use the := operator +# instead of the = operator. + +PREDEFINED = + +# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then +# this tag can be used to specify a list of macro names that should be expanded. +# The macro definition that is found in the sources will be used. +# Use the PREDEFINED tag if you want to use a different macro definition. + +EXPAND_AS_DEFINED = + +# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then +# doxygen's preprocessor will remove all function-like macros that are alone +# on a line, have an all uppercase name, and do not end with a semicolon. Such +# function macros are typically used for boiler-plate code, and will confuse +# the parser if not removed. + +SKIP_FUNCTION_MACROS = YES + +#--------------------------------------------------------------------------- +# Configuration::additions related to external references +#--------------------------------------------------------------------------- + +# The TAGFILES option can be used to specify one or more tagfiles. +# Optionally an initial location of the external documentation +# can be added for each tagfile. The format of a tag file without +# this location is as follows: +# +# TAGFILES = file1 file2 ... +# Adding location for the tag files is done as follows: +# +# TAGFILES = file1=loc1 "file2 = loc2" ... +# where "loc1" and "loc2" can be relative or absolute paths or +# URLs. 
If a location is present for each tag, the installdox tool +# does not have to be run to correct the links. +# Note that each tag file must have a unique name +# (where the name does NOT include the path) +# If a tag file is not located in the directory in which doxygen +# is run, you must also specify the path to the tagfile here. + +TAGFILES = + +# When a file name is specified after GENERATE_TAGFILE, doxygen will create +# a tag file that is based on the input files it reads. + +GENERATE_TAGFILE = + +# If the ALLEXTERNALS tag is set to YES all external classes will be listed +# in the class index. If set to NO only the inherited external classes +# will be listed. + +ALLEXTERNALS = YES + +# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed +# in the modules index. If set to NO, only the current project's groups will +# be listed. + +EXTERNAL_GROUPS = YES + +# The PERL_PATH should be the absolute path and name of the perl script +# interpreter (i.e. the result of `which perl'). + +PERL_PATH = + +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- + +# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will +# generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base +# or super classes. Setting the tag to NO turns the diagrams off. Note that +# this option also works with HAVE_DOT disabled, but it is recommended to +# install and use dot, since it yields more powerful graphs. + +CLASS_DIAGRAMS = YES + +# You can define message sequence charts within doxygen comments using the \msc +# command. Doxygen will then run the mscgen tool (see +# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the +# documentation. The MSCGEN_PATH tag allows you to specify the directory where +# the mscgen tool resides. 
If left empty the tool is assumed to be found in the +# default search path. + +MSCGEN_PATH = + +# If set to YES, the inheritance and collaboration graphs will hide +# inheritance and usage relations if the target is undocumented +# or is not a class. + +HIDE_UNDOC_RELATIONS = NO + +# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is +# available from the path. This tool is part of Graphviz, a graph visualization +# toolkit from AT&T and Lucent Bell Labs. The other options in this section +# have no effect if this option is set to NO (the default) + +HAVE_DOT = YES + +# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is +# allowed to run in parallel. When set to 0 (the default) doxygen will +# base this on the number of processors available in the system. You can set it +# explicitly to a value larger than 0 to get control over the balance +# between CPU load and processing speed. + +DOT_NUM_THREADS = 0 + +# By default doxygen will write a font called FreeSans.ttf to the output +# directory and reference it in all dot files that doxygen generates. This +# font does not include all possible unicode characters however, so when you need +# these (or just want a differently looking font) you can specify the font name +# using DOT_FONTNAME. You need need to make sure dot is able to find the font, +# which can be done by putting it in a standard location or by setting the +# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory +# containing the font. + +DOT_FONTNAME = FreeSans + +# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. +# The default size is 10pt. + +DOT_FONTSIZE = 10 + +# By default doxygen will tell dot to use the output directory to look for the +# FreeSans.ttf font (which doxygen will put there itself). If you specify a +# different font using DOT_FONTNAME you can set the path where dot +# can find it using this tag. 
+ +DOT_FONTPATH = + +# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect inheritance relations. Setting this tag to YES will force the +# the CLASS_DIAGRAMS tag to NO. + +CLASS_GRAPH = YES + +# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect implementation dependencies (inheritance, containment, and +# class references variables) of the class with other documented classes. + +COLLABORATION_GRAPH = YES + +# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for groups, showing the direct groups dependencies + +GROUP_GRAPHS = YES + +# If the UML_LOOK tag is set to YES doxygen will generate inheritance and +# collaboration diagrams in a style similar to the OMG's Unified Modeling +# Language. + +UML_LOOK = NO + +# If set to YES, the inheritance and collaboration graphs will show the +# relations between templates and their instances. + +TEMPLATE_RELATIONS = YES + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT +# tags are set to YES then doxygen will generate a graph for each documented +# file showing the direct and indirect include dependencies of the file with +# other documented files. + +INCLUDE_GRAPH = YES + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and +# HAVE_DOT tags are set to YES then doxygen will generate a graph for each +# documented header file showing the documented files that directly or +# indirectly include this file. + +INCLUDED_BY_GRAPH = YES + +# If the CALL_GRAPH and HAVE_DOT options are set to YES then +# doxygen will generate a call dependency graph for every global function +# or class method. Note that enabling this option will significantly increase +# the time of a run. 
So in most cases it will be better to enable call graphs +# for selected functions only using the \callgraph command. + +CALL_GRAPH = NO + +# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then +# doxygen will generate a caller dependency graph for every global function +# or class method. Note that enabling this option will significantly increase +# the time of a run. So in most cases it will be better to enable caller +# graphs for selected functions only using the \callergraph command. + +CALLER_GRAPH = NO + +# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen +# will generate a graphical hierarchy of all classes instead of a textual one. + +GRAPHICAL_HIERARCHY = YES + +# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES +# then doxygen will show the dependencies a directory has on other directories +# in a graphical way. The dependency relations are determined by the #include +# relations between the files in the directories. + +DIRECTORY_GRAPH = YES + +# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images +# generated by dot. Possible values are png, jpg, or gif. +# If left blank png will be used. + +DOT_IMAGE_FORMAT = png + +# The tag DOT_PATH can be used to specify the path where the dot tool can be +# found. If left blank, it is assumed the dot tool can be found in the path. + +DOT_PATH = /usr/local/bin/dot + +# The DOTFILE_DIRS tag can be used to specify one or more directories that +# contain dot files that are included in the documentation (see the +# \dotfile command). + +DOTFILE_DIRS = + +# The MSCFILE_DIRS tag can be used to specify one or more directories that +# contain msc files that are included in the documentation (see the +# \mscfile command). + +MSCFILE_DIRS = + +# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of +# nodes that will be shown in the graph. 
If the number of nodes in a graph +# becomes larger than this value, doxygen will truncate the graph, which is +# visualized by representing a node as a red box. Note that doxygen if the +# number of direct children of the root node in a graph is already larger than +# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note +# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. + +DOT_GRAPH_MAX_NODES = 50 + +# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the +# graphs generated by dot. A depth value of 3 means that only nodes reachable +# from the root by following a path via at most 3 edges will be shown. Nodes +# that lay further from the root node will be omitted. Note that setting this +# option to 1 or 2 may greatly reduce the computation time needed for large +# code bases. Also note that the size of a graph can be further restricted by +# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. + +MAX_DOT_GRAPH_DEPTH = 0 + +# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent +# background. This is disabled by default, because dot on Windows does not +# seem to support this out of the box. Warning: Depending on the platform used, +# enabling this option may lead to badly anti-aliased labels on the edges of +# a graph (i.e. they become hard to read). + +DOT_TRANSPARENT = YES + +# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output +# files in one run (i.e. multiple -o and -T options on the command line). This +# makes dot run faster, but since only newer versions of dot (>1.8.10) +# support this, this feature is disabled by default. + +DOT_MULTI_TARGETS = NO + +# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will +# generate a legend page explaining the meaning of the various boxes and +# arrows in the dot generated graphs. 
+ +GENERATE_LEGEND = YES + +# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will +# remove the intermediate dot files that are used to generate +# the various graphs. + +DOT_CLEANUP = YES diff --git a/examples/README.txt b/examples/README.txt new file mode 100644 index 00000000..7ef078db --- /dev/null +++ b/examples/README.txt @@ -0,0 +1,88 @@ +==================== +ISPC Examples README +==================== + +This directory has a number of sample ispc programs. Before building them +(on any system), install the appropriate ispc compiler binary into a +directory in your path. Then, if you're running Windows, open the +"examples.sln" file and build from there. For building under Linux/OSX, +there are makefiles in each directory that build the examples individually. + +Almost all of them benchmark ispc implementations of the given computation +against regular serial C++ implementations, printing out a comparison of +the runtimes and the speedup delivered by ispc. It may be instructive to +do a side-by-side diff of the C++ and ispc implementations of these +algorithms to learn more about writing ispc code. + +AOBench +======= + +This is an ISPC implementation of the "AO bench" benchmark +(http://syoyo.wordpress.com/2009/01/26/ao-bench-is-evolving/). The command +line arguments are: + +ao (num iterations) (x res) (yres) + +It executes the program for the given number of iterations, rendering an +(xres x yres) image each time and measuring the computation time with both +serial and ispc implementations. + +AOBench_Instrumented +==================== + +This version of AO Bench is compiled with the --instrument ispc compiler +flag. This causes the compiler to emit calls to a (user-supplied) +ISPCInstrument() function at interesting places in the compiled code. An +example implementation of this function that counts the number of times the +callback is made and records some statistics about control flow coherence +is provided in the instrument.cpp file. 
+ +*** Note: on Linux, this example currently hits an assertion in LLVM during +*** compilation + +Mandelbrot +========== + +Mandelbrot set generation. This example is extensively documented at the +http://ispc.github.com/example.html page. + +Mandelbrot_tasks +================ + +Implementation of Mandelbrot set generation that also parallelizes across +cores using tasks. Under Windows, a simple task system built on +Microsoft's Concurrency Runtime is used (see tasks_concrt.cpp). On OSX, a +task system based on Grand Central Dispatch is used (tasks_gcd.cpp), and on +Linux, a pthreads-based task system is used (tasks_pthreads.cpp). When +using tasks with ispc, no task system is mandated; the user is free to plug +in any task system they want, for ease of interoperating with existing task +systems. + +Options +======= + +This program implements both the Black-Scholes and Binomial options pricing +models in both ispc and regular serial C++ code. + +RT +== + +This is a simple ray tracer; it reads in camera parameters and a bounding +volume hierarchy and renders the scene from the given viewpoint. The +command line arguments are: + +rt <scene name> + +Where <scene name> is one of "cornell", "teapot", or "sponza". + +The implementation originally derives from the bounding volume hierarchy +and triangle intersection code from pbrt; see the pbrt source code and/or +"Physically Based Rendering" book for more about the basic algorithmic +details. + +Simple +====== + +This is a simple "hello world" type program that shows a ~10 line +application program calling out to a ~5 line ispc program to do a simple +computation. 
diff --git a/examples/aobench/Makefile b/examples/aobench/Makefile new file mode 100644 index 00000000..8674f7bb --- /dev/null +++ b/examples/aobench/Makefile @@ -0,0 +1,26 @@ + +CXX=g++ +CXXFLAGS=-Iobjs/ -O3 -Wall +ISPC=ispc +ISPCFLAGS=-O2 --fast-math + +default: ao + +.PHONY: dirs clean + +dirs: + /bin/mkdir -p objs/ + +clean: + /bin/rm -rf objs *~ ao + +ao: dirs objs/ao.o objs/ao_serial.o objs/ao_ispc.o + $(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/ao_serial.o -lm -lpthread + +objs/%.o: %.cpp + $(CXX) $< $(CXXFLAGS) -c -o $@ + +objs/ao.o: objs/ao_ispc.h + +objs/%_ispc.h objs/%_ispc.o: %.ispc + $(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h diff --git a/examples/aobench/ao.cpp b/examples/aobench/ao.cpp new file mode 100644 index 00000000..1a2eefe5 --- /dev/null +++ b/examples/aobench/ao.cpp @@ -0,0 +1,182 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include +#include +#include +#include +#ifdef __linux__ +#include +#endif +#include +#include +#include +#include +#include + +#include "ao_ispc.h" +using namespace ispc; + +#include "../timing.h" + +#define NSUBSAMPLES 2 + +extern void ao_serial(int w, int h, int nsubsamples, float image[]); + +static unsigned int test_iterations; +static unsigned int width, height; +static unsigned char *img; +static float *fimg; + + +static unsigned char +clamp(float f) +{ + int i = (int)(f * 255.5); + + if (i < 0) i = 0; + if (i > 255) i = 255; + + return (unsigned char)i; +} + + +static void +savePPM(const char *fname, int w, int h) +{ + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + img[3 * (y * w + x) + 0] = clamp(fimg[3 *(y * w + x) + 0]); + img[3 * (y * w + x) + 1] = clamp(fimg[3 *(y * w + x) + 1]); + img[3 * (y * w + x) + 2] = clamp(fimg[3 *(y * w + x) + 2]); + } + } + + FILE *fp = fopen(fname, "wb"); + if (!fp) { + perror(fname); + exit(1); + } + + fprintf(fp, "P6\n"); + fprintf(fp, "%d %d\n", w, h); + fprintf(fp, "255\n"); + fwrite(img, w * h * 3, 1, fp); + fclose(fp); +} + + +// Allocate memory with 64-byte alignment. 
+ +float * +AllocAligned(int size) { +#if defined(_WIN32) || defined(_WIN64) + return (float *)_aligned_malloc(size, 64); +#elif defined (__APPLE__) + // Allocate excess memory to ensure an aligned pointer can be returned + void *mem = malloc(size + (64-1) + sizeof(void*)); + char *amem = ((char*)mem) + sizeof(void*); + amem += 64 - (reinterpret_cast<uintptr_t>(amem) & (64 - 1)); + ((void**)amem)[-1] = mem; + return (float *)amem; +#else + return (float *)memalign(64, size); +#endif +} + + +int main(int argc, char **argv) +{ + if (argc != 4) { + printf ("%s\n", argv[0]); + printf ("Usage: ao [num test iterations] [width] [height]\n"); + getchar(); + exit(-1); + } + else { + test_iterations = atoi(argv[1]); + width = atoi (argv[2]); + height = atoi (argv[3]); + } + + // Allocate space for output images + img = (unsigned char *)AllocAligned(width * height * 3); + fimg = (float *)AllocAligned(sizeof(float) * width * height * 3); + + // + // Run the ispc path, test_iterations times, and report the minimum + // time for any of them. + // + double minTimeISPC = 1e30; + for (unsigned int i = 0; i < test_iterations; i++) { + memset((void *)fimg, 0, sizeof(float) * width * height * 3); + assert(NSUBSAMPLES == 2); + + reset_and_start_timer(); + ao_ispc(width, height, NSUBSAMPLES, fimg); + double t = get_elapsed_mcycles(); + minTimeISPC = std::min(minTimeISPC, t); + } + + // Report results and save image + printf("[aobench ispc]:\t\t\t[%.3f] M cycles (%d x %d image)\n", minTimeISPC, + width, height); + savePPM("ao-ispc.ppm", width, height); + + // + // Run the serial path, again test_iteration times, and report the + // minimum time. 
+ // + double minTimeSerial = 1e30; + for (unsigned int i = 0; i < test_iterations; i++) { + memset((void *)fimg, 0, sizeof(float) * width * height * 3); + reset_and_start_timer(); + ao_serial(width, height, NSUBSAMPLES, fimg); + double t = get_elapsed_mcycles(); + minTimeSerial = std::min(minTimeSerial, t); + } + + // Report more results, save another image... + printf("[aobench serial]:\t\t[%.3f] M cycles (%d x %d image)\n", minTimeSerial, + width, height); + printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minTimeSerial / minTimeISPC); + savePPM("ao-serial.ppm", width, height); + + return 0; +} diff --git a/examples/aobench/ao.ispc b/examples/aobench/ao.ispc new file mode 100644 index 00000000..192e0666 --- /dev/null +++ b/examples/aobench/ao.ispc @@ -0,0 +1,317 @@ +// -*- mode: c++ -*- +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +/* + Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench +*/ + +#define NAO_SAMPLES 8 +#define M_PI 3.1415926535f + +typedef float<3> vec; + +struct Isect { + float t; + vec p; + vec n; + int hit; +}; + +struct Sphere { + vec center; + float radius; + +}; + +struct Plane { + vec p; + vec n; +}; + +struct Ray { + vec org; + vec dir; +}; + +static inline float dot(vec a, vec b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} + +static inline vec vcross(vec v0, vec v1) { + vec ret; + ret.x = v0.y * v1.z - v0.z * v1.y; + ret.y = v0.z * v1.x - v0.x * v1.z; + ret.z = v0.x * v1.y - v0.y * v1.x; + return ret; +} + +static inline void vnormalize(reference vec v) { + float len2 = dot(v, v); + float invlen = rsqrt(len2); + v *= invlen; +} + + +static inline void +ray_plane_intersect(reference Isect isect, reference Ray ray, + reference Plane plane) { + float d = -dot(plane.p, plane.n); + float v = dot(ray.dir, plane.n); + + cif (abs(v) < 1.0e-17) + return; + else { + float t = -(dot(ray.org, plane.n) + d) / v; + + cif ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + ray.dir * t; + isect.n = plane.n; + } + } +} + + +static inline void +ray_sphere_intersect(reference Isect isect, reference Ray ray, + reference Sphere sphere) { + vec rs = ray.org - sphere.center; + + float B = dot(rs, ray.dir); + float C = dot(rs, rs) - sphere.radius * sphere.radius; + float D = B * B - C; + + cif (D > 0.) 
{ + float t = -B - sqrt(D); + + cif ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + t * ray.dir; + isect.n = isect.p - sphere.center; + vnormalize(isect.n); + } + } +} + + +static inline void +orthoBasis(reference vec basis[3], vec n) { + basis[2] = n; + basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0; + + if ((n.x < 0.6) && (n.x > -0.6)) { + basis[1].x = 1.0; + } else if ((n.y < 0.6) && (n.y > -0.6)) { + basis[1].y = 1.0; + } else if ((n.z < 0.6) && (n.z > -0.6)) { + basis[1].z = 1.0; + } else { + basis[1].x = 1.0; + } + + basis[0] = vcross(basis[1], basis[2]); + vnormalize(basis[0]); + + basis[1] = vcross(basis[2], basis[0]); + vnormalize(basis[1]); +} + + +static inline float +ambient_occlusion(reference Isect isect, reference Plane plane, + reference Sphere spheres[3], reference RNGState rngstate) { + float eps = 0.0001f; + vec p, n; + vec basis[3]; + float occlusion = 0.0; + + p = isect.p + eps * isect.n; + + orthoBasis(basis, isect.n); + + static const uniform int ntheta = NAO_SAMPLES; + static const uniform int nphi = NAO_SAMPLES; + for (uniform int j = 0; j < ntheta; j++) { + for (uniform int i = 0; i < nphi; i++) { + Ray ray; + Isect occIsect; + + float theta = sqrt(frandom(rngstate)); + float phi = 2.0f * M_PI * frandom(rngstate); + float x = cos(phi) * theta; + float y = sin(phi) * theta; + float z = sqrt(1.0 - theta * theta); + + // local . 
global + float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x; + float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y; + float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z; + + ray.org = p; + ray.dir.x = rx; + ray.dir.y = ry; + ray.dir.z = rz; + + occIsect.t = 1.0e+17; + occIsect.hit = 0; + + for (uniform int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(occIsect, ray, spheres[snum]); + ray_plane_intersect (occIsect, ray, plane); + + if (occIsect.hit) occlusion += 1.0; + } + } + + occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi); + return occlusion; +} + + +/* Compute the image for the scanlines from [y0,y1), for an overall image + of width w and height h. + */ +void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h, + uniform int nsubsamples, reference uniform float image[]) { + static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } }; + static Sphere spheres[3] = { + { { -2.0f, 0.0f, -3.5f }, 0.5f }, + { { -0.5f, 0.0f, -3.0f }, 0.5f }, + { { 1.0f, 0.0f, -2.2f }, 0.5f } }; + RNGState rngstate; + + seed_rng(rngstate, y0); + + // Compute the mapping between the 'programCount'-wide program + // instances running in parallel and samples in the image. + // + // For now, we'll always take four samples per pixel, so start by + // initializing du and dv with offsets into subpixel samples. We'll + // take care of further updating du and dv for the case where we're + // doing more than 4 program instances in parallel shortly. + uniform float uSteps[4] = { 0, 1, 0, 1 }; + uniform float vSteps[4] = { 0, 0, 1, 1 }; + float du = uSteps[programIndex % 4] / nsubsamples; + float dv = vSteps[programIndex % 4] / nsubsamples; + + // Now handle the case where we are able to do more than one pixel's + // worth of work at once. nx records the number of pixels in the x + // direction we do per iteration and ny the number in y. 
+ uniform int nx = 1, ny = 1; + + if (programCount == 8) { + // Do two pixels at once in the x direction + nx = 2; + if (programIndex >= 4) + // And shift the offsets for the second pixel's worth of work + ++du; + } + else if (programCount == 16) { + // Two at once in both x and y + nx = ny = 2; + if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12) + ++du; + if (programIndex >= 8) + ++dv; + } + + // Now loop over all of the pixels, stepping in x and y as calculated + // above. (Assumes that ny divides y and nx divides x...) + for (uniform int y = y0; y < y1; y += ny) { + for (uniform int x = 0; x < w; x += nx) { + // Figur out x,y pixel in NDC + float px = (x + du - (w / 2.0f)) / (w / 2.0f); + float py = -(y + dv - (h / 2.0f)) / (h / 2.0f); + float ret = 0.f; + Ray ray; + Isect isect; + + ray.org = 0.f; + + // Poor man's perspective projection + ray.dir.x = px; + ray.dir.y = py; + ray.dir.z = -1.0; + vnormalize(ray.dir); + + isect.t = 1.0e+17; + isect.hit = 0; + + for (uniform int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(isect, ray, spheres[snum]); + ray_plane_intersect(isect, ray, plane); + + // Note use of 'coherent' if statement; the set of rays we + // trace will often all hit or all miss the scene + cif (isect.hit) + ret = ambient_occlusion(isect, plane, spheres, rngstate); + + // This is a little grungy; we have results for + // programCount-worth of values. Because we're doing 2x2 + // subsamples, we need to peel them off in groups of four, + // average the four values for each pixel, and update the + // output image. + // + // Store the varying value to a uniform array of the same size. + // See the discussion about communication among program + // instances in the ispc user's manual for more discussion on + // this idiom. 
+ uniform float retArray[programCount]; + retArray[programIndex] = ret; + + // offset to the first pixel in the image + uniform int offset = 3 * (y * w + x); + for (uniform int p = 0; p < programCount; p += 4, ++offset) { + // Get the four sample values for this pixel + uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] + + retArray[p+3]; + + // Normalize by number of samples taken + sumret /= nsubsamples * nsubsamples; + + // Store result in the image + image[offset+0] = sumret; + image[offset+1] = sumret; + image[offset+2] = sumret; + } + } + } +} + + +export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples, + uniform float image[]) { + ao_scanlines(0, h, w, h, nsubsamples, image); +} diff --git a/examples/aobench/ao_serial.cpp b/examples/aobench/ao_serial.cpp new file mode 100644 index 00000000..0b3e2b6d --- /dev/null +++ b/examples/aobench/ao_serial.cpp @@ -0,0 +1,314 @@ +// -*- mode: c++ -*- +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +/* + Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench +*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include +#include + +#ifdef _MSC_VER +static long long drand48_x = 0x1234ABCD330E; + +static inline void srand48(int x) { + drand48_x = x ^ (x << 16); +} + +static inline double drand48() { + drand48_x = drand48_x * 0x5DEECE66D + 0xB; + return (drand48_x & 0xFFFFFFFFFFFF) * (1.0 / 281474976710656.0); +} +#endif // _MSC_VER + +#ifdef _MSC_VER +__declspec(align(16)) +#endif +struct vec { + vec() { x=y=z=pad=0.; } + vec(float xx, float yy, float zz) { x = xx; y = yy; z = zz; } + + vec operator*(float f) const { return vec(x*f, y*f, z*f); } + vec operator+(const vec &f2) const { + return vec(x+f2.x, y+f2.y, z+f2.z); + } + vec operator-(const vec &f2) const { + return vec(x-f2.x, y-f2.y, z-f2.z); + } + vec operator*(const vec &f2) const { + return vec(x*f2.x, y*f2.y, z*f2.z); + } + float x, y, z; + float pad; +} +#ifndef _MSC_VER +__attribute__ ((aligned(16))) +#endif +; +inline vec operator*(float f, const vec &v) { return vec(f*v.x, f*v.y, f*v.z); } + + +#define NAO_SAMPLES 8 + +#ifdef M_PI +#undef M_PI +#endif +#define M_PI 3.1415926535f + +struct Isect { + float t; + vec p; + vec n; + int hit; +}; + +struct Sphere { + vec center; + float radius; + +}; + +struct Plane { + vec p; + vec n; +}; + +struct Ray { + vec org; + vec 
dir; +}; + +static inline float dot(const vec &a, const vec &b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} + +static inline vec vcross(const vec &v0, const vec &v1) { + vec ret; + ret.x = v0.y * v1.z - v0.z * v1.y; + ret.y = v0.z * v1.x - v0.x * v1.z; + ret.z = v0.x * v1.y - v0.y * v1.x; + return ret; +} + +static inline void vnormalize(vec &v) { + float len2 = dot(v, v); + float invlen = 1.f / sqrtf(len2); + v = v * invlen; +} + + +static inline void +ray_plane_intersect(Isect &isect, Ray &ray, + Plane &plane) { + float d = -dot(plane.p, plane.n); + float v = dot(ray.dir, plane.n); + + if (fabsf(v) < 1.0e-17) + return; + else { + float t = -(dot(ray.org, plane.n) + d) / v; + + if ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + ray.dir * t; + isect.n = plane.n; + } + } +} + + +static inline void +ray_sphere_intersect(Isect &isect, Ray &ray, + Sphere &sphere) { + vec rs = ray.org - sphere.center; + + float B = dot(rs, ray.dir); + float C = dot(rs, rs) - sphere.radius * sphere.radius; + float D = B * B - C; + + if (D > 0.) 
{ + float t = -B - sqrtf(D); + + if ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + t * ray.dir; + isect.n = isect.p - sphere.center; + vnormalize(isect.n); + } + } +} + + +static inline void +orthoBasis(vec basis[3], const vec &n) { + basis[2] = n; + basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0; + + if ((n.x < 0.6) && (n.x > -0.6)) { + basis[1].x = 1.0; + } else if ((n.y < 0.6) && (n.y > -0.6)) { + basis[1].y = 1.0; + } else if ((n.z < 0.6) && (n.z > -0.6)) { + basis[1].z = 1.0; + } else { + basis[1].x = 1.0; + } + + basis[0] = vcross(basis[1], basis[2]); + vnormalize(basis[0]); + + basis[1] = vcross(basis[2], basis[0]); + vnormalize(basis[1]); +} + + +static float +ambient_occlusion(Isect &isect, Plane &plane, + Sphere spheres[3]) { + float eps = 0.0001f; + vec p, n; + vec basis[3]; + float occlusion = 0.0; + + p = isect.p + eps * isect.n; + + orthoBasis(basis, isect.n); + + static const int ntheta = NAO_SAMPLES; + static const int nphi = NAO_SAMPLES; + for (int j = 0; j < ntheta; j++) { + for (int i = 0; i < nphi; i++) { + Ray ray; + Isect occIsect; + + float theta = sqrtf(drand48()); + float phi = 2.0f * M_PI * drand48(); + float x = cosf(phi) * theta; + float y = sinf(phi) * theta; + float z = sqrtf(1.0 - theta * theta); + + // local . 
global + float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x; + float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y; + float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z; + + ray.org = p; + ray.dir.x = rx; + ray.dir.y = ry; + ray.dir.z = rz; + + occIsect.t = 1.0e+17; + occIsect.hit = 0; + + for (int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(occIsect, ray, spheres[snum]); + ray_plane_intersect (occIsect, ray, plane); + + if (occIsect.hit) occlusion += 1.0; + } + } + + occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi); + return occlusion; +} + + +/* Compute the image for the scanlines from [y0,y1), for an overall image + of width w and height h. + */ +static void ao_scanlines(int y0, int y1, int w, int h, int nsubsamples, + float image[]) { + static Plane plane = { vec(0.0f, -0.5f, 0.0f), vec(0.f, 1.f, 0.f) }; + static Sphere spheres[3] = { + { vec(-2.0f, 0.0f, -3.5f), 0.5f }, + { vec(-0.5f, 0.0f, -3.0f), 0.5f }, + { vec(1.0f, 0.0f, -2.2f), 0.5f } }; + + srand48(y0); + + for (int y = y0; y < y1; ++y) { + for (int x = 0; x < w; ++x) { + int offset = 3 * (y * w + x); + for (int u = 0; u < nsubsamples; ++u) { + for (int v = 0; v < nsubsamples; ++v) { + float px = (x + (u / (float)nsubsamples) - (w / 2.0f)) / (w / 2.0f); + float py = -(y + (v / (float)nsubsamples) - (h / 2.0f)) / (h / 2.0f); + float ret = 0.f; + Ray ray; + Isect isect; + + ray.org = vec(0.f, 0.f, 0.f); + + ray.dir.x = px; + ray.dir.y = py; + ray.dir.z = -1.0; + vnormalize(ray.dir); + + isect.t = 1.0e+17; + isect.hit = 0; + + for (int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(isect, ray, spheres[snum]); + ray_plane_intersect(isect, ray, plane); + + if (isect.hit) + ret = ambient_occlusion(isect, plane, spheres); + + // Update image for AO for this ray + image[offset+0] += ret; + image[offset+1] += ret; + image[offset+2] += ret; + } + } + // Normalize image pixels by number of samples taken per pixel + image[offset+0] /= nsubsamples * nsubsamples; + 
image[offset+1] /= nsubsamples * nsubsamples; + image[offset+2] /= nsubsamples * nsubsamples; + } + } +} + + +void ao_serial(int w, int h, int nsubsamples, + float image[]) { + ao_scanlines(0, h, w, h, nsubsamples, image); +} diff --git a/examples/aobench/aobench.vcxproj b/examples/aobench/aobench.vcxproj new file mode 100755 index 00000000..3be6bdb3 --- /dev/null +++ b/examples/aobench/aobench.vcxproj @@ -0,0 +1,161 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + + + + + + Document + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 + + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h + + %(Filename).obj + %(Filename).obj + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 + + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h + + %(Filename).obj + %(Filename).obj + + + + {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB} + Win32Proj + aobench + + + + Application + true + Unicode + + + Application + true + Unicode + + + Application + false + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + + \ No newline at end of file diff --git a/examples/aobench_instrumented/Makefile b/examples/aobench_instrumented/Makefile new file mode 100644 index 00000000..296a5882 --- /dev/null +++ b/examples/aobench_instrumented/Makefile @@ -0,0 +1,26 @@ 
+ +CXX=g++ +CXXFLAGS=-Iobjs/ -g3 -Wall +ISPC=ispc +ISPCFLAGS=-O2 --fast-math --instrument + +default: ao + +.PHONY: dirs clean + +dirs: + /bin/mkdir -p objs/ + +clean: + /bin/rm -rf objs *~ ao + +ao: dirs objs/ao.o objs/instrument.o objs/ao_ispc.o + $(CXX) $(CXXFLAGS) -o $@ objs/ao.o objs/ao_ispc.o objs/instrument.o -lm -lpthread + +objs/%.o: %.cpp + $(CXX) $< $(CXXFLAGS) -c -o $@ + +objs/ao.o: objs/ao_ispc.h + +objs/%_ispc.h objs/%_ispc.o: %.ispc + $(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h diff --git a/examples/aobench_instrumented/ao.cpp b/examples/aobench_instrumented/ao.cpp new file mode 100644 index 00000000..742a0862 --- /dev/null +++ b/examples/aobench_instrumented/ao.cpp @@ -0,0 +1,148 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include +#include +#include +#include +#ifdef __linux__ +#include +#endif +#include +#include +#include +#include +#include + +#include "ao_ispc.h" +using namespace ispc; + +#include "instrument.h" +#include "../timing.h" + +#define NSUBSAMPLES 2 + +static unsigned int test_iterations; +static unsigned int width, height; +static unsigned char *img; +static float *fimg; + + +static unsigned char +clamp(float f) +{ + int i = (int)(f * 255.5); + + if (i < 0) i = 0; + if (i > 255) i = 255; + + return (unsigned char)i; +} + + +static void +savePPM(const char *fname, int w, int h) +{ + for (int y = 0; y < h; y++) { + for (int x = 0; x < w; x++) { + img[3 * (y * w + x) + 0] = clamp(fimg[3 *(y * w + x) + 0]); + img[3 * (y * w + x) + 1] = clamp(fimg[3 *(y * w + x) + 1]); + img[3 * (y * w + x) + 2] = clamp(fimg[3 *(y * w + x) + 2]); + } + } + + FILE *fp = fopen(fname, "wb"); + if (!fp) { + perror(fname); + exit(1); + } + + fprintf(fp, "P6\n"); + fprintf(fp, "%d %d\n", w, h); + fprintf(fp, "255\n"); + fwrite(img, w * h * 3, 1, fp); + fclose(fp); +} + + +// Allocate memory with 64-byte alignment. 
+float * +AllocAligned(int size) { +#if defined(_WIN32) || defined(_WIN64) + return (float *)_aligned_malloc(size, 64); +#elif defined (__APPLE__) + // Allocate excess memory to ensure an aligned pointer can be returned + void *mem = malloc(size + (64-1) + sizeof(void*)); + char *amem = ((char*)mem) + sizeof(void*); + amem += 64 - (reinterpret_cast(amem) & (64 - 1)); + ((void**)amem)[-1] = mem; + return (float *)amem; +#else + return (float *)memalign(64, size); +#endif +} + + +int main(int argc, char **argv) +{ + if (argc != 4) { + printf ("%s\n", argv[0]); + printf ("Usage: ao [num test iterations] [width] [height]\n"); + getchar(); + exit(-1); + } + else { + test_iterations = atoi(argv[1]); + width = atoi (argv[2]); + height = atoi (argv[3]); + } + + // Allocate space for output images + img = (unsigned char *)AllocAligned(width * height * 3); + fimg = (float *)AllocAligned(sizeof(float) * width * height * 3); + + ao_ispc(width, height, NSUBSAMPLES, fimg); + + savePPM("ao-ispc.ppm", width, height); + + ISPCPrintInstrument(); + + return 0; +} diff --git a/examples/aobench_instrumented/ao.ispc b/examples/aobench_instrumented/ao.ispc new file mode 100644 index 00000000..192e0666 --- /dev/null +++ b/examples/aobench_instrumented/ao.ispc @@ -0,0 +1,317 @@ +// -*- mode: c++ -*- +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +/* + Based on Syoyo Fujita's aobench: http://code.google.com/p/aobench +*/ + +#define NAO_SAMPLES 8 +#define M_PI 3.1415926535f + +typedef float<3> vec; + +struct Isect { + float t; + vec p; + vec n; + int hit; +}; + +struct Sphere { + vec center; + float radius; + +}; + +struct Plane { + vec p; + vec n; +}; + +struct Ray { + vec org; + vec dir; +}; + +static inline float dot(vec a, vec b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} + +static inline vec vcross(vec v0, vec v1) { + vec ret; + ret.x = v0.y * v1.z - v0.z * v1.y; + ret.y = v0.z * v1.x - v0.x * v1.z; + ret.z = v0.x * v1.y - v0.y * v1.x; + return ret; +} + +static inline void vnormalize(reference vec v) { + float len2 = dot(v, v); + float invlen = rsqrt(len2); + v *= invlen; +} + + +static inline void +ray_plane_intersect(reference Isect isect, reference Ray ray, + reference Plane plane) { + float d = -dot(plane.p, plane.n); + float v = dot(ray.dir, plane.n); + + cif (abs(v) < 1.0e-17) + return; + else { + float t = -(dot(ray.org, 
plane.n) + d) / v; + + cif ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + ray.dir * t; + isect.n = plane.n; + } + } +} + + +static inline void +ray_sphere_intersect(reference Isect isect, reference Ray ray, + reference Sphere sphere) { + vec rs = ray.org - sphere.center; + + float B = dot(rs, ray.dir); + float C = dot(rs, rs) - sphere.radius * sphere.radius; + float D = B * B - C; + + cif (D > 0.) { + float t = -B - sqrt(D); + + cif ((t > 0.0) && (t < isect.t)) { + isect.t = t; + isect.hit = 1; + isect.p = ray.org + t * ray.dir; + isect.n = isect.p - sphere.center; + vnormalize(isect.n); + } + } +} + + +static inline void +orthoBasis(reference vec basis[3], vec n) { + basis[2] = n; + basis[1].x = 0.0; basis[1].y = 0.0; basis[1].z = 0.0; + + if ((n.x < 0.6) && (n.x > -0.6)) { + basis[1].x = 1.0; + } else if ((n.y < 0.6) && (n.y > -0.6)) { + basis[1].y = 1.0; + } else if ((n.z < 0.6) && (n.z > -0.6)) { + basis[1].z = 1.0; + } else { + basis[1].x = 1.0; + } + + basis[0] = vcross(basis[1], basis[2]); + vnormalize(basis[0]); + + basis[1] = vcross(basis[2], basis[0]); + vnormalize(basis[1]); +} + + +static inline float +ambient_occlusion(reference Isect isect, reference Plane plane, + reference Sphere spheres[3], reference RNGState rngstate) { + float eps = 0.0001f; + vec p, n; + vec basis[3]; + float occlusion = 0.0; + + p = isect.p + eps * isect.n; + + orthoBasis(basis, isect.n); + + static const uniform int ntheta = NAO_SAMPLES; + static const uniform int nphi = NAO_SAMPLES; + for (uniform int j = 0; j < ntheta; j++) { + for (uniform int i = 0; i < nphi; i++) { + Ray ray; + Isect occIsect; + + float theta = sqrt(frandom(rngstate)); + float phi = 2.0f * M_PI * frandom(rngstate); + float x = cos(phi) * theta; + float y = sin(phi) * theta; + float z = sqrt(1.0 - theta * theta); + + // local . 
global + float rx = x * basis[0].x + y * basis[1].x + z * basis[2].x; + float ry = x * basis[0].y + y * basis[1].y + z * basis[2].y; + float rz = x * basis[0].z + y * basis[1].z + z * basis[2].z; + + ray.org = p; + ray.dir.x = rx; + ray.dir.y = ry; + ray.dir.z = rz; + + occIsect.t = 1.0e+17; + occIsect.hit = 0; + + for (uniform int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(occIsect, ray, spheres[snum]); + ray_plane_intersect (occIsect, ray, plane); + + if (occIsect.hit) occlusion += 1.0; + } + } + + occlusion = (ntheta * nphi - occlusion) / (float)(ntheta * nphi); + return occlusion; +} + + +/* Compute the image for the scanlines from [y0,y1), for an overall image + of width w and height h. + */ +void ao_scanlines(uniform int y0, uniform int y1, uniform int w, uniform int h, + uniform int nsubsamples, reference uniform float image[]) { + static Plane plane = { { 0.0f, -0.5f, 0.0f }, { 0.f, 1.f, 0.f } }; + static Sphere spheres[3] = { + { { -2.0f, 0.0f, -3.5f }, 0.5f }, + { { -0.5f, 0.0f, -3.0f }, 0.5f }, + { { 1.0f, 0.0f, -2.2f }, 0.5f } }; + RNGState rngstate; + + seed_rng(rngstate, y0); + + // Compute the mapping between the 'programCount'-wide program + // instances running in parallel and samples in the image. + // + // For now, we'll always take four samples per pixel, so start by + // initializing du and dv with offsets into subpixel samples. We'll + // take care of further updating du and dv for the case where we're + // doing more than 4 program instances in parallel shortly. + uniform float uSteps[4] = { 0, 1, 0, 1 }; + uniform float vSteps[4] = { 0, 0, 1, 1 }; + float du = uSteps[programIndex % 4] / nsubsamples; + float dv = vSteps[programIndex % 4] / nsubsamples; + + // Now handle the case where we are able to do more than one pixel's + // worth of work at once. nx records the number of pixels in the x + // direction we do per iteration and ny the number in y. 
+ uniform int nx = 1, ny = 1; + + if (programCount == 8) { + // Do two pixels at once in the x direction + nx = 2; + if (programIndex >= 4) + // And shift the offsets for the second pixel's worth of work + ++du; + } + else if (programCount == 16) { + // Two at once in both x and y + nx = ny = 2; + if ((programIndex >= 4 && programIndex < 8) || programIndex >= 12) + ++du; + if (programIndex >= 8) + ++dv; + } + + // Now loop over all of the pixels, stepping in x and y as calculated + // above. (Assumes that ny divides y and nx divides x...) + for (uniform int y = y0; y < y1; y += ny) { + for (uniform int x = 0; x < w; x += nx) { + // Figur out x,y pixel in NDC + float px = (x + du - (w / 2.0f)) / (w / 2.0f); + float py = -(y + dv - (h / 2.0f)) / (h / 2.0f); + float ret = 0.f; + Ray ray; + Isect isect; + + ray.org = 0.f; + + // Poor man's perspective projection + ray.dir.x = px; + ray.dir.y = py; + ray.dir.z = -1.0; + vnormalize(ray.dir); + + isect.t = 1.0e+17; + isect.hit = 0; + + for (uniform int snum = 0; snum < 3; ++snum) + ray_sphere_intersect(isect, ray, spheres[snum]); + ray_plane_intersect(isect, ray, plane); + + // Note use of 'coherent' if statement; the set of rays we + // trace will often all hit or all miss the scene + cif (isect.hit) + ret = ambient_occlusion(isect, plane, spheres, rngstate); + + // This is a little grungy; we have results for + // programCount-worth of values. Because we're doing 2x2 + // subsamples, we need to peel them off in groups of four, + // average the four values for each pixel, and update the + // output image. + // + // Store the varying value to a uniform array of the same size. + // See the discussion about communication among program + // instances in the ispc user's manual for more discussion on + // this idiom. 
+ uniform float retArray[programCount]; + retArray[programIndex] = ret; + + // offset to the first pixel in the image + uniform int offset = 3 * (y * w + x); + for (uniform int p = 0; p < programCount; p += 4, ++offset) { + // Get the four sample values for this pixel + uniform float sumret = retArray[p] + retArray[p+1] + retArray[p+2] + + retArray[p+3]; + + // Normalize by number of samples taken + sumret /= nsubsamples * nsubsamples; + + // Store result in the image + image[offset+0] = sumret; + image[offset+1] = sumret; + image[offset+2] = sumret; + } + } + } +} + + +export void ao_ispc(uniform int w, uniform int h, uniform int nsubsamples, + uniform float image[]) { + ao_scanlines(0, h, w, h, nsubsamples, image); +} diff --git a/examples/aobench_instrumented/aobench_instrumented.vcxproj b/examples/aobench_instrumented/aobench_instrumented.vcxproj new file mode 100755 index 00000000..94c8926a --- /dev/null +++ b/examples/aobench_instrumented/aobench_instrumented.vcxproj @@ -0,0 +1,161 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + + + + + + Document + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument + + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --instrument + + %(Filename).obj + %(Filename).obj + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --instrument + + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --instrument + + %(Filename).obj + %(Filename).obj + + + + {B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958} + Win32Proj + aobench_instrumented + + + + Application + true + Unicode + + + Application + true + Unicode + + + Application + false + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + Level3 + Disabled + 
WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + + diff --git a/examples/aobench_instrumented/instrument.cpp b/examples/aobench_instrumented/instrument.cpp new file mode 100644 index 00000000..d72210d6 --- /dev/null +++ b/examples/aobench_instrumented/instrument.cpp @@ -0,0 +1,94 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "instrument.h" +#include +#include +#include +#include + +struct CallInfo { + CallInfo() { count = laneCount = allOff = 0; } + int count; + int laneCount; + int allOff; +}; + +static std::map callInfo; + +int countbits(int i) { + int ret = 0; + while (i) { + if (i & 0x1) + ++ret; + i >>= 1; + } + return ret; +} + + +// Callback function that ispc compiler emits calls to when --instrument +// command-line flag is given while compiling. +void +ISPCInstrument(const char *fn, const char *note, int line, int mask) { + char sline[16]; + sprintf(sline, "%04d", line); + std::string s = std::string(fn) + std::string("(") + std::string(sline) + + std::string(") - ") + std::string(note); + + // Find or create a CallInfo instance for this callsite. + CallInfo &ci = callInfo[s]; + + // And update its statistics... + ++ci.count; + if (mask == 0) + ++ci.allOff; + ci.laneCount += countbits(mask); +} + + +void +ISPCPrintInstrument() { + // When program execution is done, go through the stats and print them + // out. (This function is called by ao.cpp). 
+ std::map::iterator citer = callInfo.begin(); + while (citer != callInfo.end()) { + CallInfo &ci = citer->second; + float activePct = 100.f * ci.laneCount / (4.f * ci.count); + float allOffPct = 100.f * ci.allOff / ci.count; + printf("%s: %d calls (%d / %.2f%% all off!), %.2f%% active lanes\n", + citer->first.c_str(), ci.count, ci.allOff, allOffPct, + activePct); + ++citer; + } +} diff --git a/examples/aobench_instrumented/instrument.h b/examples/aobench_instrumented/instrument.h new file mode 100644 index 00000000..a21730b1 --- /dev/null +++ b/examples/aobench_instrumented/instrument.h @@ -0,0 +1,45 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef INSTRUMENT_H +#define INSTRUMENT_H 1 + +#include + +extern "C" { + void ISPCInstrument(const char *fn, const char *note, int line, int mask); +} + +void ISPCPrintInstrument(); + +#endif // INSTRUMENT_H diff --git a/examples/examples.sln b/examples/examples.sln new file mode 100755 index 00000000..5e8de17a --- /dev/null +++ b/examples/examples.sln @@ -0,0 +1,86 @@ + +Microsoft Visual Studio Solution File, Format Version 11.00 +# Visual Studio 2010 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simple", "simple\simple.vcxproj", "{947C5311-8B78-4D05-BEE4-BCF342D4B367}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "rt", "rt\rt.vcxproj", "{E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "aobench", "aobench\aobench.vcxproj", "{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mandelbrot", "mandelbrot\mandelbrot.vcxproj", "{6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "options", "options\options.vcxproj", "{8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mandelbrot_tasks", "mandelbrot_tasks\mandelbrot_tasks.vcxproj", "{E80DA7D4-AB22-4648-A068-327307156BE6}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "aobench_instrumented", 
"aobench_instrumented\aobench_instrumented.vcxproj", "{B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Debug|x64 = Debug|x64 + Release|Win32 = Release|Win32 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {947C5311-8B78-4D05-BEE4-BCF342D4B367}.Debug|Win32.ActiveCfg = Debug|Win32 + {947C5311-8B78-4D05-BEE4-BCF342D4B367}.Debug|Win32.Build.0 = Debug|Win32 + {947C5311-8B78-4D05-BEE4-BCF342D4B367}.Debug|x64.ActiveCfg = Debug|x64 + {947C5311-8B78-4D05-BEE4-BCF342D4B367}.Debug|x64.Build.0 = Debug|x64 + {947C5311-8B78-4D05-BEE4-BCF342D4B367}.Release|Win32.ActiveCfg = Release|Win32 + {947C5311-8B78-4D05-BEE4-BCF342D4B367}.Release|Win32.Build.0 = Release|Win32 + {947C5311-8B78-4D05-BEE4-BCF342D4B367}.Release|x64.ActiveCfg = Release|x64 + {947C5311-8B78-4D05-BEE4-BCF342D4B367}.Release|x64.Build.0 = Release|x64 + {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Debug|Win32.ActiveCfg = Debug|Win32 + {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Debug|Win32.Build.0 = Debug|Win32 + {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Debug|x64.ActiveCfg = Debug|x64 + {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Debug|x64.Build.0 = Debug|x64 + {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Release|Win32.ActiveCfg = Release|Win32 + {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Release|Win32.Build.0 = Release|Win32 + {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Release|x64.ActiveCfg = Release|x64 + {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9}.Release|x64.Build.0 = Release|x64 + {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Debug|Win32.ActiveCfg = Debug|Win32 + {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Debug|Win32.Build.0 = Debug|Win32 + {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Debug|x64.ActiveCfg = Debug|x64 + {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Debug|x64.Build.0 = Debug|x64 + {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Release|Win32.ActiveCfg = Release|Win32 + 
{F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Release|Win32.Build.0 = Release|Win32 + {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Release|x64.ActiveCfg = Release|x64 + {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB}.Release|x64.Build.0 = Release|x64 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Debug|Win32.ActiveCfg = Debug|Win32 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Debug|Win32.Build.0 = Debug|Win32 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Debug|x64.ActiveCfg = Debug|x64 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Debug|x64.Build.0 = Debug|x64 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Release|Win32.ActiveCfg = Release|Win32 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Release|Win32.Build.0 = Release|Win32 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Release|x64.ActiveCfg = Release|x64 + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1}.Release|x64.Build.0 = Release|x64 + {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Debug|Win32.ActiveCfg = Debug|Win32 + {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Debug|Win32.Build.0 = Debug|Win32 + {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Debug|x64.ActiveCfg = Debug|x64 + {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Debug|x64.Build.0 = Debug|x64 + {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Release|Win32.ActiveCfg = Release|Win32 + {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Release|Win32.Build.0 = Release|Win32 + {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Release|x64.ActiveCfg = Release|x64 + {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE}.Release|x64.Build.0 = Release|x64 + {E80DA7D4-AB22-4648-A068-327307156BE6}.Debug|Win32.ActiveCfg = Debug|Win32 + {E80DA7D4-AB22-4648-A068-327307156BE6}.Debug|Win32.Build.0 = Debug|Win32 + {E80DA7D4-AB22-4648-A068-327307156BE6}.Debug|x64.ActiveCfg = Debug|x64 + {E80DA7D4-AB22-4648-A068-327307156BE6}.Debug|x64.Build.0 = Debug|x64 + {E80DA7D4-AB22-4648-A068-327307156BE6}.Release|Win32.ActiveCfg = Release|Win32 + {E80DA7D4-AB22-4648-A068-327307156BE6}.Release|Win32.Build.0 = Release|Win32 + {E80DA7D4-AB22-4648-A068-327307156BE6}.Release|x64.ActiveCfg = Release|x64 + 
{E80DA7D4-AB22-4648-A068-327307156BE6}.Release|x64.Build.0 = Release|x64 + {B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Debug|Win32.ActiveCfg = Debug|Win32 + {B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Debug|Win32.Build.0 = Debug|Win32 + {B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Debug|x64.ActiveCfg = Debug|x64 + {B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Debug|x64.Build.0 = Debug|x64 + {B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|Win32.ActiveCfg = Release|Win32 + {B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|Win32.Build.0 = Release|Win32 + {B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|x64.ActiveCfg = Release|x64 + {B3B4AE3D-6D5A-4CF9-AF5B-43CF2131B958}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/examples/mandelbrot/Makefile b/examples/mandelbrot/Makefile new file mode 100644 index 00000000..dd369d0b --- /dev/null +++ b/examples/mandelbrot/Makefile @@ -0,0 +1,26 @@ + +CXX=g++ +CXXFLAGS=-Iobjs/ -O3 -Wall +ISPC=ispc +ISPCFLAGS=-O2 --target=sse4x2 + +default: mandelbrot + +.PHONY: dirs clean + +dirs: + /bin/mkdir -p objs/ + +clean: + /bin/rm -rf objs *~ mandelbrot + +mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc.o + $(CXX) $(CXXFLAGS) -o $@ objs/mandelbrot.o objs/mandelbrot_ispc.o objs/mandelbrot_serial.o -lm + +objs/%.o: %.cpp + $(CXX) $< $(CXXFLAGS) -c -o $@ + +objs/mandelbrot.o: objs/mandelbrot_ispc.h + +objs/%_ispc.h objs/%_ispc.o: %.ispc + $(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h diff --git a/examples/mandelbrot/mandelbrot.cpp b/examples/mandelbrot/mandelbrot.cpp new file mode 100644 index 00000000..2105a335 --- /dev/null +++ b/examples/mandelbrot/mandelbrot.cpp @@ -0,0 +1,117 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include +#include +#include "../timing.h" +#include "mandelbrot_ispc.h" +using namespace ispc; + +extern void mandelbrot_serial(float x0, float y0, float x1, float y1, + int width, int height, int maxIterations, + int output[]); + +/* Write a PPM image file with the image of the Mandelbrot set */ +static void +writePPM(int *buf, int width, int height, const char *fn) { + FILE *fp = fopen(fn, "wb"); + fprintf(fp, "P6\n"); + fprintf(fp, "%d %d\n", width, height); + fprintf(fp, "255\n"); + for (int i = 0; i < width*height; ++i) { + // Map the iteration count to colors by just alternating between + // two greys. + char c = (buf[i] & 0x1) ? 240 : 20; + for (int j = 0; j < 3; ++j) + fputc(c, fp); + } + fclose(fp); +} + + +int main() { + unsigned int width = 768; + unsigned int height = 512; + float x0 = -2; + float x1 = 1; + float y0 = -1; + float y1 = 1; + + int maxIterations = 256; + int *buf = new int[width*height]; + + // + // Compute the image using the ispc implementation; report the minimum + // time of three runs. + // + double minISPC = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf); + double dt = get_elapsed_mcycles(); + minISPC = std::min(minISPC, dt); + } + + printf("[mandelbrot ispc]:\t\t[%.3f] million cycles\n", minISPC); + writePPM(buf, width, height, "mandelbrot-ispc.ppm"); + + // Clear out the buffer + for (unsigned int i = 0; i < width * height; ++i) + buf[i] = 0; + + // + // And run the serial implementation 3 times, again reporting the + // minimum time. 
+ // + double minSerial = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf); + double dt = get_elapsed_mcycles(); + minSerial = std::min(minSerial, dt); + } + + printf("[mandelbrot serial]:\t\t[%.3f] millon cycles\n", minSerial); + writePPM(buf, width, height, "mandelbrot-serial.ppm"); + + printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC); + + return 0; +} diff --git a/examples/mandelbrot/mandelbrot.ispc b/examples/mandelbrot/mandelbrot.ispc new file mode 100644 index 00000000..ecbb4fc1 --- /dev/null +++ b/examples/mandelbrot/mandelbrot.ispc @@ -0,0 +1,76 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +static inline int mandel(float c_re, float c_im, int count) { + float z_re = c_re, z_im = c_im; + int i; + for (i = 0; i < count; ++i) { + if (z_re * z_re + z_im * z_im > 4.) + break; + + float new_re = z_re*z_re - z_im*z_im; + float new_im = 2.f * z_re * z_im; + z_re = c_re + new_re; + z_im = c_im + new_im; + } + + return i; +} + +export void mandelbrot_ispc(uniform float x0, uniform float y0, + uniform float x1, uniform float y1, + uniform int width, uniform int height, + uniform int maxIterations, + reference uniform int output[]) +{ + float dx = (x1 - x0) / width; + float dy = (y1 - y0) / height; + + for (uniform int j = 0; j < height; j++) { + // Note that we'll be doing programCount computations in parallel, + // so increment i by that much. This assumes that width evenly + // divides programCount. + for (uniform int i = 0; i < width; i += programCount) { + // Figure out the position on the complex plane to compute the + // number of iterations at. Note that the x values are + // different across different program instances, since its + // initializer incorporates the value of the programIndex + // variable. 
+ float x = x0 + (programIndex + i) * dx; + float y = y0 + j * dy; + + int index = j * width + i + programIndex; + output[index] = mandel(x, y, maxIterations); + } + } +} diff --git a/examples/mandelbrot/mandelbrot.vcxproj b/examples/mandelbrot/mandelbrot.vcxproj new file mode 100755 index 00000000..db33453b --- /dev/null +++ b/examples/mandelbrot/mandelbrot.vcxproj @@ -0,0 +1,161 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1} + Win32Proj + mandelbrot + + + + Application + true + Unicode + + + Application + true + Unicode + + + Application + false + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + + + + + Document + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2 + + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2 + + %(Filename).obj;%(Filename)_ispc.h + %(Filename).obj;%(Filename)_ispc.h + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2 + + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2 + + %(Filename).obj;%(Filename)_ispc.h + %(Filename).obj;%(Filename)_ispc.h + + + + + + diff --git a/examples/mandelbrot/mandelbrot_serial.cpp b/examples/mandelbrot/mandelbrot_serial.cpp new file mode 100644 index 
00000000..4bea7baf --- /dev/null +++ b/examples/mandelbrot/mandelbrot_serial.cpp @@ -0,0 +1,68 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +static int mandel(float c_re, float c_im, int count) { + float z_re = c_re, z_im = c_im; + int i; + for (i = 0; i < count; ++i) { + if (z_re * z_re + z_im * z_im > 4.) 
+ break; + + float new_re = z_re*z_re - z_im*z_im; + float new_im = 2.f * z_re * z_im; + z_re = c_re + new_re; + z_im = c_im + new_im; + } + + return i; +} + +void mandelbrot_serial(float x0, float y0, float x1, float y1, + int width, int height, int maxIterations, + int output[]) +{ + float dx = (x1 - x0) / width; + float dy = (y1 - y0) / height; + + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; ++i) { + float x = x0 + i * dx; + float y = y0 + j * dy; + + int index = (j * width + i); + output[index] = mandel(x, y, maxIterations); + } + } +} + diff --git a/examples/mandelbrot_tasks/Makefile b/examples/mandelbrot_tasks/Makefile new file mode 100644 index 00000000..182c2698 --- /dev/null +++ b/examples/mandelbrot_tasks/Makefile @@ -0,0 +1,38 @@ + +ARCH = $(shell uname) + +TASK_CXX=tasks_pthreads.cpp +TASK_LIB=-lpthread + +ifeq ($(ARCH), Darwin) + TASK_CXX=tasks_gcd.cpp + TASK_LIB= +endif + +TASK_OBJ=$(addprefix objs/, $(TASK_CXX:.cpp=.o)) + +CXX=g++ +CXXFLAGS=-Iobjs/ -O3 -Wall +ISPC=ispc +ISPCFLAGS=-O2 --target=sse4x2 + +default: mandelbrot + +.PHONY: dirs clean + +dirs: + /bin/mkdir -p objs/ + +clean: + /bin/rm -rf objs *~ mandelbrot + +mandelbrot: dirs objs/mandelbrot.o objs/mandelbrot_serial.o objs/mandelbrot_ispc.o $(TASK_OBJ) + $(CXX) $(CXXFLAGS) -o $@ objs/mandelbrot.o objs/mandelbrot_ispc.o objs/mandelbrot_serial.o $(TASK_OBJ) -lm $(TASK_LIB) + +objs/%.o: %.cpp + $(CXX) $< $(CXXFLAGS) -c -o $@ + +objs/mandelbrot.o: objs/mandelbrot_ispc.h + +objs/%_ispc.h objs/%_ispc.o: %.ispc + $(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h diff --git a/examples/mandelbrot_tasks/mandelbrot.cpp b/examples/mandelbrot_tasks/mandelbrot.cpp new file mode 100644 index 00000000..50ad4cf8 --- /dev/null +++ b/examples/mandelbrot_tasks/mandelbrot.cpp @@ -0,0 +1,120 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include +#include +#include "../timing.h" +#include "mandelbrot_ispc.h" +using namespace ispc; + +extern void mandelbrot_serial(float x0, float y0, float x1, float y1, + int width, int height, int maxIterations, + int output[]); + +/* Write a PPM image file with the image of the Mandelbrot set */ +static void +writePPM(int *buf, int width, int height, const char *fn) { + FILE *fp = fopen(fn, "wb"); + fprintf(fp, "P6\n"); + fprintf(fp, "%d %d\n", width, height); + fprintf(fp, "255\n"); + for (int i = 0; i < width*height; ++i) { + // Map the iteration count to colors by just alternating between + // two greys. + char c = (buf[i] & 0x1) ? 240 : 20; + for (int j = 0; j < 3; ++j) + fputc(c, fp); + } + fclose(fp); +} + + +int main() { + unsigned int width = 1536; + unsigned int height = 1024; + float x0 = -2; + float x1 = 1; + float y0 = -1; + float y1 = 1; + + extern void TasksInit(); + TasksInit(); + + int maxIterations = 512; + int *buf = new int[width*height]; + + // + // Compute the image using the ispc implementation; report the minimum + // time of three runs. + // + double minISPC = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + mandelbrot_ispc(x0, y0, x1, y1, width, height, maxIterations, buf); + double dt = get_elapsed_mcycles(); + minISPC = std::min(minISPC, dt); + } + + printf("[mandelbrot ispc+tasks]:\t[%.3f] million cycles\n", minISPC); + writePPM(buf, width, height, "mandelbrot-ispc.ppm"); + + // Clear out the buffer + for (unsigned int i = 0; i < width * height; ++i) + buf[i] = 0; + + // + // And run the serial implementation 3 times, again reporting the + // minimum time. 
+ // + double minSerial = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + mandelbrot_serial(x0, y0, x1, y1, width, height, maxIterations, buf); + double dt = get_elapsed_mcycles(); + minSerial = std::min(minSerial, dt); + } + + printf("[mandelbrot serial]:\t\t[%.3f] millon cycles\n", minSerial); + writePPM(buf, width, height, "mandelbrot-serial.ppm"); + + printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minSerial/minISPC); + + return 0; +} diff --git a/examples/mandelbrot_tasks/mandelbrot.ispc b/examples/mandelbrot_tasks/mandelbrot.ispc new file mode 100644 index 00000000..df763e0a --- /dev/null +++ b/examples/mandelbrot_tasks/mandelbrot.ispc @@ -0,0 +1,86 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +static inline int +mandel(float c_re, float c_im, int count) { + float z_re = c_re, z_im = c_im; + int i; + for (i = 0; i < count; ++i) { + if (z_re * z_re + z_im * z_im > 4.) + break; + + float new_re = z_re*z_re - z_im*z_im; + float new_im = 2.f * z_re * z_im; + z_re = c_re + new_re; + z_im = c_im + new_im; + } + + return i; +} + + +/* Task to compute the Mandelbrot iterations for a span of scanlines from + [ystart,yend). + */ +task void +mandelbrot_scanlines(uniform int ystart, uniform int yend, + uniform float x0, uniform float dx, + uniform float y0, uniform float dy, + uniform int width, uniform int maxIterations, + reference uniform int output[]) { + for (uniform int j = ystart; j < yend; ++j) { + for (uniform int i = 0; i < width; i += programCount) { + float x = x0 + (programIndex + i) * dx; + float y = y0 + j * dy; + + int index = j * width + i + programIndex; + output[index] = mandel(x, y, maxIterations); + } + } +} + + +export void +mandelbrot_ispc(uniform float x0, uniform float y0, + uniform float x1, uniform float y1, + uniform int width, uniform int height, + uniform int maxIterations, reference uniform int output[]) { + uniform float dx = (x1 - x0) / width; + uniform float dy = (y1 - y0) / height; + + /* Launch task to compute results for spans of 'span' scanlines. 
*/ + uniform int span = 2; + for (uniform int j = 0; j < height; j += span) + launch < mandelbrot_scanlines(j, j+span, x0, dx, y0, dy, width, + maxIterations, output) >; +} diff --git a/examples/mandelbrot_tasks/mandelbrot_serial.cpp b/examples/mandelbrot_tasks/mandelbrot_serial.cpp new file mode 100644 index 00000000..4bea7baf --- /dev/null +++ b/examples/mandelbrot_tasks/mandelbrot_serial.cpp @@ -0,0 +1,68 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + + +static int mandel(float c_re, float c_im, int count) { + float z_re = c_re, z_im = c_im; + int i; + for (i = 0; i < count; ++i) { + if (z_re * z_re + z_im * z_im > 4.) + break; + + float new_re = z_re*z_re - z_im*z_im; + float new_im = 2.f * z_re * z_im; + z_re = c_re + new_re; + z_im = c_im + new_im; + } + + return i; +} + +void mandelbrot_serial(float x0, float y0, float x1, float y1, + int width, int height, int maxIterations, + int output[]) +{ + float dx = (x1 - x0) / width; + float dy = (y1 - y0) / height; + + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; ++i) { + float x = x0 + i * dx; + float y = y0 + j * dy; + + int index = (j * width + i); + output[index] = mandel(x, y, maxIterations); + } + } +} + diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj new file mode 100755 index 00000000..ba3687cb --- /dev/null +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj @@ -0,0 +1,162 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {E80DA7D4-AB22-4648-A068-327307156BE6} + Win32Proj + mandelbrot + + + + Application + true + Unicode + + + Application + true + Unicode + + + Application + false + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + + + + + + Document + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2 + 
+ cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2 + + %(Filename).obj;%(Filename)_ispc.h + %(Filename).obj;%(Filename)_ispc.h + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2 + + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2 + + %(Filename).obj;%(Filename)_ispc.h + %(Filename).obj;%(Filename)_ispc.h + + + + + + \ No newline at end of file diff --git a/examples/mandelbrot_tasks/tasks_concrt.cpp b/examples/mandelbrot_tasks/tasks_concrt.cpp new file mode 100644 index 00000000..a861ca87 --- /dev/null +++ b/examples/mandelbrot_tasks/tasks_concrt.cpp @@ -0,0 +1,115 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* Simple task system implementation for ispc based on Microsoft's + Concurrency Runtime. */ + +#include +#include +using namespace Concurrency; +#include +#include + +// ispc expects these functions to have C linkage / not be mangled +extern "C" { + void ISPCLaunch(void *f, void *data); + void ISPCSync(); +} + +typedef void (*TaskFuncType)(void *, int, int); + +struct TaskInfo { + TaskFuncType ispcFunc; + void *ispcData; +}; + +// This is a simple implementation that just aborts if more than MAX_TASKS +// are launched. It could easily be extended to be more general... + +#define MAX_TASKS 4096 +static int taskOffset; +static TaskInfo taskInfo[MAX_TASKS]; +static event *events[MAX_TASKS]; +static CRITICAL_SECTION criticalSection; + +void +TasksInit() { + InitializeCriticalSection(&criticalSection); + for (int i = 0; i < MAX_TASKS; ++i) + events[i] = new event; +} + + +void __cdecl +lRunTask(LPVOID param) { + TaskInfo *ti = (TaskInfo *)param; + + // Actually run the task. + // FIXME: like the tasks_gcd.cpp implementation, this is passing bogus + // values for the threadIndex and threadCount builtins, which in turn + // will cause bugs in code that uses those. FWIW this example doesn't + // use them... 
+ int threadIndex = 0; + int threadCount = 1; + ti->ispcFunc(ti->ispcData, threadIndex, threadCount); + + // Signal the event that this task is done + int taskNum = ti - &taskInfo[0]; + events[taskNum]->set(); +} + + +void +ISPCLaunch(void *func, void *data) { + // Get a TaskInfo struct for this task + EnterCriticalSection(&criticalSection); + TaskInfo *ti = &taskInfo[taskOffset++]; + assert(taskOffset < MAX_TASKS); + LeaveCriticalSection(&criticalSection); + + // And pass it on to the Concurrency Runtime... + ti->ispcFunc = (TaskFuncType)func; + ti->ispcData = data; + CurrentScheduler::ScheduleTask(lRunTask, ti); +} + + +void ISPCSync() { + event::wait_for_multiple(&events[0], taskOffset, true, + COOPERATIVE_TIMEOUT_INFINITE); + + for (int i = 0; i < taskOffset; ++i) + events[i]->reset(); + + taskOffset = 0; +} diff --git a/examples/mandelbrot_tasks/tasks_gcd.cpp b/examples/mandelbrot_tasks/tasks_gcd.cpp new file mode 100644 index 00000000..b8b8e80f --- /dev/null +++ b/examples/mandelbrot_tasks/tasks_gcd.cpp @@ -0,0 +1,90 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* A simple task system for ispc programs based on Apple's Grand Central + Dispatch. */ + +#include + +static dispatch_queue_t gcdQueue; +static dispatch_group_t gcdGroup; + +// ispc expects these functions to have C linkage / not be mangled +extern "C" { + void ISPCLaunch(void *f, void *data); + void ISPCSync(); +} + +struct TaskInfo { + void *func; + void *data; +}; + + +void +TasksInit() { + gcdQueue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0); + gcdGroup = dispatch_group_create(); +} + + +static void +lRunTask(void *ti) { + typedef void (*TaskFuncType)(void *, int, int); + TaskInfo *taskInfo = (TaskInfo *)ti; + + TaskFuncType func = (TaskFuncType)(taskInfo->func); + + // FIXME: these are bogus values; may cause bugs in code that depends + // on them having unique values in different threads. + int threadIndex = 0; + int threadCount = 1; + // Actually run the task + func(taskInfo->data, threadIndex, threadCount); + + // FIXME: taskInfo leaks... 
+} + + +void ISPCLaunch(void *func, void *data) { + TaskInfo *ti = new TaskInfo; + ti->func = func; + ti->data = data; + dispatch_group_async_f(gcdGroup, gcdQueue, ti, lRunTask); +} + + +void ISPCSync() { + // Wait for all of the tasks in the group to complete before returning + dispatch_group_wait(gcdGroup, DISPATCH_TIME_FOREVER); +} diff --git a/examples/mandelbrot_tasks/tasks_pthreads.cpp b/examples/mandelbrot_tasks/tasks_pthreads.cpp new file mode 100644 index 00000000..4a23c5dc --- /dev/null +++ b/examples/mandelbrot_tasks/tasks_pthreads.cpp @@ -0,0 +1,285 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// ispc expects these functions to have C linkage / not be mangled +extern "C" { + void ISPCLaunch(void *f, void *data); + void ISPCSync(); +} + + +static int nThreads; +static pthread_t *threads; +static pthread_mutex_t taskQueueMutex; +static std::vector > taskQueue; +static sem_t *workerSemaphore; +static uint32_t numUnfinishedTasks; +static pthread_mutex_t tasksRunningConditionMutex; +static pthread_cond_t tasksRunningCondition; + +static void *lTaskEntry(void *arg); + +/** Figure out how many CPU cores there are in the system + */ +static int +lNumCPUCores() { +#if defined(__linux__) + return sysconf(_SC_NPROCESSORS_ONLN); +#else + // Mac + int mib[2]; + mib[0] = CTL_HW; + size_t length = 2; + if (sysctlnametomib("hw.logicalcpu", mib, &length) == -1) { + fprintf(stderr, "sysctlnametomib() filed. Guessing 2 cores."); + return 2; + } + assert(length == 2); + + int nCores = 0; + size_t size = sizeof(nCores); + + if (sysctl(mib, 2, &nCores, &size, NULL, 0) == -1) { + fprintf(stderr, "sysctl() to find number of cores present failed. 
Guessing 2."); + return 2; + } + return nCores; +#endif +} + +void +TasksInit() { + nThreads = lNumCPUCores(); + + threads = new pthread_t[nThreads]; + + int err; + if ((err = pthread_mutex_init(&taskQueueMutex, NULL)) != 0) { + fprintf(stderr, "Error creating mutex: %s\n", strerror(err)); + exit(1); + } + + char name[32]; + sprintf(name, "mandelbrot.%d", (int)getpid()); + workerSemaphore = sem_open(name, O_CREAT, S_IRUSR|S_IWUSR, 0); + if (!workerSemaphore) { + fprintf(stderr, "Error creating semaphore: %s\n", strerror(err)); + exit(1); + } + + if ((err = pthread_cond_init(&tasksRunningCondition, NULL)) != 0) { + fprintf(stderr, "Error creating condition variable: %s\n", strerror(err)); + exit(1); + } + + if ((err = pthread_mutex_init(&tasksRunningConditionMutex, NULL)) != 0) { + fprintf(stderr, "Error creating mutex: %s\n", strerror(err)); + exit(1); + } + + for (int i = 0; i < nThreads; ++i) { + err = pthread_create(&threads[i], NULL, &lTaskEntry, reinterpret_cast(i)); + if (err != 0) { + fprintf(stderr, "Error creating pthread %d: %s\n", i, strerror(err)); + exit(1); + } + } +} + + +void +ISPCLaunch(void *f, void *d) { + // + // Acquire mutex, add task + // + int err; + if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); + exit(1); + } + + taskQueue.push_back(std::make_pair(f, d)); + + if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); + exit(1); + } + + // + // Update count of number of tasks left to run + // + if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); + exit(1); + } + + ++numUnfinishedTasks; + + if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); + exit(1); + } + + // + // Post to the worker semaphore to wake up 
worker threads that are + // sleeping waiting for tasks to show up + // + if ((err = sem_post(workerSemaphore)) != 0) { + fprintf(stderr, "Error from sem_post: %s\n", strerror(err)); + exit(1); + } +} + + +static void * +lTaskEntry(void *arg) { + int threadIndex = int(reinterpret_cast(arg)); + int threadCount = nThreads; + + while (true) { + int err; + if ((err = sem_wait(workerSemaphore)) != 0) { + fprintf(stderr, "Error from sem_wait: %s\n", strerror(err)); + exit(1); + } + + std::pair myTask; + // + // Acquire mutex, get task + // + if ((err = pthread_mutex_lock(&taskQueueMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); + exit(1); + } + if (taskQueue.size() == 0) { + // + // Task queue is empty, go back and wait on the semaphore + // + if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); + exit(1); + } + continue; + } + + myTask = taskQueue.back(); + taskQueue.pop_back(); + + if ((err = pthread_mutex_unlock(&taskQueueMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_unlock: %s\n", strerror(err)); + exit(1); + } + + // + // Do work for _myTask_ + // + typedef void (*TaskFunType)(void *, int, int); + TaskFunType func = (TaskFunType)myTask.first; + func(myTask.second, threadIndex, threadCount); + + // + // Decrement the number of unfinished tasks counter + // + if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); + exit(1); + } + + int unfinished = --numUnfinishedTasks; + if (unfinished == 0) { + // + // Signal the "no more tasks are running" condition if all of + // them are done. 
+ // + int err; + if ((err = pthread_cond_signal(&tasksRunningCondition)) != 0) { + fprintf(stderr, "Error from pthread_cond_signal: %s\n", strerror(err)); + exit(1); + } + } + + if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); + exit(1); + } + } + + pthread_exit(NULL); + return 0; +} + + +void ISPCSync() { + int err; + if ((err = pthread_mutex_lock(&tasksRunningConditionMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); + exit(1); + } + + // As long as there are tasks running, wait on the condition variable; + // doing so causes this thread to go to sleep until someone signals on + // the tasksRunningCondition condition variable. + while (numUnfinishedTasks > 0) { + if ((err = pthread_cond_wait(&tasksRunningCondition, + &tasksRunningConditionMutex)) != 0) { + fprintf(stderr, "Error from pthread_cond_wait: %s\n", strerror(err)); + exit(1); + } + } + + // We acquire ownership of the condition variable mutex when the above + // pthread_cond_wait returns. + // FIXME: is there a lurking issue here if numUnfinishedTasks gets back + // to zero by the time we get to ISPCSync() and thence we're trying to + // unlock a mutex we don't have a lock on? 
+ if ((err = pthread_mutex_unlock(&tasksRunningConditionMutex)) != 0) { + fprintf(stderr, "Error from pthread_mutex_lock: %s\n", strerror(err)); + exit(1); + } +} diff --git a/examples/options/Makefile b/examples/options/Makefile new file mode 100644 index 00000000..46be29e2 --- /dev/null +++ b/examples/options/Makefile @@ -0,0 +1,26 @@ + +CXX=g++ +CXXFLAGS=-Iobjs/ -g -Wall +ISPC=ispc +ISPCFLAGS=-O2 --target=sse4x2 + +default: options + +.PHONY: dirs clean + +dirs: + /bin/mkdir -p objs/ + +clean: + /bin/rm -rf objs *~ options + +options: dirs objs/options.o objs/options_serial.o objs/options_ispc.o + $(CXX) $(CXXFLAGS) -o $@ objs/options.o objs/options_ispc.o objs/options_serial.o -lm + +objs/%.o: %.cpp + $(CXX) $< $(CXXFLAGS) -c -o $@ + +objs/options.o: objs/options_ispc.h options_defs.h + +objs/%_ispc.h objs/%_ispc.o: %.ispc options_defs.h + $(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h diff --git a/examples/options/options.cpp b/examples/options/options.cpp new file mode 100644 index 00000000..241b32be --- /dev/null +++ b/examples/options/options.cpp @@ -0,0 +1,151 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include +#include +#include +#include +#include +#include +#ifndef __APPLE__ +#include +#endif // !__APPLE__ +using std::max; + +#include "options_defs.h" +#include "../timing.h" + +#include "options_ispc.h" +using namespace ispc; + +// Allocate memory with 64-byte alignment. +float *AllocFloats(int count) { + int size = count * sizeof(float); +#if defined(_WIN32) || defined(_WIN64) + return (float *)_aligned_malloc(size, 64); +#elif defined (__APPLE__) + // Allocate excess memory to ensure an aligned pointer can be returned + void *mem = malloc(size + (64-1) + sizeof(void*)); + char *amem = ((char*)mem) + sizeof(void*); + amem += 64 - (reinterpret_cast(amem) & (64 - 1)); + ((void**)amem)[-1] = mem; + return (float *)amem; +#else + return (float *)memalign(64, size); +#endif +} + +extern void black_scholes_serial(float Sa[], float Xa[], float Ta[], + float ra[], float va[], + float result[], int count); + +extern void binomial_put_serial(float Sa[], float Xa[], float Ta[], + float ra[], float va[], + float result[], int count); + +int main() { + // Pointers passed to ispc code must have alignment of the target's + // vector width at minimum. 
+ float *S = AllocFloats(N_OPTIONS); + float *X = AllocFloats(N_OPTIONS); + float *T = AllocFloats(N_OPTIONS); + float *r = AllocFloats(N_OPTIONS); + float *v = AllocFloats(N_OPTIONS); + float *result = AllocFloats(N_OPTIONS); + + for (int i = 0; i < N_OPTIONS; ++i) { + S[i] = 100; // stock price + X[i] = 98; // option strike price + T[i] = 2; // time (years) + r[i] = .02; // risk-free interest rate + v[i] = 5; // volatility + } + + // + // Binomial options pricing model, ispc implementation + // + reset_and_start_timer(); + binomial_put_ispc(S, X, T, r, v, result, N_OPTIONS); + double binomial_ispc = get_elapsed_mcycles(); + float sum = 0.f; + for (int i = 0; i < N_OPTIONS; ++i) + sum += result[i]; + printf("[binomial ispc]:\t\t[%.3f] million cycles (avg %f)\n", + binomial_ispc, sum / N_OPTIONS); + + // + // Binomial options, serial implementation + // + reset_and_start_timer(); + binomial_put_serial(S, X, T, r, v, result, N_OPTIONS); + double binomial_serial = get_elapsed_mcycles(); + sum = 0.f; + for (int i = 0; i < N_OPTIONS; ++i) + sum += result[i]; + printf("[binomial serial]:\t\t[%.3f] million cycles (avg %f)\n", + binomial_serial, sum / N_OPTIONS); + + printf("\t\t\t\t(%.2fx speedup from ISPC)\n", binomial_serial / binomial_ispc); + + // + // Black-Scholes options pricing model, ispc implementation + // + sum = 0.f; + reset_and_start_timer(); + for (int a = 0; a < N_BLACK_SCHOLES_ROUNDS; ++a) { + black_scholes_ispc(S, X, T, r, v, result, N_OPTIONS); + for (int i = 0; i < N_OPTIONS; ++i) + sum += result[i]; + } + double bs_ispc = get_elapsed_mcycles(); + printf("[black-scholes ispc]:\t\t[%.3f] million cycles (avg %f)\n", + bs_ispc, sum / (N_BLACK_SCHOLES_ROUNDS * N_OPTIONS)); + + // + // Black-Scholes options pricing model, serial implementation + // + sum = 0.f; + reset_and_start_timer(); + for (int a = 0; a < N_BLACK_SCHOLES_ROUNDS; ++a) { + black_scholes_serial(S, X, T, r, v, result, N_OPTIONS); + for (int i = 0; i < N_OPTIONS; ++i) + sum += result[i]; + 
} + double bs_serial = get_elapsed_mcycles(); + printf("[black-scholes serial]:\t\t[%.3f] million cycles (avg %f)\n", bs_serial, + sum / (N_BLACK_SCHOLES_ROUNDS * N_OPTIONS)); + + printf("\t\t\t\t(%.2fx speedup from ISPC)\n", bs_serial / bs_ispc); + + return 0; +} diff --git a/examples/options/options.ispc b/examples/options/options.ispc new file mode 100644 index 00000000..89e53634 --- /dev/null +++ b/examples/options/options.ispc @@ -0,0 +1,103 @@ +// -*- mode: c++ -*- +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include "options_defs.h" + +// Cumulative normal distribution function +static inline float +CND(float X) { + float L = abs(X); + + float k = 1.0 / (1.0 + 0.2316419 * L); + float k2 = k*k; + float k3 = k2*k; + float k4 = k2*k2; + float k5 = k3*k2; + + const float invSqrt2Pi = 0.39894228040f; + float w = (0.31938153f * k - 0.356563782f * k2 + 1.781477937f * k3 + + -1.821255978f * k4 + 1.330274429f * k5); + w *= invSqrt2Pi * exp(-L * L * .5f); + + if (X > 0.f) + w = 1.0 - w; + return w; +} + +export void +black_scholes_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[], + uniform float ra[], uniform float va[], + uniform float result[], uniform int count) { + for (uniform int i = 0; i < count; i += programCount) { + float S = Sa[i + programIndex], X = Xa[i + programIndex]; + float T = Ta[i + programIndex], r = ra[i + programIndex]; + float v = va[i + programIndex]; + + float d1 = (log(S/X) + (r + v * v * .5f) * T) / (v * sqrt(T)); + float d2 = d1 - v * sqrt(T); + + result[i + programIndex] = S * CND(d1) - X * exp(-r * T) * CND(d2); + } +} + + +export void +binomial_put_ispc(uniform float Sa[], uniform float Xa[], uniform float Ta[], + uniform float ra[], uniform float va[], + uniform float result[], uniform int count) { + float V[BINOMIAL_NUM]; + + for (uniform int i = 0; i < count; i += programCount) { + float S = Sa[i + programIndex], X = Xa[i + programIndex]; + float T = Ta[i + programIndex], r = ra[i + programIndex]; + float 
v = va[i + programIndex]; + + float dt = T / BINOMIAL_NUM; + float u = exp(v * sqrt(dt)); + float d = 1. / u; + float disc = exp(r * dt); + float Pu = (disc - d) / (u - d); + + for (uniform int j = 0; j < BINOMIAL_NUM; ++j) { + float upow = pow(u, (float)(2*j-BINOMIAL_NUM)); + V[j] = max(0., X - S * upow); + } + + for (uniform int j = BINOMIAL_NUM-1; j >= 0; --j) + for (uniform int k = 0; k < j; ++k) + V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc; + + result[i + programIndex] = V[0]; + } +} diff --git a/examples/options/options.vcxproj b/examples/options/options.vcxproj new file mode 100755 index 00000000..5b8f709b --- /dev/null +++ b/examples/options/options.vcxproj @@ -0,0 +1,168 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE} + Win32Proj + options + + + + Application + true + Unicode + + + Application + true + Unicode + + + Application + false + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + 4305 + + + Console + true + + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + 4305 + + + Console + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + 4305 + + + Console + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + 4305 + + + Console + true + true + true + + + + + + + + + Document + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=sse4x2 + + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2 + + %(Filename).obj;%(Filename)_ispc.h + %(Filename).obj;%(Filename)_ispc.h + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 
--target=sse4x2 + + cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --target=sse4x2 + + %(Filename).obj;%(Filename)_ispc.h + %(Filename).obj;%(Filename)_ispc.h + + + + + + + + + diff --git a/examples/options/options_defs.h b/examples/options/options_defs.h new file mode 100644 index 00000000..54b8ec81 --- /dev/null +++ b/examples/options/options_defs.h @@ -0,0 +1,42 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifndef OPTIONS_DEFS_H +#define OPTIONS_DEFS_H 1 + +#define BINOMIAL_NUM 64 +#define N_OPTIONS 65536 +#define N_BLACK_SCHOLES_ROUNDS 20 + + +#endif // OPTIONS_DEFS_H diff --git a/examples/options/options_serial.cpp b/examples/options/options_serial.cpp new file mode 100644 index 00000000..a2689b73 --- /dev/null +++ b/examples/options/options_serial.cpp @@ -0,0 +1,114 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include "options_defs.h" +#include +#include + +// Cumulative normal distribution function +static inline float +CND(float X) { + float L = fabsf(X); + + float k = 1.0 / (1.0 + 0.2316419 * L); + float k2 = k*k; + float k3 = k2*k; + float k4 = k2*k2; + float k5 = k3*k2; + + const float invSqrt2Pi = 0.39894228040f; + float w = (0.31938153f * k - 0.356563782f * k2 + 1.781477937f * k3 + + -1.821255978f * k4 + 1.330274429f * k5); + w *= invSqrt2Pi * expf(-L * L * .5f); + + if (X > 0.f) + w = 1.0 - w; + return w; +} + + +void +black_scholes_serial(float Sa[], float Xa[], float Ta[], + float ra[], float va[], + float result[], int count) { + for (int i = 0; i < count; ++i) { + float S = Sa[i], X = Xa[i]; + float T = Ta[i], r = ra[i]; + float v = va[i]; + + float d1 = (logf(S/X) + (r + v * v * .5f) * T) / (v * sqrtf(T)); + float d2 = d1 - v * sqrtf(T); + + result[i] = S * CND(d1) - X * expf(-r * T) * CND(d2); + } +} + + +void +binomial_put_serial(float Sa[], float Xa[], float Ta[], + float ra[], float va[], + float result[], int count) { + float V[BINOMIAL_NUM]; + + for (int i = 0; i < count; ++i) { + float S = Sa[i], X = Xa[i]; + float T = Ta[i], r = ra[i]; + float v = va[i]; + + float dt = T / BINOMIAL_NUM; + float u = expf(v * sqrtf(dt)); + float d = 1. 
/ u; + float disc = expf(r * dt); + float Pu = (disc - d) / (u - d); + + for (int j = 0; j < BINOMIAL_NUM; ++j) { + float upow = powf(u, (float)(2*j-BINOMIAL_NUM)); + V[j] = std::max(0.f, X - S * upow); + } + + for (int j = BINOMIAL_NUM-1; j >= 0; --j) + for (int k = 0; k < j; ++k) + V[k] = ((1 - Pu) * V[k] + Pu * V[k + 1]) / disc; + + result[i] = V[0]; + } +} + + diff --git a/examples/rt/Makefile b/examples/rt/Makefile new file mode 100644 index 00000000..7df58868 --- /dev/null +++ b/examples/rt/Makefile @@ -0,0 +1,24 @@ + +CXX=g++ +CXXFLAGS=-Iobjs/ -O3 -Wall +ISPC=ispc +ISPCFLAGS=-O2 --target=sse4x2 + +default: rt + +.PHONY: dirs clean + +dirs: + /bin/mkdir -p objs/ + +clean: + /bin/rm -rf objs *~ rt + +rt: dirs objs/rt.o objs/rt_serial.o objs/rt_ispc.o + $(CXX) $(CXXFLAGS) -o $@ objs/rt.o objs/rt_ispc.o objs/rt_serial.o -lm + +objs/%.o: %.cpp objs/rt_ispc.h + $(CXX) $< $(CXXFLAGS) -c -o $@ + +objs/%_ispc.h objs/%_ispc.o: %.ispc + $(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h diff --git a/examples/rt/cornell.bvh b/examples/rt/cornell.bvh new file mode 100644 index 00000000..f7e0f3dd Binary files /dev/null and b/examples/rt/cornell.bvh differ diff --git a/examples/rt/cornell.camera b/examples/rt/cornell.camera new file mode 100644 index 00000000..0fec1642 Binary files /dev/null and b/examples/rt/cornell.camera differ diff --git a/examples/rt/rt.cpp b/examples/rt/rt.cpp new file mode 100644 index 00000000..e589bd94 --- /dev/null +++ b/examples/rt/rt.cpp @@ -0,0 +1,244 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include +#include +#include +#include +#include +#ifndef __APPLE__ +#include +#endif +#include "../timing.h" +#include "rt_ispc.h" + +using namespace ispc; + +typedef unsigned int uint; + +template +T *AllocAligned(int count) { + int size = count * sizeof(T); +#if defined(_WIN32) || defined(_WIN64) + return (T *)_aligned_malloc(size, 64); +#elif defined (__APPLE__) + // Allocate excess memory to ensure an aligned pointer can be returned + void *mem = malloc(size + (64-1) + sizeof(void*)); + char *amem = ((char*)mem) + sizeof(void*); + amem += 64 - (reinterpret_cast(amem) & (64 - 1)); + ((void**)amem)[-1] = mem; + return (T *)amem; +#else + return (T *)memalign(64, size); +#endif +} + +extern void raytrace_serial(int width, int height, const float raster2camera[4][4], + const float camera2world[4][4], float image[], + int id[], const LinearBVHNode nodes[], + const Triangle triangles[]); + + +static void writeImage(int *idImage, float *depthImage, int width, int height, + const char *filename) { + FILE *f = fopen(filename, "wb"); + if (!f) { + perror(filename); + exit(1); + } + + fprintf(f, "P6\n%d %d\n255\n", width, height); + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + // use the bits from the object id of the hit object to make a + // random color + int id = idImage[y * width + x]; + unsigned char r = 0, g = 0, b = 0; + + for (int i = 0; i < 8; ++i) { + // extract bit 3*i for red, 3*i+1 for green, 3*i+2 for blue + int rbit = (id & (1 << (3*i))) >> (3*i); + int gbit = (id & (1 << (3*i+1))) >> (3*i+1); + int bbit = (id & (1 << (3*i+2))) >> (3*i+2); + // and then set the bits of the colors starting from the + // high bits... 
+ r |= rbit << (7-i); + g |= gbit << (7-i); + b |= bbit << (7-i); + } + fputc(r, f); + fputc(g, f); + fputc(b, f); + } + } + fclose(f); +} + + +int main(int argc, char *argv[]) { + if (argc != 2) { + fprintf(stderr, "usage: rt \n"); + exit(1); + } + +#define READ(var, n) \ + if (fread(&(var), sizeof(var), n, f) != (unsigned int)n) { \ + fprintf(stderr, "Unexpected EOF reading scene file\n"); \ + return 1; \ + } else /* eat ; */ + + // + // Read the camera specification information from the camera file + // + char fnbuf[1024]; + sprintf(fnbuf, "%s.camera", argv[1]); + FILE *f = fopen(fnbuf, "rb"); + if (!f) { + perror(argv[1]); + return 1; + } + + // + // Nothing fancy, and trouble if we run on a big-endian system, just + // fread in the bits + // + int width, height; + float camera2world[4][4], raster2camera[4][4]; + READ(width, 1); + READ(height, 1); + READ(camera2world[0][0], 16); + READ(raster2camera[0][0], 16); + + // + // Read in the serialized BVH + // + sprintf(fnbuf, "%s.bvh", argv[1]); + f = fopen(fnbuf, "rb"); + if (!f) { + perror(argv[2]); + return 1; + } + + // The BVH file starts with an int that gives the total number of BVH + // nodes + uint nNodes; + READ(nNodes, 1); + + LinearBVHNode *nodes = AllocAligned(nNodes); + for (unsigned int i = 0; i < nNodes; ++i) { + // Each node is 6x floats for a boox, then an integer for an offset + // to the second child node, then an integer that encodes the type + // of node, the total number of int it if a leaf node, etc. 
+ float b[6]; + READ(b[0], 6); + nodes[i].bounds[0].v[0] = b[0]; + nodes[i].bounds[0].v[1] = b[1]; + nodes[i].bounds[0].v[2] = b[2]; + nodes[i].bounds[1].v[0] = b[3]; + nodes[i].bounds[1].v[1] = b[4]; + nodes[i].bounds[1].v[2] = b[5]; + READ(nodes[i].offset, 1); + READ(nodes[i].primsAxis, 1); + } + + // And then read the triangles + uint nTris; + READ(nTris, 1); + Triangle *triangles = AllocAligned(nTris); + for (uint i = 0; i < nTris; ++i) { + // 9x floats for the 3 vertices + float v[9]; + READ(v[0], 9); + float *vp = v; + for (int j = 0; j < 3; ++j) { + triangles[i].p[j].v[0] = *vp++; + triangles[i].p[j].v[1] = *vp++; + triangles[i].p[j].v[2] = *vp++; + } + // And create an object id + triangles[i].id = i+1; + } + fclose(f); + + // round image resolution up to multiple of 4 to makethings easy for + // the code that assigns pixels to ispc program instances + height = (height + 3) & ~3; + width = (width + 3) & ~3; + + // allocate images; one to hold hit object ids, one to hold depth to + // the first interseciton + int *id = new int[width*height]; + float *image = new float[width*height]; + + // + // Run 3 iterations with ispc, record the minimum time + // + double minTimeISPC = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + raytrace(width, height, raster2camera, camera2world, + image, id, nodes, triangles); + double dt = get_elapsed_mcycles(); + minTimeISPC = std::min(dt, minTimeISPC); + } + printf("[rt ispc]:\t\t\t[%.3f] million cycles for %d x %d image\n", minTimeISPC, width, height); + + writeImage(id, image, width, height, "rt-ispc.ppm"); + + // + // And 3 iterations with the serial implementation, reporting the + // minimum time. 
+ // + double minTimeSerial = 1e30; + for (int i = 0; i < 3; ++i) { + reset_and_start_timer(); + raytrace_serial(width, height, raster2camera, camera2world, + image, id, nodes, triangles); + double dt = get_elapsed_mcycles(); + minTimeSerial = std::min(dt, minTimeSerial); + } + printf("[rt serial]:\t\t\t[%.3f] million cycles for %d x %d image\n", + minTimeSerial, width, height); + printf("\t\t\t\t(%.2fx speedup from ISPC)\n", minTimeSerial / minTimeISPC); + + writeImage(id, image, width, height, "rt-serial.ppm"); + + return 0; +} diff --git a/examples/rt/rt.ispc b/examples/rt/rt.ispc new file mode 100644 index 00000000..08dabb0e --- /dev/null +++ b/examples/rt/rt.ispc @@ -0,0 +1,273 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#define bool int + +typedef float<3> float3; + +struct Ray { + float3 origin, dir, invDir; + uniform unsigned int dirIsNeg[3]; + float mint, maxt; + int hitId; +}; + +struct Triangle { + uniform float3 p[3]; + uniform int id; +}; + +struct LinearBVHNode { + uniform float3 bounds[2]; + uniform unsigned int offset; // num primitives for leaf, second child for interior + uniform unsigned int primsAxis; // 0:7 nPrimitives, 8:15 split axis, 16:31 padding +}; + +static inline uniform int nPrims(const reference LinearBVHNode node) { + return (node.primsAxis & 0xff); +} + +static inline uniform int axis(const reference LinearBVHNode node) { + return ((node.primsAxis >> 8) & 0xff); +} + +static inline uniform bool isInterior(const reference LinearBVHNode node) { + return nPrims(node) == 0; +} + +static inline float3 Cross(const float3 v1, const float3 v2) { + float v1x = v1.x, v1y = v1.y, v1z = v1.z; + float v2x = v2.x, v2y = v2.y, v2z = v2.z; + float3 ret; + ret.x = (v1y * v2z) - (v1z * v2y); + ret.y = (v1z * v2x) - (v1x * v2z); + ret.z = (v1x * v2y) - (v1y * v2x); + return ret; +} + +static inline float Dot(const float3 a, const float3 b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} + + +static void generateRay(uniform const float raster2camera[4][4], + uniform const float camera2world[4][4], + float x, float y, reference Ray ray) { + ray.mint = 0.f; + ray.maxt = 1e30f; + + ray.hitId = 0; + + // transform raster coordinate (x, y, 0) to camera 
space + float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3]; + float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3]; + float camz = raster2camera[2][3]; + float camw = raster2camera[3][3]; + camx /= camw; + camy /= camw; + camz /= camw; + + ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz; + ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz; + ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz; + + ray.origin.x = camera2world[0][3] / camera2world[3][3]; + ray.origin.y = camera2world[1][3] / camera2world[3][3]; + ray.origin.z = camera2world[2][3] / camera2world[3][3]; + + ray.invDir = 1.f / ray.dir; + + ray.dirIsNeg[0] = any(ray.invDir.x < 0) ? 1 : 0; + ray.dirIsNeg[1] = any(ray.invDir.y < 0) ? 1 : 0; + ray.dirIsNeg[2] = any(ray.invDir.z < 0) ? 1 : 0; +} + + +static inline bool BBoxIntersect(const reference uniform float3 bounds[2], + const reference Ray ray) { + float t0 = ray.mint, t1 = ray.maxt; + + // Check all three axis-aligned slabs. 
Don't try to early out; it's + // not worth the trouble + float3 tNear = (bounds[0] - ray.origin) * ray.invDir; + float3 tFar = (bounds[1] - ray.origin) * ray.invDir; + if (tNear.x > tFar.x) { + float tmp = tNear.x; + tNear.x = tFar.x; + tFar.x = tmp; + } + t0 = max(tNear.x, t0); + t1 = min(tFar.x, t1); + + if (tNear.y > tFar.y) { + float tmp = tNear.y; + tNear.y = tFar.y; + tFar.y = tmp; + } + t0 = max(tNear.y, t0); + t1 = min(tFar.y, t1); + + if (tNear.z > tFar.z) { + float tmp = tNear.z; + tNear.z = tFar.z; + tFar.z = tmp; + } + t0 = max(tNear.z, t0); + t1 = min(tFar.z, t1); + + return (t0 <= t1); +} + + + +static inline bool TriIntersect(const reference Triangle tri, reference Ray ray) { + uniform float3 e1 = tri.p[1] - tri.p[0]; + uniform float3 e2 = tri.p[2] - tri.p[0]; + + float3 s1 = Cross(ray.dir, e2); + float divisor = Dot(s1, e1); + bool hit = true; + + if (divisor == 0.) + hit = false; + float invDivisor = 1.f / divisor; + + // Compute first barycentric coordinate + float3 d = ray.origin - tri.p[0]; + float b1 = Dot(d, s1) * invDivisor; + if (b1 < 0. || b1 > 1.) + hit = false; + + // Compute second barycentric coordinate + float3 s2 = Cross(d, e1); + float b2 = Dot(ray.dir, s2) * invDivisor; + if (b2 < 0. || b1 + b2 > 1.) 
+ hit = false; + + // Compute _t_ to intersection point + float t = Dot(e2, s2) * invDivisor; + if (t < ray.mint || t > ray.maxt) + hit = false; + + if (hit) { + ray.maxt = t; + ray.hitId = tri.id; + } + return hit; +} + + +bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[], + reference Ray r) { + Ray ray = r; + bool hit = false; + // Follow ray through BVH nodes to find primitive intersections + uniform int todoOffset = 0, nodeNum = 0; + uniform int todo[64]; + + while (true) { + // Check ray against BVH node + LinearBVHNode node = nodes[nodeNum]; + if (any(BBoxIntersect(node.bounds, ray))) { + uniform unsigned int nPrimitives = nPrims(node); + if (nPrimitives > 0) { + // Intersect ray with primitives in leaf BVH node + uniform unsigned int primitivesOffset = node.offset; + for (uniform unsigned int i = 0; i < nPrimitives; ++i) { + if (TriIntersect(tris[primitivesOffset+i], ray)) + hit = true; + } + if (todoOffset == 0) + break; + nodeNum = todo[--todoOffset]; + } + else { + // Put far BVH node on _todo_ stack, advance to near node + if (r.dirIsNeg[axis(node)]) { + todo[todoOffset++] = nodeNum + 1; + nodeNum = node.offset; + } + else { + todo[todoOffset++] = node.offset; + nodeNum = nodeNum + 1; + } + } + } + else { + if (todoOffset == 0) + break; + nodeNum = todo[--todoOffset]; + } + } + r.maxt = ray.maxt; + r.hitId = ray.hitId; + + return hit; +} + + +export void raytrace(uniform int width, uniform int height, + const uniform float raster2camera[4][4], + const uniform float camera2world[4][4], + uniform float image[], uniform int id[], + const LinearBVHNode nodes[], + const Triangle triangles[]) { + static const uniform float udx[16] = { 0, 1, 0, 1, 2, 3, 2, 3, + 0, 1, 0, 1, 2, 3, 2, 3 }; + static const uniform float udy[16] = { 0, 0, 1, 1, 0, 0, 1, 1, + 2, 2, 3, 3, 2, 2, 3, 3 }; + + // The outer loops are always over blocks of 4x4 pixels + for (uniform int y = 0; y < height; y += 4) { + for (uniform int x = 0; x < width; x += 4) { + // Now we 
have a block of 4x4=16 pixels to process; it will + // take 16/programCount iterations of this loop to process + // them. + for (uniform int o = 0; o < 16 / programCount; ++o) { + // Map program instances to samples in the udx/udy arrays + // to figure out which pixel each program instance is + // responsible for + const float dx = udx[o * programCount + programIndex]; + const float dy = udy[o * programCount + programIndex]; + + Ray ray; + generateRay(raster2camera, camera2world, x+dx, y+dy, ray); + BVHIntersect(nodes, triangles, ray); + + int offset = (y + (int)dy) * width + (x + (int)dx); + image[offset] = ray.maxt; + id[offset] = ray.hitId; + } + } + } +} diff --git a/examples/rt/rt.vcxproj b/examples/rt/rt.vcxproj new file mode 100755 index 00000000..4a893a8f --- /dev/null +++ b/examples/rt/rt.vcxproj @@ -0,0 +1,165 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9} + Win32Proj + rt + + + + Application + true + Unicode + + + Application + true + Unicode + + + Application + false + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + Document + +cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 + + +cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h + + %(Filename).obj + %(Filename).obj + +cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h 
%(Filename)_ispc.h --arch=x86 + + +cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h + + %(Filename).obj + %(Filename).obj + + + + + + + + + + diff --git a/examples/rt/rt_serial.cpp b/examples/rt/rt_serial.cpp new file mode 100644 index 00000000..53f7d4cb --- /dev/null +++ b/examples/rt/rt_serial.cpp @@ -0,0 +1,288 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifdef _MSC_VER +#define _CRT_SECURE_NO_WARNINGS +#define NOMINMAX +#pragma warning (disable: 4244) +#pragma warning (disable: 4305) +#endif + +#include + +// Just enough of a float3 class to do what we need in this file. +#ifdef _MSC_VER +__declspec(align(16)) +#endif +struct float3 { + float3() { } + float3(float xx, float yy, float zz) { x = xx; y = yy; z = zz; } + + float3 operator*(float f) const { return float3(x*f, y*f, z*f); } + float3 operator-(const float3 &f2) const { + return float3(x-f2.x, y-f2.y, z-f2.z); + } + float3 operator*(const float3 &f2) const { + return float3(x*f2.x, y*f2.y, z*f2.z); + } + float x, y, z; + float pad; // match padding/alignment of ispc version +} +#ifndef _MSC_VER +__attribute__ ((aligned(16))) +#endif +; + +struct Ray { + float3 origin, dir, invDir; + unsigned int dirIsNeg[3]; + float mint, maxt; + int hitId; +}; + + +// Declare these in a namespace so the mangling matches +namespace ispc { + struct Triangle { + float3 p[3]; + int id; + }; + + struct LinearBVHNode { + float3 bounds[2]; + unsigned int offset; // primitives for leaf, second child for interior + unsigned int primsAxis; // 0:7 nPrimitives, 8:15 split axis, 16:31 padding + }; +} + +using namespace ispc; + +inline int nPrims(const LinearBVHNode &node) { + return (node.primsAxis & 0xff); +} + +inline int axis(const LinearBVHNode &node) { + return ((node.primsAxis >> 8) & 0xff); +} + +inline bool isInterior(const LinearBVHNode &node) { + return nPrims(node) == 0; +} + +inline float3 Cross(const float3 &v1, const float3 &v2) { + float v1x = v1.x, v1y = v1.y, v1z = v1.z; + float v2x = v2.x, v2y = v2.y, v2z = v2.z; + float3 ret; + ret.x = (v1y * v2z) - (v1z * v2y); + ret.y = (v1z * v2x) - (v1x * v2z); + ret.z = (v1x * v2y) - (v1y * v2x); + return ret; +} + +inline float Dot(const float3 &a, const float3 &b) { + return a.x * b.x + a.y * b.y + a.z * b.z; +} + + +static void generateRay(const float raster2camera[4][4], + const float camera2world[4][4], + float x, 
float y, Ray &ray) { + ray.mint = 0.f; + ray.maxt = 1e30f; + + ray.hitId = 0; + + // transform raster coordinate (x, y, 0) to camera space + float camx = raster2camera[0][0] * x + raster2camera[0][1] * y + raster2camera[0][3]; + float camy = raster2camera[1][0] * x + raster2camera[1][1] * y + raster2camera[1][3]; + float camz = raster2camera[2][3]; + float camw = raster2camera[3][3]; + camx /= camw; + camy /= camw; + camz /= camw; + + ray.dir.x = camera2world[0][0] * camx + camera2world[0][1] * camy + camera2world[0][2] * camz; + ray.dir.y = camera2world[1][0] * camx + camera2world[1][1] * camy + camera2world[1][2] * camz; + ray.dir.z = camera2world[2][0] * camx + camera2world[2][1] * camy + camera2world[2][2] * camz; + + ray.origin.x = camera2world[0][3] / camera2world[3][3]; + ray.origin.y = camera2world[1][3] / camera2world[3][3]; + ray.origin.z = camera2world[2][3] / camera2world[3][3]; + + ray.invDir.x = 1.f / ray.dir.x; + ray.invDir.y = 1.f / ray.dir.y; + ray.invDir.z = 1.f / ray.dir.z; + + ray.dirIsNeg[0] = (ray.invDir.x < 0) ? 1 : 0; + ray.dirIsNeg[1] = (ray.invDir.y < 0) ? 1 : 0; + ray.dirIsNeg[2] = (ray.invDir.z < 0) ? 
1 : 0; +} + + +static inline bool BBoxIntersect(const float3 bounds[2], + const Ray &ray) { + float t0 = ray.mint, t1 = ray.maxt; + + float3 tNear = (bounds[0] - ray.origin) * ray.invDir; + float3 tFar = (bounds[1] - ray.origin) * ray.invDir; + if (tNear.x > tFar.x) { + float tmp = tNear.x; + tNear.x = tFar.x; + tFar.x = tmp; + } + t0 = std::max(tNear.x, t0); + t1 = std::min(tFar.x, t1); + + if (tNear.y > tFar.y) { + float tmp = tNear.y; + tNear.y = tFar.y; + tFar.y = tmp; + } + t0 = std::max(tNear.y, t0); + t1 = std::min(tFar.y, t1); + + if (tNear.z > tFar.z) { + float tmp = tNear.z; + tNear.z = tFar.z; + tFar.z = tmp; + } + t0 = std::max(tNear.z, t0); + t1 = std::min(tFar.z, t1); + + return (t0 <= t1); +} + + + +inline bool TriIntersect(const Triangle &tri, Ray &ray) { + float3 e1 = tri.p[1] - tri.p[0]; + float3 e2 = tri.p[2] - tri.p[0]; + + float3 s1 = Cross(ray.dir, e2); + float divisor = Dot(s1, e1); + + if (divisor == 0.) + return false; + float invDivisor = 1.f / divisor; + + // Compute first barycentric coordinate + float3 d = ray.origin - tri.p[0]; + float b1 = Dot(d, s1) * invDivisor; + if (b1 < 0. || b1 > 1.) + return false; + + // Compute second barycentric coordinate + float3 s2 = Cross(d, e1); + float b2 = Dot(ray.dir, s2) * invDivisor; + if (b2 < 0. || b1 + b2 > 1.) 
+ return false; + + // Compute _t_ to intersection point + float t = Dot(e2, s2) * invDivisor; + if (t < ray.mint || t > ray.maxt) + return false; + + ray.maxt = t; + ray.hitId = tri.id; + return true; +} + + +bool BVHIntersect(const LinearBVHNode nodes[], const Triangle tris[], + Ray &r) { + Ray ray = r; + bool hit = false; + // Follow ray through BVH nodes to find primitive intersections + int todoOffset = 0, nodeNum = 0; + int todo[64]; + + while (true) { + // Check ray against BVH node + const LinearBVHNode &node = nodes[nodeNum]; + if (BBoxIntersect(node.bounds, ray)) { + unsigned int nPrimitives = nPrims(node); + if (nPrimitives > 0) { + // Intersect ray with primitives in leaf BVH node + unsigned int primitivesOffset = node.offset; + for (unsigned int i = 0; i < nPrimitives; ++i) { + if (TriIntersect(tris[primitivesOffset+i], ray)) + hit = true; + } + if (todoOffset == 0) + break; + nodeNum = todo[--todoOffset]; + } + else { + // Put far BVH node on _todo_ stack, advance to near node + if (r.dirIsNeg[axis(node)]) { + todo[todoOffset++] = nodeNum + 1; + nodeNum = node.offset; + } + else { + todo[todoOffset++] = node.offset; + nodeNum = nodeNum + 1; + } + } + } + else { + if (todoOffset == 0) + break; + nodeNum = todo[--todoOffset]; + } + } + r.maxt = ray.maxt; + r.hitId = ray.hitId; + + return hit; +} + + +void raytrace_serial(int width, int height, + const float raster2camera[4][4], + const float camera2world[4][4], + float image[], + int id[], + const LinearBVHNode nodes[], + const Triangle triangles[]) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + Ray ray; + generateRay(raster2camera, camera2world, x, y, ray); + BVHIntersect(nodes, triangles, ray); + + int offset = y * width + x; + image[offset] = ray.maxt; + id[offset] = ray.hitId; + } + } +} diff --git a/examples/rt/sponza.bvh b/examples/rt/sponza.bvh new file mode 100644 index 00000000..e59bde24 Binary files /dev/null and b/examples/rt/sponza.bvh differ diff --git 
a/examples/rt/sponza.camera b/examples/rt/sponza.camera new file mode 100644 index 00000000..7d44ec23 Binary files /dev/null and b/examples/rt/sponza.camera differ diff --git a/examples/rt/teapot.bvh b/examples/rt/teapot.bvh new file mode 100644 index 00000000..efcd7807 Binary files /dev/null and b/examples/rt/teapot.bvh differ diff --git a/examples/rt/teapot.camera b/examples/rt/teapot.camera new file mode 100644 index 00000000..9a98e3f6 Binary files /dev/null and b/examples/rt/teapot.camera differ diff --git a/examples/simple/Makefile b/examples/simple/Makefile new file mode 100644 index 00000000..b00c6737 --- /dev/null +++ b/examples/simple/Makefile @@ -0,0 +1,25 @@ + +CXX=g++ +CXXFLAGS=-Iobjs/ -O3 -Wall +ISPC=ispc +ISPCFLAGS=-O2 + +default: simple + +.PHONY: dirs clean +.PRECIOUS: objs/simple.h + +dirs: + /bin/mkdir -p objs/ + +clean: + /bin/rm -rf objs *~ simple + +simple: dirs objs/simple.o objs/simple_ispc.o + $(CXX) $(CXXFLAGS) -o $@ objs/simple.o objs/simple_ispc.o + +objs/simple.o: simple.cpp objs/simple_ispc.h + $(CXX) $(CXXFLAGS) -c -o $@ $< + +objs/%_ispc.h objs/%_ispc.o: %.ispc + $(ISPC) $(ISPCFLAGS) $< -o objs/$*_ispc.o -h objs/$*_ispc.h diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp new file mode 100644 index 00000000..3b5bf028 --- /dev/null +++ b/examples/simple/simple.cpp @@ -0,0 +1,63 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#include + +// Include the header file that the ispc compiler generates +#include "simple_ispc.h" +using namespace ispc; + +int main() { + // Pointers passed to ispc-compiled code are currently required to have + // alignment equal to the target's native vector size. Here we align + // to 32 bytes to be safe for both SSE and AVX targets. 
+#ifdef _MSC_VER + __declspec(align(32)) float vin[16], vout[16]; +#else + float vin[16] __attribute__((aligned(32))); + float vout[16] __attribute__((aligned(32))); +#endif + + // Initialize input buffer + for (int i = 0; i < 16; ++i) + vin[i] = (float)i; + + // Call simple() function from simple.ispc file + simple(vin, vout, 16); + + // Print results + for (int i = 0; i < 16; ++i) + printf("%d: simple(%f) = %f\n", i, vin[i], vout[i]); + + return 0; +} diff --git a/examples/simple/simple.ispc b/examples/simple/simple.ispc new file mode 100644 index 00000000..a44c29e5 --- /dev/null +++ b/examples/simple/simple.ispc @@ -0,0 +1,53 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + + +export void simple(uniform float vin[], uniform float vout[], + uniform int count) { + // Compute the result for 'programCount' values in parallel + for (uniform int i = 0; i < count; i += programCount) { + int index = i + programIndex; + // Load the appropriate input value for this program instance. + float v = vin[index]; + + // Do an arbitrary little computation, but at least make the + // computation dependent on the value being processed + if (v < 3.) + v = v * v; + else + v = sqrt(v); + + // And write the result to the output array. 
+ vout[index] = v; + } +} diff --git a/examples/simple/simple.vcxproj b/examples/simple/simple.vcxproj new file mode 100755 index 00000000..9723ed02 --- /dev/null +++ b/examples/simple/simple.vcxproj @@ -0,0 +1,164 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + + + + + Document + +cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 + + +cl /E /TP %(Filename).ispc | ispc -O2 -o %(Filename).obj -h %(Filename)_ispc.h + + %(Filename).obj + %(Filename).obj + +cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 + + +cl /E /TP %(Filename).ispc | ispc -O2 - -o %(Filename).obj -h %(Filename)_ispc.h + + %(Filename).obj + %(Filename).obj + + + + {947C5311-8B78-4D05-BEE4-BCF342D4B367} + Win32Proj + simple + + + + Application + true + Unicode + + + Application + true + Unicode + + + Application + false + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + + + Level3 + Disabled + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + Level3 + + + MaxSpeed + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + + + Console + true + true + true + + + + + + diff --git a/examples/timing.h b/examples/timing.h new file mode 100644 index 00000000..a51ab372 --- /dev/null +++ b/examples/timing.h @@ -0,0 +1,67 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include + + +#ifdef WIN32 +#include +#define rdtsc __rdtsc +#else +extern "C" { + __inline__ uint64_t rdtsc() { + uint32_t low, high; + __asm__ __volatile__ ( + "xorl %%eax,%%eax \n cpuid" + ::: "%rax", "%rbx", "%rcx", "%rdx" ); + __asm__ __volatile__ ( + "rdtsc" : "=a" (low), "=d" (high)); + return (uint64_t)high << 32 | low; + } +} +#endif + +static uint64_t start, end; + +static inline void reset_and_start_timer() +{ + start = rdtsc(); +} + +/* Returns the number of millions of elapsed processor cycles since the + last reset_and_start_timer() call. */ +static inline double get_elapsed_mcycles() +{ + end = rdtsc(); + return (end-start) / (1024. * 1024.); +} diff --git a/expr.cpp b/expr.cpp new file mode 100644 index 00000000..53b07bdf --- /dev/null +++ b/expr.cpp @@ -0,0 +1,4519 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/** @file expr.cpp + @brief Implementations of expression classes +*/ + +#include "expr.h" +#include "type.h" +#include "sym.h" +#include "ctx.h" +#include "module.h" +#include "util.h" +#include "llvmutil.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +///////////////////////////////////////////////////////////////////////////////////// +// Expr + +llvm::Value * +Expr::GetLValue(FunctionEmitContext *ctx) const { + // Expressions that can't provide an lvalue can just return NULL + return NULL; +} + + +llvm::Constant * +Expr::GetConstant(const Type *type) const { + // The default is failure; just return NULL + return NULL; +} + + +Symbol * +Expr::GetBaseSymbol() const { + // Not all expressions can do this, so provide a generally-useful + // default + return NULL; +} + + +/** If a conversion from 'fromAtomicType' to 'toAtomicType' may cause lost + precision, issue a warning. Don't warn for conversions to bool and + conversions between signed and unsigned integers of the same size. 
+ */ +static void +lMaybeIssuePrecisionWarning(const AtomicType *toAtomicType, + const AtomicType *fromAtomicType, + SourcePos pos, const char *errorMsgBase) { + switch (toAtomicType->basicType) { + case AtomicType::TYPE_BOOL: + case AtomicType::TYPE_INT32: + case AtomicType::TYPE_UINT32: + case AtomicType::TYPE_FLOAT: + case AtomicType::TYPE_INT64: + case AtomicType::TYPE_UINT64: + case AtomicType::TYPE_DOUBLE: + if ((int)toAtomicType->basicType < (int)fromAtomicType->basicType && + toAtomicType->basicType != AtomicType::TYPE_BOOL && + !(toAtomicType->basicType == AtomicType::TYPE_INT32 && + fromAtomicType->basicType == AtomicType::TYPE_UINT32) && + !(toAtomicType->basicType == AtomicType::TYPE_INT64 && + fromAtomicType->basicType == AtomicType::TYPE_UINT64)) + Warning(pos, "Conversion from type \"%s\" to type \"%s\" for %s" + " may lose information.", + fromAtomicType->GetString().c_str(), toAtomicType->GetString().c_str(), + errorMsgBase); + break; + default: + FATAL("logic error in lMaybeIssuePrecisionWarning"); + } +} + + +Expr * +Expr::TypeConv(const Type *toType, const char *errorMsgBase, bool failureOk) { + /* This function is way too long and complex. Is type conversion stuff + always this messy, or can this be cleaned up somehow? 
*/ + assert(failureOk || errorMsgBase != NULL); + + const Type *fromType = GetType(); + if (toType == NULL || fromType == NULL) + return this; + + // The types are equal; there's nothing to do + if (Type::Equal(toType, fromType)) + return this; + + if (fromType == AtomicType::Void) { + if (!failureOk) + Error(pos, "Can't convert from \"void\" to \"%s\" for %s.", + toType->GetString().c_str(), errorMsgBase); + return NULL; + } + + if (toType == AtomicType::Void) { + if (!failureOk) + Error(pos, "Can't convert type \"%s\" to \"void\" for %s.", + fromType->GetString().c_str(), errorMsgBase); + return NULL; + } + + if (toType->IsUniformType() && fromType->IsVaryingType()) { + if (!failureOk) + Error(pos, "Can't convert from varying type \"%s\" to uniform " + "type \"%s\" for %s.", fromType->GetString().c_str(), + toType->GetString().c_str(), errorMsgBase); + return NULL; + } + + // Convert from type T -> const T; just return a TypeCast expr, which + // can handle this + if (Type::Equal(toType, fromType->GetAsConstType())) + return new TypeCastExpr(toType, this, pos); + + if (dynamic_cast(fromType)) { + if (dynamic_cast(toType)) { + // Convert from a reference to a type to a const reference to a type; + // this is handled by TypeCastExpr + if (Type::Equal(toType->GetReferenceTarget(), + fromType->GetReferenceTarget()->GetAsConstType())) + return new TypeCastExpr(toType, this, pos); +#if 0 + // FIXME: why is this commented out?? 
+ else { + Error(pos, "Can't convert between incompatible reference types \"%s\" " + "and \"%s\".", fromType->GetString().c_str(), + toType->GetString().c_str()); + return NULL; + } +#endif + } + else { + // convert from a reference T -> T + Expr *fromExpr = new DereferenceExpr(this, pos); + if (fromExpr->GetType() == NULL) + return NULL; + return fromExpr->TypeConv(toType, errorMsgBase, failureOk); + } + } + else if (dynamic_cast(toType)) { + // T -> reference T + Expr *fromExpr = new ReferenceExpr(this, pos); + if (fromExpr->GetType() == NULL) + return NULL; + return fromExpr->TypeConv(toType, errorMsgBase, failureOk); + } + else if (Type::Equal(toType, fromType->GetAsNonConstType())) + // convert: const T -> T (as long as T isn't a reference) + return new TypeCastExpr(toType, this, pos); + + fromType = fromType->GetReferenceTarget(); + toType = toType->GetReferenceTarget(); + // I don't think this is necessary +//CO if (Type::Equal(toType, fromType)) +//CO return fromExpr; + + const ArrayType *toArrayType = dynamic_cast(toType); + const ArrayType *fromArrayType = dynamic_cast(fromType); + if (toArrayType && fromArrayType) { + if (Type::Equal(toArrayType->GetElementType(), fromArrayType->GetElementType())) { + // the case of different element counts should have returned + // out earlier, yes?? 
+ assert(toArrayType->GetElementCount() != fromArrayType->GetElementCount()); + return new TypeCastExpr(new ReferenceType(toType, false), this, pos); + } + else if (Type::Equal(toArrayType->GetElementType(), + fromArrayType->GetElementType()->GetAsConstType())) { + // T[x] -> const T[x] + return new TypeCastExpr(new ReferenceType(toType, false), this, pos); + } + else { + if (!failureOk) + Error(pos, "Array type \"%s\" can't be converted to type \"%s\" for %s.", + fromType->GetString().c_str(), toType->GetString().c_str(), + errorMsgBase); + return NULL; + } + } + + const VectorType *toVectorType = dynamic_cast(toType); + const VectorType *fromVectorType = dynamic_cast(fromType); + if (toVectorType && fromVectorType) { + // converting e.g. int -> float + if (fromVectorType->GetElementCount() != toVectorType->GetElementCount()) { + if (!failureOk) + Error(pos, "Can't convert between differently sized vector types " + "\"%s\" -> \"%s\" for %s.", fromType->GetString().c_str(), + toType->GetString().c_str(), errorMsgBase); + return NULL; + } + return new TypeCastExpr(toType, this, pos); + } + + const StructType *toStructType = dynamic_cast(toType); + const StructType *fromStructType = dynamic_cast(fromType); + if (toStructType && fromStructType) { + if (!Type::Equal(toStructType->GetAsUniformType()->GetAsConstType(), + fromStructType->GetAsUniformType()->GetAsConstType())) { + if (!failureOk) + Error(pos, "Can't convert between different struct types " + "\"%s\" -> \"%s\".", fromStructType->GetString().c_str(), + toStructType->GetString().c_str()); + return NULL; + } + + return new TypeCastExpr(toType, this, pos); + } + + // from here on out, the from type can only be atomic something or + // other... 
+ const AtomicType *fromAtomicType = dynamic_cast(fromType); + if (fromAtomicType == NULL) { + if (!failureOk) + Error(pos, "Type conversion only possible from atomic types, not " + "from \"%s\" to \"%s\", for %s.", fromType->GetString().c_str(), + toType->GetString().c_str(), errorMsgBase); + return NULL; + } + + // scalar -> short-vector conversions + if (toVectorType != NULL) + return new TypeCastExpr(toType, this, pos); + + // ok, it better be a scalar->scalar conversion of some sort by now + const AtomicType *toAtomicType = dynamic_cast(toType); + if (toAtomicType == NULL) { + if (!failureOk) + Error(pos, "Type conversion only possible to atomic types, not " + "from \"%s\" to \"%s\", for %s.", + fromType->GetString().c_str(), toType->GetString().c_str(), + errorMsgBase); + return NULL; + } + + if (!failureOk) + lMaybeIssuePrecisionWarning(toAtomicType, fromAtomicType, pos, + errorMsgBase); + + return new TypeCastExpr(toType, this, pos); +} + + +/////////////////////////////////////////////////////////////////////////// + +/** Given an atomic or vector type, this returns a boolean type with the + same "shape". In other words, if the given type is a vector type of + three uniform ints, the returned type is a vector type of three uniform + bools. */ +static const Type * +lMatchingBoolType(const Type *type) { + bool uniformTest = type->IsUniformType(); + const AtomicType *boolBase = uniformTest ? 
AtomicType::UniformBool : + AtomicType::VaryingBool; + const VectorType *vt = dynamic_cast(type); + if (vt != NULL) + return new VectorType(boolBase, vt->GetElementCount()); + else { + assert(dynamic_cast(type) != NULL); + return boolBase; + } +} + +/////////////////////////////////////////////////////////////////////////// +// UnaryExpr + +static llvm::Constant * +lLLVMConstantValue(const Type *type, llvm::LLVMContext *ctx, double value) { + const AtomicType *atomicType = dynamic_cast(type); + const VectorType *vectorType = dynamic_cast(type); + + // This function is only called with, and only works for atomic and + // vector types. + assert(atomicType != NULL || vectorType != NULL); + + if (atomicType) { + // If it's an atomic type, then figure out which of the llvmutil.h + // functions to call to get the corresponding constant and then + // call it... + bool isUniform = type->IsUniformType(); + switch (atomicType->basicType) { + case AtomicType::TYPE_VOID: + FATAL("can't get constant value for void type"); + return NULL; + case AtomicType::TYPE_BOOL: + if (isUniform) + return (value != 0.) ? LLVMTrue : LLVMFalse; + else + return LLVMBoolVector(value != 0.); + case AtomicType::TYPE_UINT32: { + unsigned int i = (unsigned int)value; + return isUniform ? LLVMUInt32(i) : LLVMUInt32Vector(i); + } + case AtomicType::TYPE_INT32: { + int i = (int)value; + assert((double)i == value); + return isUniform ? LLVMInt32(i) : LLVMInt32Vector(i); + } + case AtomicType::TYPE_FLOAT: + return isUniform ? LLVMFloat((float)value) : + LLVMFloatVector((float)value); + case AtomicType::TYPE_UINT64: { + uint64_t i = (uint64_t)value; + assert(value == (int64_t)i); + return isUniform ? LLVMUInt64(i) : LLVMUInt64Vector(i); + } + case AtomicType::TYPE_INT64: { + int64_t i = (int64_t)value; + assert((double)i == value); + return isUniform ? LLVMInt64(i) : LLVMInt64Vector(i); + } + case AtomicType::TYPE_DOUBLE: + return isUniform ? 
LLVMDouble(value) : LLVMDoubleVector(value); + default: + FATAL("logic error in lLLVMConstantValue"); + return NULL; + } + } + + // For vector types, first get the LLVM constant for the basetype with + // a recursive call to lLLVMConstantValue(). + const Type *baseType = vectorType->GetBaseType(); + llvm::Constant *constElement = lLLVMConstantValue(baseType, ctx, value); + const llvm::Type *llvmVectorType = vectorType->LLVMType(ctx); + + // Now create a constant version of the corresponding LLVM type that we + // use to represent the VectorType. + // FIXME: this is a little ugly in that the fact that ispc represents + // uniform VectorTypes as LLVM VectorTypes and varying VectorTypes as + // LLVM ArrayTypes leaks into the code here; it feels like this detail + // should be better encapsulated? + if (baseType->IsUniformType()) { + const llvm::VectorType *lvt = + llvm::dyn_cast(llvmVectorType); + assert(lvt != NULL); + std::vector vals; + for (unsigned int i = 0; i < lvt->getNumElements(); ++i) + vals.push_back(constElement); + return llvm::ConstantVector::get(lvt, vals); + } + else { + const llvm::ArrayType *lat = + llvm::dyn_cast(llvmVectorType); + assert(lat != NULL); + std::vector vals; + for (unsigned int i = 0; i < lat->getNumElements(); ++i) + vals.push_back(constElement); + return llvm::ConstantArray::get(lat, vals); + } +} + + +/** Utility routine to emit code to do a {pre,post}-{inc,dec}rement of the + given expresion. 
+ */ +static llvm::Value * +lEmitPrePostIncDec(UnaryExpr::Op op, Expr *expr, SourcePos pos, + FunctionEmitContext *ctx) { + const Type *type = expr->GetType(); + + // Get both the lvalue and the rvalue of the given expression + llvm::Value *lvalue = NULL, *rvalue = NULL; + if (dynamic_cast(type) != NULL) { + type = type->GetReferenceTarget(); + lvalue = expr->GetValue(ctx); + + Expr *deref = new DereferenceExpr(expr, expr->pos); + rvalue = deref->GetValue(ctx); + } + else { + lvalue = expr->GetLValue(ctx); + rvalue = expr->GetValue(ctx); + } + + if (lvalue == NULL) { + // If we can't get a lvalue, then we have an error here + Error(expr->pos, "Can't %s-%s non-lvalues.", + (op == UnaryExpr::PreInc || op == UnaryExpr::PreDec) ? "pre" : "post", + (op == UnaryExpr::PreInc || op == UnaryExpr::PostInc) ? "increment" : "decrement"); + return NULL; + } + + // Emit code to do the appropriate addition/subtraction to the + // expression's old value + ctx->SetDebugPos(pos); + llvm::Value *binop = NULL; + int delta = (op == UnaryExpr::PreInc || op == UnaryExpr::PostInc) ? 1 : -1; + llvm::Constant *dval = lLLVMConstantValue(type, g->ctx, delta); + if (!type->IsFloatType()) + binop = ctx->BinaryOperator(llvm::Instruction::Add, rvalue, + dval, "val_inc_or_dec"); + else + binop = ctx->BinaryOperator(llvm::Instruction::FAdd, rvalue, + dval, "val_inc_or_dec"); + +#if 0 + if (type->IsUniformType()) { + if (ctx->VaryingCFDepth() > 0) + Warning(expr->pos, + "Modifying \"uniform\" value under \"varying\" control flow. Beware."); + } +#endif + + // And store the result out to the lvalue + ctx->StoreInst(binop, lvalue, ctx->GetMask(), type); + + // And then if it's a pre increment/decrement, return the final + // computed result; otherwise return the previously-grabbed expression + // value. + return (op == UnaryExpr::PreInc || op == UnaryExpr::PreDec) ? binop : rvalue; +} + + + +/** Utility routine to emit code to negate the given expression. 
+ */ +static llvm::Value * +lEmitNegate(Expr *arg, SourcePos pos, FunctionEmitContext *ctx) { + const Type *type = arg->GetType(); + llvm::Value *argVal = arg->GetValue(ctx); + if (type == NULL || argVal == NULL) + return NULL; + + // Negate by subtracting from zero... + llvm::Value *zero = lLLVMConstantValue(type, g->ctx, 0.); + ctx->SetDebugPos(pos); + if (type->IsFloatType()) + return ctx->BinaryOperator(llvm::Instruction::FSub, zero, argVal, "fnegate"); + else { + assert(type->IsIntType()); + return ctx->BinaryOperator(llvm::Instruction::Sub, zero, argVal, "fnegate"); + } +} + + +UnaryExpr::UnaryExpr(Op o, Expr *e, SourcePos p) + : Expr(p), op(o) { + expr = e; +} + + +llvm::Value * +UnaryExpr::GetValue(FunctionEmitContext *ctx) const { + if (expr == NULL) + return NULL; + + ctx->SetDebugPos(pos); + + switch (op) { + case PreInc: + case PreDec: + case PostInc: + case PostDec: + return lEmitPrePostIncDec(op, expr, pos, ctx); + case Negate: + return lEmitNegate(expr, pos, ctx); + case LogicalNot: { + llvm::Value *argVal = expr->GetValue(ctx); + return ctx->NotOperator(argVal, "logicalnot"); + } + case BitNot: { + llvm::Value *argVal = expr->GetValue(ctx); + return ctx->NotOperator(argVal, "bitnot"); + } + default: + FATAL("logic error"); + return NULL; + } +} + + +const Type * +UnaryExpr::GetType() const { + if (expr == NULL) + return NULL; + + const Type *type = expr->GetType(); + if (type == NULL) + return NULL; + + // For all unary expressions besides logical not, the returned type is + // the same as the source type. Logical not always returns a bool + // type, with the same shape as the input type. 
+ switch (op) { + case PreInc: + case PreDec: + case PostInc: + case PostDec: + case Negate: + case BitNot: + return type; + case LogicalNot: + return lMatchingBoolType(type); + default: + FATAL("error"); + return NULL; + } +} + + +Expr * +UnaryExpr::Optimize() { + if (!expr) + return NULL; + + expr = expr->Optimize(); + + ConstExpr *constExpr = dynamic_cast(expr); + // If the operand isn't a constant, then we can't do any optimization + // here... + if (constExpr == NULL) + return this; + + const Type *type = constExpr->GetType(); + + if (type == AtomicType::UniformInt64 || + type == AtomicType::VaryingInt64 || + type == AtomicType::UniformUInt64 || + type == AtomicType::VaryingUInt64 || + type == AtomicType::UniformConstInt64 || + type == AtomicType::VaryingConstInt64 || + type == AtomicType::UniformConstUInt64 || + type == AtomicType::VaryingConstUInt64) + // FIXME: should handle these at some point; for now we only do + // constant folding for bool, int32 and float types... + return this; + + switch (op) { + case PreInc: + case PreDec: + case PostInc: + case PostDec: + // this shouldn't happen--it's illegal to modify a contant value.. + // An error will be issued elsewhere... + return this; + case Negate: { + // Since we currently only handle int32 and floats here, it's safe + // to stuff whatever we have into a double, do the negate as a + // double, and then return a ConstExpr with the same type as the + // original... 
+ double v[ISPC_MAX_NVEC]; + int count = constExpr->AsDouble(v); + for (int i = 0; i < count; ++i) + v[i] = -v[i]; + return new ConstExpr(constExpr, v); + } + case BitNot: { + if (type == AtomicType::UniformInt32 || + type == AtomicType::VaryingInt32 || + type == AtomicType::UniformConstInt32 || + type == AtomicType::VaryingConstInt32) { + int32_t v[ISPC_MAX_NVEC]; + int count = constExpr->AsInt32(v); + for (int i = 0; i < count; ++i) + v[i] = ~v[i]; + return new ConstExpr(type, v, pos); + } + else if (type == AtomicType::UniformUInt32 || + type == AtomicType::VaryingUInt32 || + type == AtomicType::UniformConstUInt32 || + type == AtomicType::VaryingConstUInt32) { + uint32_t v[ISPC_MAX_NVEC]; + int count = constExpr->AsUInt32(v); + for (int i = 0; i < count; ++i) + v[i] = ~v[i]; + return new ConstExpr(type, v, pos); + } + else + FATAL("unexpected type in UnaryExpr::Optimize() / BitNot case"); + } + case LogicalNot: { + assert(type == AtomicType::UniformBool || + type == AtomicType::VaryingBool || + type == AtomicType::UniformConstBool || + type == AtomicType::VaryingConstBool); + bool v[ISPC_MAX_NVEC]; + int count = constExpr->AsBool(v); + for (int i = 0; i < count; ++i) + v[i] = !v[i]; + return new ConstExpr(type, v, pos); + } + default: + FATAL("unexpected op in UnaryExpr::Optimize()"); + return NULL; + } +} + + +Expr * +UnaryExpr::TypeCheck() { + if (expr != NULL) + expr = expr->TypeCheck(); + if (expr == NULL) + // something went wrong in type checking... 
+ return NULL; + + const Type *type = expr->GetType(); + if (type == NULL) + return NULL; + + if (op == PreInc || op == PreDec || op == PostInc || op == PostDec) { + if (!type->IsNumericType()) { + Error(expr->pos, "Can only pre/post increment float and integer " + "types, not \"%s\".", type->GetString().c_str()); + return NULL; + } + return this; + } + + // don't do this for pre/post increment/decrement + if (dynamic_cast(type)) { + expr = new DereferenceExpr(expr, pos); + type = expr->GetType(); + } + + if (op == Negate) { + if (!type->IsNumericType()) { + Error(expr->pos, "Negate not allowed for non-numeric type \"%s\".", + type->GetString().c_str()); + return NULL; + } + } + else if (op == LogicalNot) { + const Type *boolType = lMatchingBoolType(type); + expr = expr->TypeConv(boolType, "logical not"); + if (!expr) + return NULL; + } + else if (op == BitNot) { + if (!type->IsIntType()) { + Error(expr->pos, "~ operator can only be used with integer types, " + "not \"%s\".", type->GetString().c_str()); + return NULL; + } + } + return this; +} + + +void +UnaryExpr::Print() const { + if (!expr || !GetType()) + return; + + printf("[ %s ] (", GetType()->GetString().c_str()); + if (op == PreInc) printf("++"); + if (op == PreDec) printf("--"); + if (op == Negate) printf("-"); + if (op == LogicalNot) printf("!"); + if (op == BitNot) printf("~"); + printf("("); + expr->Print(); + printf(")"); + if (op == PostInc) printf("++"); + if (op == PostDec) printf("--"); + printf(")"); + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// BinaryExpr + +static const char * +lOpString(BinaryExpr::Op op) { + switch (op) { + case BinaryExpr::Add: return "+"; + case BinaryExpr::Sub: return "-"; + case BinaryExpr::Mul: return "*"; + case BinaryExpr::Div: return "/"; + case BinaryExpr::Mod: return "%"; + case BinaryExpr::Shl: return "<<"; + case BinaryExpr::Shr: return ">>"; + case BinaryExpr::Lt: return "<"; + case BinaryExpr::Gt: return 
">"; + case BinaryExpr::Le: return "<="; + case BinaryExpr::Ge: return ">="; + case BinaryExpr::Equal: return "=="; + case BinaryExpr::NotEqual: return "!="; + case BinaryExpr::BitAnd: return "&"; + case BinaryExpr::BitXor: return "^"; + case BinaryExpr::BitOr: return "|"; + case BinaryExpr::LogicalAnd: return "&&"; + case BinaryExpr::LogicalOr: return "||"; + case BinaryExpr::Comma: return ","; + default: + FATAL("unimplemented case in lOpString()"); + return ""; + } +} + + +/** Utility routine to emit the binary bitwise operator corresponding to + the given BinaryExpr::Op. +*/ +static llvm::Value * +lEmitBinaryBitOp(BinaryExpr::Op op, llvm::Value *arg0Val, + llvm::Value *arg1Val, FunctionEmitContext *ctx) { + llvm::Instruction::BinaryOps inst; + switch (op) { + case BinaryExpr::Shl: inst = llvm::Instruction::Shl; break; + case BinaryExpr::Shr: inst = llvm::Instruction::AShr; break; + case BinaryExpr::BitAnd: inst = llvm::Instruction::And; break; + case BinaryExpr::BitXor: inst = llvm::Instruction::Xor; break; + case BinaryExpr::BitOr: inst = llvm::Instruction::Or; break; + default: + FATAL("logic error in lEmitBinaryBitOp()"); + return NULL; + } + + return ctx->BinaryOperator(inst, arg0Val, arg1Val, "bitop"); +} + + +/** Utility routine to emit binary arithmetic operator based on the given + BinaryExpr::Op. +*/ +static llvm::Value * +lEmitBinaryArith(BinaryExpr::Op op, llvm::Value *e0Val, llvm::Value *e1Val, + const Type *type, FunctionEmitContext *ctx, SourcePos pos) { + llvm::Instruction::BinaryOps inst; + bool isFloatOp = type->IsFloatType(); + bool isUnsignedOp = type->IsUnsignedType(); + + switch (op) { + case BinaryExpr::Add: + inst = isFloatOp ? llvm::Instruction::FAdd : llvm::Instruction::Add; + break; + case BinaryExpr::Sub: + inst = isFloatOp ? llvm::Instruction::FSub : llvm::Instruction::Sub; + break; + case BinaryExpr::Mul: + inst = isFloatOp ? 
llvm::Instruction::FMul : llvm::Instruction::Mul; + break; + case BinaryExpr::Div: + if (type->IsVaryingType() && !isFloatOp) + PerformanceWarning(pos, "Division with varying integer types is " + "very inefficient."); + inst = isFloatOp ? llvm::Instruction::FDiv : + (isUnsignedOp ? llvm::Instruction::UDiv : llvm::Instruction::SDiv); + break; + case BinaryExpr::Mod: + if (type->IsVaryingType() && !isFloatOp) + PerformanceWarning(pos, "Modulus operator with varying types is " + "very inefficient."); + inst = isFloatOp ? llvm::Instruction::FRem : + (isUnsignedOp ? llvm::Instruction::URem : llvm::Instruction::SRem); + break; + default: + FATAL("Invalid op type passed to lEmitBinaryArith()"); + return NULL; + } + + return ctx->BinaryOperator(inst, e0Val, e1Val, "binop"); +} + + +/** Utility routine to emit a binary comparison operator based on the given + BinaryExpr::Op. + */ +static llvm::Value * +lEmitBinaryCmp(BinaryExpr::Op op, llvm::Value *e0Val, llvm::Value *e1Val, + const Type *type, FunctionEmitContext *ctx, SourcePos pos) { + bool isFloatOp = type->IsFloatType(); + bool isUnsignedOp = type->IsUnsignedType(); + + llvm::CmpInst::Predicate pred; + switch (op) { + case BinaryExpr::Lt: + pred = isFloatOp ? llvm::CmpInst::FCMP_OLT : + (isUnsignedOp ? llvm::CmpInst::ICMP_ULT : llvm::CmpInst::ICMP_SLT); + break; + case BinaryExpr::Gt: + pred = isFloatOp ? llvm::CmpInst::FCMP_OGT : + (isUnsignedOp ? llvm::CmpInst::ICMP_UGT : llvm::CmpInst::ICMP_SGT); + break; + case BinaryExpr::Le: + pred = isFloatOp ? llvm::CmpInst::FCMP_OLE : + (isUnsignedOp ? llvm::CmpInst::ICMP_ULE : llvm::CmpInst::ICMP_SLE); + break; + case BinaryExpr::Ge: + pred = isFloatOp ? llvm::CmpInst::FCMP_OGE : + (isUnsignedOp ? llvm::CmpInst::ICMP_UGE : llvm::CmpInst::ICMP_SGE); + break; + case BinaryExpr::Equal: + pred = isFloatOp ? llvm::CmpInst::FCMP_OEQ : llvm::CmpInst::ICMP_EQ; + break; + case BinaryExpr::NotEqual: + pred = isFloatOp ? 
llvm::CmpInst::FCMP_ONE : llvm::CmpInst::ICMP_NE; + break; + default: + FATAL("error in lEmitBinaryCmp()"); + return NULL; + } + + llvm::Value *cmp = ctx->CmpInst(isFloatOp ? llvm::Instruction::FCmp : + llvm::Instruction::ICmp, + pred, e0Val, e1Val, "bincmp"); + // This is a little ugly: CmpInst returns i1 values, but we use vectors + // of i32s for varying bool values; type convert the result here if + // needed. + if (type->IsVaryingType()) + cmp = ctx->I1VecToBoolVec(cmp); + + return cmp; +} + + +BinaryExpr::BinaryExpr(Op o, Expr *a, Expr *b, SourcePos p) + : Expr(p), op(o) { + arg0 = a; + arg1 = b; +} + + +llvm::Value * +BinaryExpr::GetValue(FunctionEmitContext *ctx) const { + if (!arg0 || !arg1) + return NULL; + + llvm::Value *e0Val = arg0->GetValue(ctx); + llvm::Value *e1Val = arg1->GetValue(ctx); + ctx->SetDebugPos(pos); + + switch (op) { + case Add: + case Sub: + case Mul: + case Div: + case Mod: + return lEmitBinaryArith(op, e0Val, e1Val, arg0->GetType(), ctx, pos); + case Lt: + case Gt: + case Le: + case Ge: + case Equal: + case NotEqual: + return lEmitBinaryCmp(op, e0Val, e1Val, arg0->GetType(), ctx, pos); + case Shl: + case Shr: + case BitAnd: + case BitXor: + case BitOr: { + if (op == Shr && arg1->GetType()->IsVaryingType() && + dynamic_cast(arg1) == NULL) + PerformanceWarning(pos, "Shift right is extremely inefficient for " + "varying shift amounts."); + return lEmitBinaryBitOp(op, e0Val, e1Val, ctx); + } + case LogicalAnd: + return ctx->BinaryOperator(llvm::Instruction::And, e0Val, e1Val, + "logical_and"); + case LogicalOr: + return ctx->BinaryOperator(llvm::Instruction::Or, e0Val, e1Val, + "logical_or"); + case Comma: + return e1Val; + default: + FATAL("logic error"); + return NULL; + } +} + + +const Type * +BinaryExpr::GetType() const { + if (arg0 == NULL || arg1 == NULL) + return NULL; + + const Type *type0 = arg0->GetType(), *type1 = arg1->GetType(); + if (type0 == NULL || type1 == NULL) + return NULL; + + if (!type0->IsBoolType() && 
!type0->IsNumericType()) { + Error(arg0->pos, "First operand to binary operator \"%s\" is of invalid " + "type \"%s\".", lOpString(op), type0->GetString().c_str()); + return NULL; + } + if (!type1->IsBoolType() && !type1->IsNumericType()) { + Error(arg1->pos, + "Second operand to binary operator \"%s\" is of invalid " + "type \"%s\".", lOpString(op), type1->GetString().c_str()); + return NULL; + } + + const Type *promotedType = Type::MoreGeneralType(type0, type1, pos, + lOpString(op)); + // I don't think that MoreGeneralType should be able to fail after the + // type checks above. + assert(promotedType != NULL); + + switch (op) { + case Add: + case Sub: + case Mul: + case Div: + case Mod: + return promotedType; + case Lt: + case Gt: + case Le: + case Ge: + case Equal: + case NotEqual: + case LogicalAnd: + case LogicalOr: + return lMatchingBoolType(promotedType); + case Shl: + case Shr: + case BitAnd: + case BitXor: + case BitOr: + return promotedType; + case Comma: + return arg1->GetType(); + default: + FATAL("logic error in BinaryExpr::GetType()"); + return NULL; + } +} + + +#define FOLD_OP(O, E) \ + case O: \ + for (int i = 0; i < count; ++i) \ + result[i] = (v0[i] E v1[i]); \ + break + +/** Constant fold the binary integer operations that aren't also applicable + to floating-point types. +*/ +template static ConstExpr * +lConstFoldBinIntOp(BinaryExpr::Op op, const T *v0, const T *v1, ConstExpr *carg0) { + T result[ISPC_MAX_NVEC]; + int count = carg0->Count(); + + switch (op) { + FOLD_OP(BinaryExpr::Mod, %); + FOLD_OP(BinaryExpr::Shl, <<); + FOLD_OP(BinaryExpr::Shr, >>); + FOLD_OP(BinaryExpr::BitAnd, &); + FOLD_OP(BinaryExpr::BitXor, ^); + FOLD_OP(BinaryExpr::BitOr, |); + default: + return NULL; + } + + return new ConstExpr(carg0->GetType(), result, carg0->pos); +} + + +/** Constant fold the binary logical ops. 
+ */ +template static ConstExpr * +lConstFoldBinLogicalOp(BinaryExpr::Op op, const T *v0, const T *v1, ConstExpr *carg0) { + bool result[ISPC_MAX_NVEC]; + int count = carg0->Count(); + + switch (op) { + FOLD_OP(BinaryExpr::Lt, <); + FOLD_OP(BinaryExpr::Gt, >); + FOLD_OP(BinaryExpr::Le, <=); + FOLD_OP(BinaryExpr::Ge, >=); + FOLD_OP(BinaryExpr::Equal, ==); + FOLD_OP(BinaryExpr::NotEqual, !=); + FOLD_OP(BinaryExpr::LogicalAnd, &&); + FOLD_OP(BinaryExpr::LogicalOr, ||); + default: + return NULL; + } + + const Type *rType = carg0->GetType()->IsUniformType() ? AtomicType::UniformBool : + AtomicType::VaryingBool; + return new ConstExpr(rType, result, carg0->pos); +} + + +/** Constant fold binary arithmetic ops. + */ +template static ConstExpr * +lConstFoldBinArithOp(BinaryExpr::Op op, const T *v0, const T *v1, ConstExpr *carg0) { + T result[ISPC_MAX_NVEC]; + int count = carg0->Count(); + + switch (op) { + FOLD_OP(BinaryExpr::Add, +); + FOLD_OP(BinaryExpr::Sub, -); + FOLD_OP(BinaryExpr::Mul, *); + FOLD_OP(BinaryExpr::Div, /); + default: + return NULL; + } + + return new ConstExpr(carg0->GetType(), result, carg0->pos); +} + + +/** Constant fold the various boolean binary ops. 
+ */ +static ConstExpr * +lConstFoldBoolBinOp(BinaryExpr::Op op, const bool *v0, const bool *v1, + ConstExpr *carg0) { + bool result[ISPC_MAX_NVEC]; + int count = carg0->Count(); + + switch (op) { + FOLD_OP(BinaryExpr::BitAnd, &); + FOLD_OP(BinaryExpr::BitXor, ^); + FOLD_OP(BinaryExpr::BitOr, |); + FOLD_OP(BinaryExpr::Lt, <); + FOLD_OP(BinaryExpr::Gt, >); + FOLD_OP(BinaryExpr::Le, <=); + FOLD_OP(BinaryExpr::Ge, >=); + FOLD_OP(BinaryExpr::Equal, ==); + FOLD_OP(BinaryExpr::NotEqual, !=); + FOLD_OP(BinaryExpr::LogicalAnd, &&); + FOLD_OP(BinaryExpr::LogicalOr, ||); + default: + return NULL; + } + + return new ConstExpr(carg0->GetType(), result, carg0->pos); +} + + +Expr * +BinaryExpr::Optimize() { + if (arg0 != NULL) + arg0 = arg0->Optimize(); + if (arg1 != NULL) + arg1 = arg1->Optimize(); + + if (!arg0 || !arg1) + return NULL; + + ConstExpr *constArg0 = dynamic_cast(arg0); + ConstExpr *constArg1 = dynamic_cast(arg1); + + if (g->opt.fastMath) { + // optimizations related to division by floats.. 
+ + // transform x / const -> x * (1/const) + if (op == Div && constArg1 != NULL) { + const Type *type1 = constArg1->GetType(); + if (Type::Equal(type1, AtomicType::UniformFloat) || + Type::Equal(type1, AtomicType::VaryingFloat) || + Type::Equal(type1, AtomicType::UniformConstFloat) || + Type::Equal(type1, AtomicType::VaryingConstFloat)) { + float inv[ISPC_MAX_NVEC]; + int count = constArg1->AsFloat(inv); + for (int i = 0; i < count; ++i) + inv[i] = 1.f / inv[i]; + Expr *einv = new ConstExpr(type1, inv, constArg1->pos); + Expr *e = new BinaryExpr(Mul, arg0, einv, pos); + e = e->TypeCheck(); + if (e == NULL) + return NULL; + return e->Optimize(); + } + } + + // transform x / y -> x * rcp(y) + if (op == Div) { + const Type *type1 = arg1->GetType(); + if (Type::Equal(type1, AtomicType::UniformFloat) || + Type::Equal(type1, AtomicType::VaryingFloat) || + Type::Equal(type1, AtomicType::UniformConstFloat) || + Type::Equal(type1, AtomicType::VaryingConstFloat)) { + // Get the symbol for the appropriate builtin + std::vector *rcpFuns = + m->symbolTable->LookupFunction("rcp"); + if (rcpFuns != NULL) { + assert(rcpFuns->size() == 2); + Expr *rcpSymExpr = new FunctionSymbolExpr(rcpFuns, pos); + ExprList *args = new ExprList(arg1, arg1->pos); + Expr *rcpCall = new FunctionCallExpr(rcpSymExpr, args, + arg1->pos, false); + rcpCall = rcpCall->TypeCheck(); + if (rcpCall == NULL) + return NULL; + rcpCall = rcpCall->Optimize(); + if (rcpCall == NULL) + return NULL; + + Expr *ret = new BinaryExpr(Mul, arg0, rcpCall, pos); + ret = ret->TypeCheck(); + if (ret == NULL) + return NULL; + return ret->Optimize(); + } + else + Warning(pos, "rcp() not found from stdlib. Can't apply " + "fast-math rcp optimization."); + } + } + } + + // From here on out, we're just doing constant folding, so if both args + // aren't constants then we're done... 
+ if (constArg0 == NULL || constArg1 == NULL) + return this; + + assert(Type::Equal(arg0->GetType()->GetAsNonConstType(), + arg1->GetType()->GetAsNonConstType())); + const Type *type = arg0->GetType()->GetAsNonConstType(); + if (type == AtomicType::UniformFloat || type == AtomicType::VaryingFloat) { + float v0[ISPC_MAX_NVEC], v1[ISPC_MAX_NVEC]; + constArg0->AsFloat(v0); + constArg1->AsFloat(v1); + ConstExpr *ret; + if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0)) != NULL) + return ret; + else if ((ret = lConstFoldBinLogicalOp(op, v0, v1, constArg0)) != NULL) + return ret; + else + return this; + } + if (type == AtomicType::UniformDouble || type == AtomicType::VaryingDouble) { + double v0[ISPC_MAX_NVEC], v1[ISPC_MAX_NVEC]; + constArg0->AsDouble(v0); + constArg1->AsDouble(v1); + ConstExpr *ret; + if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0)) != NULL) + return ret; + else if ((ret = lConstFoldBinLogicalOp(op, v0, v1, constArg0)) != NULL) + return ret; + else + return this; + } + if (type == AtomicType::UniformInt32 || type == AtomicType::VaryingInt32) { + int32_t v0[ISPC_MAX_NVEC], v1[ISPC_MAX_NVEC]; + constArg0->AsInt32(v0); + constArg1->AsInt32(v1); + ConstExpr *ret; + if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0)) != NULL) + return ret; + else if ((ret = lConstFoldBinIntOp(op, v0, v1, constArg0)) != NULL) + return ret; + else if ((ret = lConstFoldBinLogicalOp(op, v0, v1, constArg0)) != NULL) + return ret; + else + return this; + } + else if (type == AtomicType::UniformUInt32 || type == AtomicType::VaryingUInt32) { + uint32_t v0[ISPC_MAX_NVEC], v1[ISPC_MAX_NVEC]; + constArg0->AsUInt32(v0); + constArg1->AsUInt32(v1); + ConstExpr *ret; + if ((ret = lConstFoldBinArithOp(op, v0, v1, constArg0)) != NULL) + return ret; + else if ((ret = lConstFoldBinIntOp(op, v0, v1, constArg0)) != NULL) + return ret; + else if ((ret = lConstFoldBinLogicalOp(op, v0, v1, constArg0)) != NULL) + return ret; + else + return this; + } + else if (type == 
AtomicType::UniformBool || type == AtomicType::VaryingBool) { + bool v0[ISPC_MAX_NVEC], v1[ISPC_MAX_NVEC]; + constArg0->AsBool(v0); + constArg1->AsBool(v1); + ConstExpr *ret; + if ((ret = lConstFoldBoolBinOp(op, v0, v1, constArg0)) != NULL) + return ret; + else if ((ret = lConstFoldBinLogicalOp(op, v0, v1, constArg0)) != NULL) + return ret; + else + return this; + } + else + return this; +} + + +Expr * +BinaryExpr::TypeCheck() { + if (arg0 != NULL) + arg0 = arg0->TypeCheck(); + if (arg1 != NULL) + arg1 = arg1->TypeCheck(); + + if (arg0 == NULL || arg1 == NULL) + return NULL; + + const Type *type0 = arg0->GetType(), *type1 = arg1->GetType(); + if (type0 == NULL || type1 == NULL) + return NULL; + + switch (op) { + case Shl: + case Shr: + case BitAnd: + case BitXor: + case BitOr: { + // Must have integer or bool-typed operands for these bit-related + // ops; don't do any implicit conversions from floats here... + if (!type0->IsIntType() && !type0->IsBoolType()) { + Error(arg0->pos, "First operand to binary operator \"%s\" must be " + "an integer or bool.", lOpString(op)); + return NULL; + } + if (!type1->IsIntType() && !type1->IsBoolType()) { + Error(arg1->pos, "Second operand to binary operator \"%s\" must be " + "an integer or bool.", lOpString(op)); + return NULL; + } + + const Type *promotedType = Type::MoreGeneralType(type0, type1, arg0->pos, + "binary bit op"); + if (promotedType == NULL) + return NULL; + + arg0 = arg0->TypeConv(promotedType, "binary bit op"); + arg1 = arg1->TypeConv(promotedType, "binary bit op"); + if (arg0 == NULL || arg1 == NULL) + return NULL; + return this; + } + case Add: + case Sub: + case Mul: + case Div: + case Mod: + case Lt: + case Gt: + case Le: + case Ge: { + // Must be numeric type for these. 
(And mod is special--can't be float) + if (!type0->IsNumericType() || (op == Mod && type0->IsFloatType())) { + Error(arg0->pos, "First operand to binary operator \"%s\" is of " + "invalid type \"%s\".", lOpString(op), + type0->GetString().c_str()); + return NULL; + } + if (!type1->IsNumericType() || (op == Mod && type1->IsFloatType())) { + Error(arg1->pos, "First operand to binary operator \"%s\" is of " + "invalid type \"%s\".", lOpString(op), + type1->GetString().c_str()); + return NULL; + } + + const Type *promotedType = Type::MoreGeneralType(type0, type1, arg0->pos, + lOpString(op)); + if (promotedType == NULL) + return NULL; + + arg0 = arg0->TypeConv(promotedType, lOpString(op)); + arg1 = arg1->TypeConv(promotedType, lOpString(op)); + if (!arg0 || !arg1) + return NULL; + return this; + } + case Equal: + case NotEqual: { + if (!type0->IsBoolType() && !type0->IsNumericType()) { + Error(arg0->pos, + "First operand to equality operator \"%s\" is of " + "non-comparable type \"%s\".", lOpString(op), + type0->GetString().c_str()); + return NULL; + } + if (!type1->IsBoolType() && !type1->IsNumericType()) { + Error(arg1->pos, + "Second operand to equality operator \"%s\" is of " + "non-comparable type \"%s\".", lOpString(op), + type1->GetString().c_str()); + return NULL; + } + + const Type *promotedType = + Type::MoreGeneralType(type0, type1, arg0->pos, lOpString(op)); + if (promotedType == NULL) + return NULL; + + arg0 = arg0->TypeConv(promotedType, lOpString(op)); + arg1 = arg1->TypeConv(promotedType, lOpString(op)); + if (!arg0 || !arg1) + return NULL; + return this; + } + case LogicalAnd: + case LogicalOr: { + // We need to type convert to a boolean type of the more general + // shape of the two types + bool isUniform = (type0->IsUniformType() && type1->IsUniformType()); + const AtomicType *boolType = isUniform ? 
AtomicType::UniformBool : + AtomicType::VaryingBool; + const Type *destType = NULL; + const VectorType *vtype0 = dynamic_cast(type0); + const VectorType *vtype1 = dynamic_cast(type1); + if (vtype0 && vtype1) { + int sz0 = vtype0->GetElementCount(), sz1 = vtype1->GetElementCount(); + if (sz0 != sz1) { + Error(pos, "Can't do logical operation \"%s\" between vector types of " + "different sizes (%d vs. %d).", lOpString(op), sz0, sz1); + return NULL; + } + destType = new VectorType(boolType, sz0); + } + else if (vtype0) + destType = new VectorType(boolType, vtype0->GetElementCount()); + else if (vtype1) + destType = new VectorType(boolType, vtype1->GetElementCount()); + else + destType = boolType; + + arg0 = arg0->TypeConv(destType, lOpString(op)); + arg1 = arg1->TypeConv(destType, lOpString(op)); + if (!arg0 || !arg1) + return NULL; + return this; + } + case Comma: + return this; + default: + FATAL("logic error"); + return NULL; + } +} + + +void +BinaryExpr::Print() const { + if (!arg0 || !arg1 || !GetType()) + return; + + printf("[ %s ] (", GetType()->GetString().c_str()); + arg0->Print(); + printf(" %s ", lOpString(op)); + arg1->Print(); + printf(")"); + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// AssignExpr + + +/** Store the result of an assignment to the given location. + */ +static void +lStoreAssignResult(llvm::Value *rv, llvm::Value *lv, const Type *type, + FunctionEmitContext *ctx, Symbol *baseSym) { + assert(baseSym->varyingCFDepth <= ctx->VaryingCFDepth()); + if (!g->opt.disableMaskedStoreToStore && + baseSym->varyingCFDepth == ctx->VaryingCFDepth() && + baseSym->isStatic == false && + dynamic_cast(baseSym->type) == NULL) { + // If the variable is declared at the same varying control flow + // depth as where it's being assigned, then we don't need to do any + // masking but can just do the assignment as if all the lanes were + // known to be on. 
While this may lead to random/garbage values + // written into the lanes that are off, by definition they will + // never be accessed, since those lanes aren't executing, and won't + // be executing at this scope or any other one before the variable + // goes out of scope. + ctx->StoreInst(rv, lv, LLVMMaskAllOn, type); + } + else + ctx->StoreInst(rv, lv, ctx->GetMask(), type); +} + + +/** Emit code to do an "assignment + operation" operator, e.g. "+=". + */ +static llvm::Value * +lEmitOpAssign(AssignExpr::Op op, Expr *arg0, Expr *arg1, const Type *type, + Symbol *baseSym, SourcePos pos, FunctionEmitContext *ctx) { + llvm::Value *lv = arg0->GetLValue(ctx); + if (!lv) { + // FIXME: I think this test is unnecessary and that this case + // should be caught during typechecking + Error(pos, "Can't assign to left-hand side of expression."); + return NULL; + } + + // Get the value on the right-hand side of the assignment+operation + // operator and load the current value on the left-hand side. + llvm::Value *rvalue = arg1->GetValue(ctx); + ctx->SetDebugPos(pos); + llvm::Value *oldLHS = ctx->LoadInst(lv, type, "opassign_load"); + + // Map the operator to the corresponding BinaryExpr::Op operator + BinaryExpr::Op basicop; + switch (op) { + case AssignExpr::MulAssign: basicop = BinaryExpr::Mul; break; + case AssignExpr::DivAssign: basicop = BinaryExpr::Div; break; + case AssignExpr::ModAssign: basicop = BinaryExpr::Mod; break; + case AssignExpr::AddAssign: basicop = BinaryExpr::Add; break; + case AssignExpr::SubAssign: basicop = BinaryExpr::Sub; break; + case AssignExpr::ShlAssign: basicop = BinaryExpr::Shl; break; + case AssignExpr::ShrAssign: basicop = BinaryExpr::Shr; break; + case AssignExpr::AndAssign: basicop = BinaryExpr::BitAnd; break; + case AssignExpr::XorAssign: basicop = BinaryExpr::BitXor; break; + case AssignExpr::OrAssign: basicop = BinaryExpr::BitOr; break; + default: + FATAL("logic error in lEmitOpAssign()"); + return NULL; + } + + // Emit the code to compute 
the new value + llvm::Value *newValue = NULL; + switch (op) { + case AssignExpr::MulAssign: + case AssignExpr::DivAssign: + case AssignExpr::ModAssign: + case AssignExpr::AddAssign: + case AssignExpr::SubAssign: + newValue = lEmitBinaryArith(basicop, oldLHS, rvalue, type, ctx, pos); + break; + case AssignExpr::ShlAssign: + case AssignExpr::ShrAssign: + case AssignExpr::AndAssign: + case AssignExpr::XorAssign: + case AssignExpr::OrAssign: + newValue = lEmitBinaryBitOp(basicop, oldLHS, rvalue, ctx); + break; + default: + FATAL("logic error in lEmitOpAssign"); + return NULL; + } + + // And store the result back to the lvalue. + lStoreAssignResult(newValue, lv, type, ctx, baseSym); + + return newValue; +} + + +AssignExpr::AssignExpr(AssignExpr::Op o, Expr *a, Expr *b, SourcePos p) + : Expr(p), op(o) { + lvalue = a; + rvalue = b; +} + + +llvm::Value * +AssignExpr::GetValue(FunctionEmitContext *ctx) const { + const Type *type = NULL; + if (lvalue == NULL || rvalue == NULL || (type = GetType()) == NULL) + return NULL; + + ctx->SetDebugPos(pos); + +#if 0 + if (ctx->VaryingCFDepth() > 0 && type->IsUniformType()) + Warning(pos, "Modifying \"uniform\" value under \"varying\" control flow. Beware."); +#endif + + Symbol *baseSym = lvalue->GetBaseSymbol(); + if (!baseSym) { + // FIXME: I think that this check also is unnecessary and that this + // case should be covered during type checking. + Error(pos, "Left hand side of assignment statement can't be assigned to."); + return NULL; + } + + switch (op) { + case Assign: { + llvm::Value *lv = lvalue->GetLValue(ctx); + if (!lv) { + // FIXME: another, I believe, now unnecessary test? 
+ Error(lvalue->pos, "Can't assign to left-hand side of expression."); + return NULL; + } + + llvm::Value *rv = rvalue->GetValue(ctx); + if (rv == NULL) + return NULL; + + ctx->SetDebugPos(pos); + + // Warn if we're assigning a large array + const ArrayType *at = dynamic_cast(type); + if (at && at->TotalElementCount() > 4) + PerformanceWarning(pos, "Copying %d element array in assignment expression.", + at->TotalElementCount()); + +#if 0 + const StructType *st = dynamic_cast(type); + if (st != NULL) { + bool anyUniform = false; + for (int i = 0; i < st->NumElements(); ++i) { + if (st->GetMemberType(i)->IsUniformType()) + anyUniform = true; + } + + if (anyUniform && ctx->VaryingCFDepth() > 0) + Warning(pos, "Modifying \"uniform\" value under \"varying\" " + "control flow. Beware."); + } +#endif + + lStoreAssignResult(rv, lv, type, ctx, baseSym); + + return rv; + } + case MulAssign: + case DivAssign: + case ModAssign: + case AddAssign: + case SubAssign: + case ShlAssign: + case ShrAssign: + case AndAssign: + case XorAssign: + case OrAssign: { + // This should be caught during type checking + assert(!dynamic_cast(type) && + !dynamic_cast(type)); + return lEmitOpAssign(op, lvalue, rvalue, type, baseSym, pos, ctx); + } + default: + FATAL("logic error in AssignExpr::GetValue()"); + return NULL; + } +} + + +Expr * +AssignExpr::Optimize() { + if (lvalue) + lvalue = lvalue->Optimize(); + if (rvalue) + rvalue = rvalue->Optimize(); + if (lvalue == NULL || rvalue == NULL) + return NULL; + + return this; +} + + +const Type * +AssignExpr::GetType() const { + return lvalue ? lvalue->GetType() : NULL; +} + + +Expr * +AssignExpr::TypeCheck() { + bool lvalueIsReference = lvalue && + dynamic_cast(lvalue->GetType()) != NULL; + bool rvalueIsReference = rvalue && + dynamic_cast(rvalue->GetType()) != NULL; + + // hack to allow asigning array references e.g. in a struct... 
+ if (lvalueIsReference && + !(rvalueIsReference && + dynamic_cast(rvalue->GetType()->GetReferenceTarget()))) + lvalue = new DereferenceExpr(lvalue, lvalue->pos); + + if (lvalue != NULL) + lvalue = lvalue->TypeCheck(); + if (rvalue != NULL) + rvalue = rvalue->TypeCheck(); + if (rvalue != NULL && lvalue != NULL) + rvalue = rvalue->TypeConv(lvalue->GetType(), "operator ="); + if (rvalue == NULL || lvalue == NULL) + return NULL; + + if (lvalue->GetType()->IsConstType()) { + Error(pos, "Can't assign to type \"%s\" on left-hand size of " + "expression.", lvalue->GetType()->GetString().c_str()); + return NULL; + } + + return this; +} + + +void +AssignExpr::Print() const { + if (!lvalue || !rvalue || !GetType()) + return; + + printf("[%s] assign (", GetType()->GetString().c_str()); + lvalue->Print(); + printf(" "); + if (op == Assign) printf("="); + if (op == MulAssign) printf("*="); + if (op == DivAssign) printf("/="); + if (op == ModAssign) printf("%%="); + if (op == AddAssign) printf("+="); + if (op == SubAssign) printf("-="); + if (op == ShlAssign) printf("<<="); + if (op == ShrAssign) printf(">>="); + if (op == AndAssign) printf("&="); + if (op == XorAssign) printf("^="); + if (op == OrAssign) printf("|="); + printf(" "); + rvalue->Print(); + printf(")"); + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// SelectExpr + +SelectExpr::SelectExpr(Expr *t, Expr *e1, Expr *e2, SourcePos p) + : Expr(p) { + test = t; + expr1 = e1; + expr2 = e2; +} + + +/** Emit code to select between two varying values based on a varying test + value. 
+ */ +static llvm::Value * +lEmitVaryingSelect(FunctionEmitContext *ctx, llvm::Value *test, + llvm::Value *expr1, llvm::Value *expr2, + const Type *type) { + llvm::Value *resultPtr = ctx->AllocaInst(expr1->getType(), "selectexpr_tmp"); + // Don't need to worry about masking here + ctx->StoreInst(expr2, resultPtr); + // Use masking to conditionally store the expr1 values + ctx->StoreInst(expr1, resultPtr, test, type); + return ctx->LoadInst(resultPtr, type, "selectexpr_final"); +} + + +llvm::Value * +SelectExpr::GetValue(FunctionEmitContext *ctx) const { + if (!expr1 || !expr2 || !test) + return NULL; + + ctx->SetDebugPos(pos); + + const Type *testType = test->GetType()->GetAsNonConstType(); + // This should be taken care of during typechecking + assert(testType->GetBaseType() == AtomicType::UniformBool || + testType->GetBaseType() == AtomicType::VaryingBool); + + const Type *type = expr1->GetType(); + // Type checking should also make sure this is the case + assert(Type::Equal(type->GetAsNonConstType(), + expr2->GetType()->GetAsNonConstType())); + + if (testType == AtomicType::UniformBool) { + // Simple case of a single uniform bool test expression; we just + // want one of the two expressions. In this case, we can be + // careful to evaluate just the one of the expressions that we need + // the value of so that if the other one has side-effects or + // accesses invalid memory, it doesn't execute. + llvm::Value *testVal = test->GetValue(ctx); + llvm::BasicBlock *testTrue = ctx->CreateBasicBlock("select_true"); + llvm::BasicBlock *testFalse = ctx->CreateBasicBlock("select_false"); + llvm::BasicBlock *testDone = ctx->CreateBasicBlock("select_done"); + ctx->BranchInst(testTrue, testFalse, testVal); + + ctx->SetCurrentBasicBlock(testTrue); + llvm::Value *expr1Val = expr1->GetValue(ctx); + // Note that truePred won't be necessarily equal to testTrue, in + // case the expr1->GetValue() call changes the current basic block. 
+ llvm::BasicBlock *truePred = ctx->GetCurrentBasicBlock(); + ctx->BranchInst(testDone); + + ctx->SetCurrentBasicBlock(testFalse); + llvm::Value *expr2Val = expr2->GetValue(ctx); + // See comment above truePred for why we can't just assume we're in + // the testFalse basic block here. + llvm::BasicBlock *falsePred = ctx->GetCurrentBasicBlock(); + ctx->BranchInst(testDone); + + ctx->SetCurrentBasicBlock(testDone); + llvm::PHINode *ret = ctx->PhiNode(expr1Val->getType(), 2, "select"); + ret->addIncoming(expr1Val, truePred); + ret->addIncoming(expr2Val, falsePred); + return ret; + } + else if (dynamic_cast(testType) == NULL) { + // if the test is a varying bool type, then evaluate both of the + // value expressions with the mask set appropriately and then do an + // element-wise select to get the result + llvm::Value *testVal = test->GetValue(ctx); + assert(testVal->getType() == LLVMTypes::MaskType); + llvm::Value *oldMask = ctx->GetMask(); + ctx->MaskAnd(oldMask, testVal); + llvm::Value *expr1Val = expr1->GetValue(ctx); + ctx->MaskAndNot(oldMask, testVal); + llvm::Value *expr2Val = expr2->GetValue(ctx); + ctx->SetMask(oldMask); + + return lEmitVaryingSelect(ctx, testVal, expr1Val, expr2Val, type); + } + else { + // FIXME? Short-circuiting doesn't work in the case of + // vector-valued test expressions. (We could also just prohibit + // these and place the issue in the user's hands...) 
+ llvm::Value *testVal = test->GetValue(ctx); + llvm::Value *expr1Val = expr1->GetValue(ctx); + llvm::Value *expr2Val = expr2->GetValue(ctx); + + ctx->SetDebugPos(pos); + const VectorType *vt = dynamic_cast(type); + // Things that typechecking should have caught + assert(vt != NULL); + assert(dynamic_cast(testType) != NULL && + (dynamic_cast(testType)->GetElementCount() == + vt->GetElementCount())); + + // Do an element-wise select + llvm::Value *result = llvm::UndefValue::get(type->LLVMType(g->ctx)); + for (int i = 0; i < vt->GetElementCount(); ++i) { + llvm::Value *ti = ctx->ExtractInst(testVal, i, ""); + llvm::Value *e1i = ctx->ExtractInst(expr1Val, i, ""); + llvm::Value *e2i = ctx->ExtractInst(expr2Val, i, ""); + llvm::Value *sel = NULL; + if (testType->IsUniformType()) + sel = ctx->SelectInst(ti, e1i, e2i); + else + sel = lEmitVaryingSelect(ctx, ti, e1i, e2i, vt->GetElementType()); + result = ctx->InsertInst(result, sel, i, ""); + } + return result; + } +} + + +const Type * +SelectExpr::GetType() const { + if (!test || !expr1 || !expr2) + return NULL; + + const Type *testType = test->GetType(); + const Type *expr1Type = expr1->GetType(); + const Type *expr2Type = expr2->GetType(); + + if (!testType || !expr1Type || !expr2Type) + return NULL; + + bool becomesVarying = (testType->IsVaryingType() || expr1Type->IsVaryingType() || + expr2Type->IsVaryingType()); + // if expr1 and expr2 have different vector sizes, typechecking should fail... + int testVecSize = dynamic_cast(testType) != NULL ? + dynamic_cast(testType)->GetElementCount() : 0; + int expr1VecSize = dynamic_cast(expr1Type) != NULL ? + dynamic_cast(expr1Type)->GetElementCount() : 0; +//CO int expr2VecSize = dynamic_cast(expr2Type) != NULL ? +//CO dynamic_cast(expr2Type)->GetElementCount() : 0; +//CO assert(testVecSize == expr1VecSize && expr1VecSize == expr2VecSize); + // REMOVE? 
old test + assert(!(testVecSize != 0 && expr1VecSize != 0 && testVecSize != expr1VecSize)); + + int vectorSize = std::max(testVecSize, expr1VecSize); + return Type::MoreGeneralType(expr1Type, expr2Type, pos, "select expression", + becomesVarying, vectorSize); +} + + +Expr * +SelectExpr::Optimize() { + if (test) + test = test->Optimize(); + if (expr1) + expr1 = expr1->Optimize(); + if (expr2) + expr2 = expr2->Optimize(); + if (test == NULL || expr1 == NULL || expr2 == NULL) + return NULL; + + return this; +} + + +Expr * +SelectExpr::TypeCheck() { + if (test) + test = test->TypeCheck(); + if (expr1) + expr1 = expr1->TypeCheck(); + if (expr2) + expr2 = expr2->TypeCheck(); + + if (test == NULL || expr1 == NULL || expr2 == NULL) + return NULL; + + const Type *type1 = expr1->GetType(), *type2 = expr2->GetType(); + if (!type1 || !type2) + return NULL; + + if (dynamic_cast(type1)) { + Error(pos, "Array type \"%s\" can't be used in select expression", + type1->GetString().c_str()); + return NULL; + } + if (dynamic_cast(type2)) { + Error(pos, "Array type \"%s\" can't be used in select expression", + type2->GetString().c_str()); + return NULL; + } + + const Type *testType = test->GetType(); + if (testType == NULL) + return NULL; + test = test->TypeConv(lMatchingBoolType(testType), "select"); + if (testType == NULL) + return NULL; + testType = test->GetType(); + + int testVecSize = dynamic_cast(testType) ? + dynamic_cast(testType)->GetElementCount() : 0; + const Type *promotedType = Type::MoreGeneralType(type1, type2, pos, "select expression", + testType->IsVaryingType(), testVecSize); + if (promotedType == NULL) + return NULL; + + expr1 = expr1->TypeConv(promotedType, "select"); + expr2 = expr2->TypeConv(promotedType, "select"); + if (!expr1 || !expr2) + return NULL; + + return this; +} + + +void +SelectExpr::Print() const { + if (!test || !expr1 || !expr2 || !GetType()) + return; + + printf("[%s] (", GetType()->GetString().c_str()); + test->Print(); + printf(" ? 
"); + expr1->Print(); + printf(" : "); + expr2->Print(); + printf(")"); + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// FunctionCallExpr + +static void +lPrintFunctionOverloads(const std::vector &matches) { + for (unsigned int i = 0; i < matches.size(); ++i) { + const FunctionType *t = dynamic_cast(matches[i]->type); + assert(t != NULL); + fprintf(stderr, "\t%s\n", t->GetString().c_str()); + } +} + + +/** Helper function used for function overload resolution: returns true if + the call argument's type exactly matches the function argument type + (modulo a conversion to a const type if needed). + */ +static bool +lExactMatch(Expr *callArg, const Type *funcArgType) { + const Type *callType = callArg->GetType(); + if (dynamic_cast(callType) == NULL) + callType = callType->GetAsNonConstType(); + + return Type::Equal(callType, funcArgType); +} + +/** Helper function used for function overload resolution: returns true if + the call argument type and the function argument type match, modulo + conversion to a reference type if needed. + */ +static bool +lMatchIgnoringReferences(Expr *callArg, const Type *funcArgType) { + const Type *callType = callArg->GetType()->GetReferenceTarget(); + if (funcArgType->IsConstType()) + callType = callType->GetAsConstType(); + + return Type::Equal(callType, + funcArgType->GetReferenceTarget()); +} + + +/** Helper function used for function overload resolution: returns true if + the call argument type and the function argument type match if we only + do a uniform -> varying type conversion but otherwise have exactly the + same type. 
+ */ +static bool +lMatchIgnoringUniform(Expr *callArg, const Type *funcArgType) { + const Type *callType = callArg->GetType(); + if (dynamic_cast(callType) == NULL) + callType = callType->GetAsNonConstType(); + + if (Type::Equal(callType, funcArgType)) + return true; + + return (callType->IsUniformType() && + funcArgType->IsVaryingType() && + Type::Equal(callType->GetAsVaryingType(), funcArgType)); +} + + +/** Helper function used for function overload resolution: returns true if + we can type convert from the call argument type to the function + argument type, but without doing a uniform -> varying conversion. + */ +static bool +lMatchWithTypeConvSameVariability(Expr *callArg, const Type *funcArgType) { + Expr *te = callArg->TypeConv(funcArgType, + "function call argument", true); + return (te != NULL && + te->GetType()->IsUniformType() == callArg->GetType()->IsUniformType()); +} + + +/** Helper function used for function overload resolution: returns true if + there is any type conversino that gets us from the caller argument type + to the function argument type. + */ +static bool +lMatchWithTypeConv(Expr *callArg, const Type *funcArgType) { + Expr *te = callArg->TypeConv(funcArgType, + "function call argument", true); + return (te != NULL); +} + + +/** See if we can find a single function from the set of overload options + based on the predicate function passed in. Returns true if no more + tries should be made to find a match, either due to success from + finding a single overloaded function that matches or failure due to + finding multiple ambiguous matches. 
+ */ +bool +FunctionCallExpr::tryResolve(bool (*matchFunc)(Expr *, const Type *)) { + FunctionSymbolExpr *fse = dynamic_cast(func); + if (!fse) + // error will be issued later if not calling an actual function + return false; + + const char *funName = fse->candidateFunctions->front()->name.c_str(); + std::vector &callArgs = args->exprs; + + std::vector matches; + std::vector::iterator iter; + for (iter = fse->candidateFunctions->begin(); + iter != fse->candidateFunctions->end(); ++iter) { + // Loop over the set of candidate functions and try each one + Symbol *candidateFunction = *iter; + const FunctionType *ft = + dynamic_cast(candidateFunction->type); + assert(ft != NULL); + const std::vector &candArgTypes = ft->GetArgumentTypes(); + const std::vector &argumentDefaults = ft->GetArgumentDefaults(); + + // There's no way to match if the caller is passing more arguments + // than this function instance takes. + if (callArgs.size() > candArgTypes.size()) + continue; + + unsigned int i; + // Note that we're looping over the caller arguments, not the + // function arguments; it may be ok to have more arguments to the + // function than are passed, if the function has default argument + // values. This case is handled below. + for (i = 0; i < callArgs.size(); ++i) { + // This may happen if there's an error earlier in compilation. + // It's kind of a silly to redundantly discover this for each + // potential match versus detecting this earlier in the + // matching process and just giving up. + if (!callArgs[i] || !callArgs[i]->GetType() || !candArgTypes[i]) + return false; + + // See if this caller argument matches the type of the + // corresponding function argument according to the given + // predicate function. If not, break out and stop trying. + if (!matchFunc(callArgs[i], candArgTypes[i])) + break; + } + if (i == callArgs.size()) { + // All of the arguments matched! 
+ if (i == candArgTypes.size()) + // And we have exactly as many arguments as the function + // wants, so we're done. + matches.push_back(candidateFunction); + else if (i < candArgTypes.size() && argumentDefaults[i] != NULL) + // Otherwise we can still make it if there are default + // arguments for the rest of the arguments! Because in + // Module::AddFunction() we have verified that once the + // default arguments start, then all of the following ones + // have them as well. Therefore, we just need to check if + // the arg we stopped at has a default value and we're + // done. + matches.push_back(candidateFunction); + // otherwise, we don't have a match + } + } + + if (matches.size() == 0) + return false; + else if (matches.size() == 1) { + fse->matchingFunc = matches[0]; + + // fill in any function defaults required + const FunctionType *ft = + dynamic_cast(fse->matchingFunc->type); + assert(ft != NULL); + const std::vector &argumentDefaults = ft->GetArgumentDefaults(); + const std::vector &argTypes = ft->GetArgumentTypes(); + for (unsigned int i = callArgs.size(); i < argTypes.size(); ++i) { + assert(argumentDefaults[i] != NULL); + args->exprs.push_back(argumentDefaults[i]); + } + return true; + } + else { + Error(fse->pos, "Multiple overloaded instances of function \"%s\" matched.", + funName); + lPrintFunctionOverloads(matches); + // Stop trying to find more matches after failure + return true; + } +} + + +void +FunctionCallExpr::resolveFunctionOverloads() { + FunctionSymbolExpr *fse = dynamic_cast(func); + if (!fse) + // error will be issued later if not calling an actual function + return; + assert(args); + + // Try to find the best overload for the function... + + // Is there an exact match that doesn't require any argument type + // conversion at all? 
+ if (tryResolve(lExactMatch)) + return; + + // Try to find a single match ignoring references + if (tryResolve(lMatchIgnoringReferences)) + return; + + // TODO: next, try to find an exact match via type promotion--i.e. char + // -> int, etc--things that don't lose data + + // Next try to see if there's a match via just uniform -> varying + // promotions. TODO: look for one with a minimal number of them? + if (tryResolve(lMatchIgnoringUniform)) + return; + + // Try to find a match via type conversion, but don't change + // unif->varying + if (tryResolve(lMatchWithTypeConvSameVariability)) + return; + + // Last chance: try to find a match via arbitrary type conversion. + if (tryResolve(lMatchWithTypeConv)) + return; + + // failure :-( + const char *funName = fse->candidateFunctions->front()->name.c_str(); + Error(pos, "Unable to find matching overload for call to function \"%s\". " + "Candidates are:", funName); + lPrintFunctionOverloads(*fse->candidateFunctions); + fprintf(stderr, "Passed types: %s(", funName); + for (unsigned int i = 0; i < args->exprs.size(); ++i) { + const Type *t = args->exprs[i]->GetType(); + if (t) + fprintf(stderr, "%s%s", t->GetString().c_str(), + (i < args->exprs.size()-1) ? ", " : ")\n"); + else + fprintf(stderr, "(unknown type)%s", + (i < args->exprs.size()-1) ? ", " : ")\n"); + } +} + + +FunctionCallExpr::FunctionCallExpr(Expr *f, ExprList *a, SourcePos p, bool il) + : Expr(p) { + func = f; + args = a; + isLaunch = il; + + resolveFunctionOverloads(); +} + + +/** Starting from the function initialFunction, we're calling into + calledFunc. The question is: is this a recursive call back to + initialFunc? If it definitely is or if it may be, then return true. + Return false if it definitely is not. 
+ */ +static bool +lMayBeRecursiveCall(llvm::Function *calledFunc, + llvm::Function *initialFunc, + std::set &seenFuncs) { + // Easy case: intrinsics aren't going to call functions themselves + if (calledFunc->isIntrinsic()) + return false; + + std::string name = calledFunc->getName(); + if (name.size() > 2 && name[0] == '_' && name[1] == '_') + // builtin stdlib function; none of these are recursive... + return false; + + if (calledFunc->isDeclaration()) + // There's visibility into what the called function does without a + // definition, so we have to be conservative + return true; + + if (calledFunc == initialFunc) + // hello recursive call + return true; + + // Otherwise iterate over all of the instructions in the function. If + // any of them is a function call then check recursively.. + llvm::inst_iterator iter; + for (iter = llvm::inst_begin(calledFunc); + iter != llvm::inst_end(calledFunc); ++iter) { + llvm::Instruction *inst = &*iter; + llvm::CallInst *ci = llvm::dyn_cast(inst); + if (ci != NULL) { + llvm::Function *nextCalledFunc = ci->getCalledFunction(); + // Don't repeatedly test functions we've seen before + if (seenFuncs.find(nextCalledFunc) == seenFuncs.end()) { + seenFuncs.insert(nextCalledFunc); + if (lMayBeRecursiveCall(nextCalledFunc, initialFunc, + seenFuncs)) + return true; + } + } + } + return false; +} + + +llvm::Value * +FunctionCallExpr::GetValue(FunctionEmitContext *ctx) const { + if (!func || !args) + return NULL; + + ctx->SetDebugPos(pos); + + FunctionSymbolExpr *fse = dynamic_cast(func); + if (!fse) { + Error(pos, "Invalid function name for function call."); + return NULL; + } + + if (!fse->matchingFunc) + // no overload match was found, get out of here.. 
+ return NULL; + + Symbol *funSym = fse->matchingFunc; + llvm::Function *callee = funSym->function; + if (!callee) { + Error(pos, "Symbol \"%s\" is not a function.", funSym->name.c_str()); + return NULL; + } + + const FunctionType *ft = dynamic_cast(funSym->type); + assert(ft != NULL); + bool isVoidFunc = (ft->GetReturnType() == AtomicType::Void); + + // Automatically convert function call args to references if needed. + // FIXME: this should move to the TypeCheck() method... (but the + // GetLValue call below needs a FunctionEmitContext, which is + // problematic...) + std::vector callargs = args->exprs; + const std::vector &argTypes = ft->GetArgumentTypes(); + bool err = false; + for (unsigned int i = 0; i < callargs.size(); ++i) { + Expr *argExpr = callargs[i]; + if (!argExpr) + continue; + + // All arrays should already have been converted to reference types + assert(dynamic_cast(argTypes[i]) == NULL); + + if (dynamic_cast(argTypes[i])) { + if (!dynamic_cast(argExpr->GetType())) { + // The function wants a reference type but the argument + // being passed isn't already a reference. + if (argExpr->GetLValue(ctx) == NULL) { + // If it doesn't have an lvalue, then we can't make it + // a reference, so issue an error. + // FIXME: for const reference parameters, we could + // store the expr's value to alloca'ed memory and then + // pass a reference to that... + Error(pos, "Can't pass non-lvalue as \"reference\" parameter \"%s\" " + "to function \"%s\".", ft->GetArgumentName(i).c_str(), + funSym->name.c_str()); + err = true; + } + else + argExpr = new ReferenceExpr(argExpr, argExpr->pos); + } + } + + // Do whatever type conversion is needed + argExpr = argExpr->TypeConv(argTypes[i], "function call argument"); + // The function overload resolution code should have ensured that + // we can successfully do any type conversions needed here. 
+ assert(argExpr != NULL); + callargs[i] = argExpr; + } + if (err) + return NULL; + + // Now evaluate the values of all of the parameters being passed. We + // need to evaluate these first here, since their GetValue() calls may + // change the current basic block (e.g. if one of these is itself a + // function call expr...); we need to basic blocks to stay consistent + // below when we emit the code that does the actual funciton call. + std::vector argVals; + std::vector storedArgValPtrs, argValLValues; + for (unsigned int i = 0; i < callargs.size(); ++i) { + Expr *argExpr = callargs[i]; + if (!argExpr) + // give up; we hit an error earlier + return NULL; + + llvm::Value *argValue = argExpr->GetValue(ctx); + if (!argValue) + // something went wrong in evaluating the argument's + // expression, so give up on this + return NULL; + + if (dynamic_cast(argTypes[i]) && + !llvm::isa(argValue->getType())) { + assert(llvm::isa(argValue->getType())); + // if the parameter is a reference and the lvalue needs a + // gather to pull it together, then do the gather here and + // store the result to local memory, so that we can pass the + // single pointer to the local memory that is needed for the + // reference. Below, we'll copy the result back to the varying + // lvalue pointer we have here. 
(== pass by value/result) + const ReferenceType *rt = + dynamic_cast(argExpr->GetType()); + assert(rt != NULL); + const Type *type = rt->GetReferenceTarget(); + + llvm::Value *ptr = ctx->AllocaInst(type->LLVMType(g->ctx), "arg"); + llvm::Value *val = ctx->LoadInst(argValue, type); + ctx->StoreInst(val, ptr); + storedArgValPtrs.push_back(ptr); + argValLValues.push_back(argValue); + argVals.push_back(ptr); + } + else { + argVals.push_back(argValue); + storedArgValPtrs.push_back(NULL); + argValLValues.push_back(NULL); + } + } + + // We sometimes need to check to see if the mask is all off here; + // specifically, if the mask is all off and we call a recursive + // function, then we will probably have an unsesirable infinite loop. + ctx->SetDebugPos(pos); + llvm::BasicBlock *bDoCall = ctx->CreateBasicBlock("funcall_mask_ok"); + llvm::BasicBlock *bSkip = ctx->CreateBasicBlock("funcall_mask_off"); + llvm::BasicBlock *bAfter = ctx->CreateBasicBlock("after_funcall"); + llvm::Function *currentFunc = ctx->GetCurrentBasicBlock()->getParent(); + + // If we need to check the mask (it may be a recursive call, possibly + // transitively), or we're launching a task, which is expensive and + // thus probably always worth checking, then use the mask to choose + // whether to go to the bDoCallBlock or the bSkip block + std::set seenFuncs; + seenFuncs.insert(currentFunc); + if (ft->isTask || lMayBeRecursiveCall(callee, currentFunc, seenFuncs)) { + Debug(pos, "Checking mask before function call \"%s\".", funSym->name.c_str()); + ctx->BranchIfMaskAny(bDoCall, bSkip); + } + else + // If we don't need to check the mask, then always to the call; + // just jump to bDoCall + ctx->BranchInst(bDoCall); + + // And the bSkip block just jumps immediately to bAfter. So why do we + // need it? 
So the phi node below can easily tell what paths are + // going into it + ctx->SetCurrentBasicBlock(bSkip); + ctx->BranchInst(bAfter); + + // Emit the code to do the function call + ctx->SetCurrentBasicBlock(bDoCall); + + llvm::Value *retVal = NULL; + ctx->SetDebugPos(pos); + if (ft->isTask) + ctx->LaunchInst(callee, argVals); + else { + // Most of the time, the mask is passed as the last argument. this + // isn't the case for things like SSE intrinsics and extern "C" + // functions from the application. + assert(callargs.size() + 1 == callee->arg_size() || + callargs.size() == callee->arg_size()); + + if (callargs.size() + 1 == callee->arg_size()) + argVals.push_back(ctx->GetMask()); + + retVal = ctx->CallInst(callee, argVals, isVoidFunc ? "" : "calltmp"); + } + + // For anything we had to do as pass by value/result, copy the + // corresponding reference values back out + for (unsigned int i = 0; i < storedArgValPtrs.size(); ++i) { + llvm::Value *ptr = storedArgValPtrs[i]; + if (ptr != NULL) { + const ReferenceType *rt = + dynamic_cast(callargs[i]->GetType()); + assert(rt != NULL); + llvm::Value *load = ctx->LoadInst(ptr, rt->GetReferenceTarget(), + "load_ref"); + // FIXME: apply the "don't do blending" optimization here if + // appropriate? + ctx->StoreInst(load, argValLValues[i], ctx->GetMask(), + rt->GetReferenceTarget()); + } + } + + // And jump out to the 'after funciton call' basic block + ctx->BranchInst(bAfter); + ctx->SetCurrentBasicBlock(bAfter); + + if (isVoidFunc) + return NULL; + + // The return value for the non-void case is either undefined or the + // function return value, depending on whether we actually ran the code + // path that called the function or not. 
+ const llvm::Type *lrType = ft->GetReturnType()->LLVMType(g->ctx); + llvm::PHINode *ret = ctx->PhiNode(lrType, 2, "fun_ret"); + assert(retVal != NULL); + ret->addIncoming(llvm::UndefValue::get(lrType), bSkip); + ret->addIncoming(retVal, bDoCall); + return ret; +} + + +const Type * +FunctionCallExpr::GetType() const { + FunctionSymbolExpr *fse = dynamic_cast(func); + if (fse && fse->matchingFunc) { + const FunctionType *ft = + dynamic_cast(fse->matchingFunc->type); + assert(ft != NULL); + return ft->GetReturnType(); + } + else + return NULL; +} + + +Expr * +FunctionCallExpr::Optimize() { + if (func) + func = func->Optimize(); + if (args) + args = args->Optimize(); + if (!func || !args) + return NULL; + + return this; +} + + +Expr * +FunctionCallExpr::TypeCheck() { + if (func) { + func = func->TypeCheck(); + if (func != NULL) { + const FunctionType *ft = dynamic_cast(func->GetType()); + if (ft != NULL) { + if (ft->isTask) { + if (!isLaunch) + Error(pos, "\"launch\" expression needed to call function " + "with \"task\" qualifier."); + } + else if (isLaunch) + Error(pos, "\"launch\" expression illegal with non-\"task\"-" + "qualified function."); + } + else + Error(pos, "Valid function name must be used for function call."); + } + } + + if (args) + args = args->TypeCheck(); + + if (!func || !args) + return NULL; + return this; +} + + +void +FunctionCallExpr::Print() const { + if (!func || !args || !GetType()) + return; + + printf("[%s] funcall %s ", GetType()->GetString().c_str(), + isLaunch ? 
"launch" : ""); + func->Print(); + printf(" args ("); + args->Print(); + printf(")"); + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// ExprList + +llvm::Value * +ExprList::GetValue(FunctionEmitContext *ctx) const { + FATAL("ExprList::GetValue() should never be called"); + return NULL; +} + + +const Type * +ExprList::GetType() const { + FATAL("ExprList::GetType() should never be called"); + return NULL; +} + + +ExprList * +ExprList::Optimize() { + for (unsigned int i = 0; i < exprs.size(); ++i) + if (exprs[i]) + exprs[i] = exprs[i]->Optimize(); + return this; +} + + +ExprList * +ExprList::TypeCheck() { + for (unsigned int i = 0; i < exprs.size(); ++i) + if (exprs[i]) + exprs[i] = exprs[i]->TypeCheck(); + return this; +} + + +llvm::Constant * +ExprList::GetConstant(const Type *type) const { + const StructType *structType = dynamic_cast(type); + const SequentialType *sequentialType = + dynamic_cast(type); + + if (structType != NULL) { + // We can potentially return an llvm::ConstantStruct if we have the + // same number of elements in the ExprList as the struct has + // members (and the various elements line up with the shape of the + // corresponding struct elements). 
+ if ((int)exprs.size() != structType->NumElements()) { + Error(pos, "Initializer list for struct \"%s\" must have %d " + "elements (has %d).", structType->GetString().c_str(), + (int)exprs.size(), structType->NumElements()); + return NULL; + } + + std::vector cv; + for (unsigned int i = 0; i < exprs.size(); ++i) { + if (exprs[i] == NULL) + return NULL; + const Type *elementType = structType->GetMemberType(i); + llvm::Constant *c = exprs[i]->GetConstant(elementType); + if (c == NULL) + // If this list element couldn't convert to the right + // constant type for the corresponding struct member, then + // give up + return NULL; + cv.push_back(c); + } + +#if defined(LLVM_2_8) || defined(LLVM_2_9) + return llvm::ConstantStruct::get(*g->ctx, cv, false); +#else + const llvm::StructType *llvmStructType = + llvm::dyn_cast(structType->LLVMType(g->ctx)); + assert(llvmStructType != NULL); + return llvm::ConstantStruct::get(llvmStructType, cv); +#endif + } + else if (sequentialType) { + // Similarly, if we have an array or vector type, we may be able to + // return the corresponding llvm constant value. + if ((int)exprs.size() != sequentialType->GetElementCount()) { + bool isArray = (dynamic_cast(type) != NULL); + Error(pos, "Initializer list for %s \"%s\" must have %d elements (has %d).", + isArray ? "array" : "vector", sequentialType->GetString().c_str(), + (int)exprs.size(), sequentialType->GetElementCount()); + return NULL; + } + + std::vector cv; + for (unsigned int i = 0; i < exprs.size(); ++i) { + if (exprs[i] == NULL) + return NULL; + const Type *elementType = sequentialType->GetElementType(); + llvm::Constant *c = exprs[i]->GetConstant(elementType); + if (c == NULL) + return NULL; + cv.push_back(c); + } + + const llvm::Type *lt = type->LLVMType(g->ctx); + const llvm::ArrayType *lat = llvm::dyn_cast(lt); + // FIXME: should the assert below validly fail for uniform vectors + // now? 
+ assert(lat != NULL); + return llvm::ConstantArray::get(lat, cv); + } + return NULL; +} + + +void +ExprList::Print() const { + printf("expr list ("); + for (unsigned int i = 0; i < exprs.size(); ++i) { + if (exprs[i] != NULL) + exprs[i]->Print(); + printf("%s", (i == exprs.size() - 1) ? ")" : ", "); + } + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// IndexExpr + +IndexExpr::IndexExpr(Expr *a, Expr *i, SourcePos p) + : Expr(p) { + arrayOrVector = a; + index = i; +} + + +// FIXME: This is an ugly hack--if we're indexing into a uniform ispc +// VectorType, then this bitcasts the corresponding llvm::VectorType value +// to be a pointer to the vector's element type, so that a GEP to index +// from the pointer indices elements of the llvm::VectorType and doesn't +// incorrectly try to index into an array of llvm::VectorType instances. + +static llvm::Value * +lCastUniformVectorBasePtr(llvm::Value *ptr, FunctionEmitContext *ctx) { + const llvm::PointerType *baseType = + llvm::dyn_cast(ptr->getType()); + if (!baseType) + return ptr; + + const llvm::VectorType *baseEltVecType = + llvm::dyn_cast(baseType->getElementType()); + if (!baseEltVecType) + return ptr; + + const llvm::Type *vecEltType = baseEltVecType->getElementType(); + int numElts = baseEltVecType->getNumElements(); + const llvm::Type *castType = + llvm::PointerType::get(llvm::ArrayType::get(vecEltType, numElts), 0); + return ctx->BitCastInst(ptr, castType); +} + + +llvm::Value * +IndexExpr::GetValue(FunctionEmitContext *ctx) const { + const Type *arrayOrVectorType; + if (arrayOrVector == NULL || index == NULL || + ((arrayOrVectorType = arrayOrVector->GetType()) == NULL)) + return NULL; + + ctx->SetDebugPos(pos); + llvm::Value *lvalue = GetLValue(ctx); + if (!lvalue) { + // We may be indexing into a temporary that hasn't hit memory, so + // get the full value and stuff it into temporary alloca'd space so + // that we can index from there... 
+ llvm::Value *val = arrayOrVector->GetValue(ctx); + if (val == NULL) { + assert(m->errorCount > 0); + return NULL; + } + ctx->SetDebugPos(pos); + llvm::Value *ptr = ctx->AllocaInst(arrayOrVectorType->LLVMType(g->ctx), + "array_tmp"); + ctx->StoreInst(val, ptr); + ptr = lCastUniformVectorBasePtr(ptr, ctx); + lvalue = ctx->GetElementPtrInst(ptr, LLVMInt32(0), index->GetValue(ctx)); + } + + ctx->SetDebugPos(pos); + return ctx->LoadInst(lvalue, GetType(), "index"); +} + + +const Type * +IndexExpr::GetType() const { + const Type *arrayOrVectorType, *indexType; + if (!arrayOrVector || !index || + ((arrayOrVectorType = arrayOrVector->GetType()) == NULL) || + ((indexType = index->GetType()) == NULL)) + return NULL; + + const SequentialType *sequentialType = + dynamic_cast(arrayOrVectorType->GetReferenceTarget()); + // Typechecking should have caught this... + assert(sequentialType != NULL); + + const Type *elementType = sequentialType->GetElementType(); + if (indexType->IsUniformType()) + // If the index is uniform, the resulting type is just whatever the + // element type is + return elementType; + else + // A varying index into uniform array/vector -> varying type (and + // same for varying array of course...) + return elementType->GetAsVaryingType(); +} + + +Symbol * +IndexExpr::GetBaseSymbol() const { + return arrayOrVector ? 
arrayOrVector->GetBaseSymbol() : NULL; +} + + +llvm::Value * +IndexExpr::GetLValue(FunctionEmitContext *ctx) const { + const Type *type; + if (!arrayOrVector || !index || ((type = arrayOrVector->GetType()) == NULL)) + return NULL; + + ctx->SetDebugPos(pos); + llvm::Value *basePtr = NULL; + if (dynamic_cast(type) || + dynamic_cast(type)) + basePtr = arrayOrVector->GetLValue(ctx); + else { + type = type->GetReferenceTarget(); + assert(dynamic_cast(type) || + dynamic_cast(type)); + basePtr = arrayOrVector->GetValue(ctx); + } + if (!basePtr) + return NULL; + + basePtr = lCastUniformVectorBasePtr(basePtr, ctx); + + ctx->SetDebugPos(pos); + return ctx->GetElementPtrInst(basePtr, LLVMInt32(0), index->GetValue(ctx)); +} + + +Expr * +IndexExpr::Optimize() { + if (arrayOrVector) + arrayOrVector = arrayOrVector->Optimize(); + if (index) + index = index->Optimize(); + if (arrayOrVector == NULL || index == NULL) + return NULL; + + return this; +} + + +Expr * +IndexExpr::TypeCheck() { + if (arrayOrVector) + arrayOrVector = arrayOrVector->TypeCheck(); + if (index) + index = index->TypeCheck(); + + if (!arrayOrVector || !index || !index->GetType()) + return NULL; + + const Type *arrayOrVectorType = arrayOrVector->GetType(); + if (!arrayOrVectorType) + return NULL; + + if (dynamic_cast(arrayOrVectorType->GetReferenceTarget()) == NULL) { + Error(pos, "Trying to index into non-array or vector type \"%s\".", + arrayOrVectorType->GetString().c_str()); + return NULL; + } + + bool isUniform = (index->GetType()->IsUniformType() && + !g->opt.disableUniformMemoryOptimizations); + const Type *indexType = isUniform ? 
AtomicType::UniformInt32 : + AtomicType::VaryingInt32; + index = index->TypeConv(indexType, "array index"); + if (!index) + return NULL; + + return this; +} + + +void +IndexExpr::Print() const { + if (!arrayOrVector || !index || !GetType()) + return; + + printf("[%s] index ", GetType()->GetString().c_str()); + arrayOrVector->Print(); + printf("["); + index->Print(); + printf("]"); + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// MemberExpr + +MemberExpr::MemberExpr(Expr *e, const char *id, SourcePos p, SourcePos idpos) + : Expr(p), identifierPos(idpos) { + expr = e; + identifier = id; +} + + +llvm::Value * +MemberExpr::GetValue(FunctionEmitContext *ctx) const { + if (!expr) + return NULL; + + llvm::Value *lvalue = GetLValue(ctx); + if (!lvalue) { + // As in the array case, this may be a temporary that hasn't hit + // memory; get the full value and stuff it into a temporary array + // so that we can index from there... + llvm::Value *val = expr->GetValue(ctx); + if (!val) { + assert(m->errorCount > 0); + return NULL; + } + ctx->SetDebugPos(pos); + const Type *exprType = expr->GetType(); + llvm::Value *ptr = ctx->AllocaInst(exprType->LLVMType(g->ctx), + "struct_tmp"); + ctx->StoreInst(val, ptr); + + int elementNumber = getElementNumber(); + if (elementNumber == -1) + return NULL; + lvalue = ctx->GetElementPtrInst(ptr, 0, elementNumber); + } + + ctx->SetDebugPos(pos); + return ctx->LoadInst(lvalue, GetType(), "structelement"); +} + + +const Type * +MemberExpr::GetType() const { + if (!expr) + return NULL; + + const Type *exprType = expr->GetType(); + if (!exprType) + return NULL; + + const StructType *structType = dynamic_cast(exprType); + const VectorType *vectorType = dynamic_cast(exprType); + if (!structType && !vectorType) { + const ReferenceType *referenceType = + dynamic_cast(exprType); + const Type *refTarget = (referenceType == NULL) ? 
NULL : + referenceType->GetReferenceTarget(); + if ((structType = dynamic_cast(refTarget)) == NULL && + (vectorType = dynamic_cast(refTarget)) == NULL) { + Error(pos, "Can't access member of non-struct/vector type \"%s\".", + exprType->GetString().c_str()); + return NULL; + } + } + + if (vectorType != NULL) + // only one-element vector selection is supported for now (i.e. no + // swizzling "foo.xxy"), so the result type is always just the + // element type. + return vectorType->GetElementType(); + else { + // Otherwise it's a struct, and the result type is the element + // type, possibly promoted to varying if the struct type / lvalue + // is varying. + const Type *elementType = structType->GetMemberType(identifier); + if (!elementType) + Error(identifierPos, "Element name \"%s\" not present in struct type \"%s\".%s", + identifier.c_str(), structType->GetString().c_str(), + getCandidateNearMatches().c_str()); + + if (exprType->IsVaryingType()) + return elementType->GetAsVaryingType(); + else + return elementType; + } +} + + +Symbol * +MemberExpr::GetBaseSymbol() const { + return expr ? expr->GetBaseSymbol() : NULL; +} + + +/** Map one character ids to vector element numbers. Allow a few different + conventions--xyzw, rgba, uv. + */ +static int +lIdentifierToVectorElement(char id) { + switch (id) { + case 'x': + case 'r': + case 'u': + return 0; + case 'y': + case 'g': + case 'v': + return 1; + case 'z': + case 'b': + return 2; + case 'w': + case 'a': + return 3; + default: + return -1; + } +} + + +int +MemberExpr::getElementNumber() const { + const Type *exprType; + if (!expr || ((exprType = expr->GetType()) == NULL)) + return -1; + + const StructType *structType = dynamic_cast(exprType); + const VectorType *vectorType = dynamic_cast(exprType); + if (!structType && !vectorType) { + const ReferenceType *referenceType = + dynamic_cast(exprType); + const Type *refTarget = (referenceType == NULL) ? 
NULL : + referenceType->GetReferenceTarget() ; + if ((structType = dynamic_cast(refTarget)) == NULL && + (vectorType = dynamic_cast(refTarget)) == NULL) + // FIXME: I think we shouldn't ever get here and that + // typechecking should have caught this case + return -1; + } + + int elementNumber = -1; + if (vectorType) { + if (identifier.size() != 1) { + Error(pos, "Only single-character vector element accessors are currently " + "supported--\"%s\" is invalid. Sorry.", identifier.c_str()); + } + else { + elementNumber = lIdentifierToVectorElement(identifier[0]); + if (elementNumber == -1) + Error(pos, "Vector element identifier \"%s\" unknown.", + identifier.c_str()); + } + } + else { + elementNumber = structType->GetMemberNumber(identifier); + if (elementNumber == -1) + Error(identifierPos, "Element name \"%s\" not present in struct type \"%s\".%s", + identifier.c_str(), structType->GetString().c_str(), + getCandidateNearMatches().c_str()); + } + return elementNumber; +} + + + +llvm::Value * +MemberExpr::GetLValue(FunctionEmitContext *ctx) const { + const Type *exprType; + if (!expr || ((exprType = expr->GetType()) == NULL)) + return NULL; + + ctx->SetDebugPos(pos); + const StructType *structType = dynamic_cast(exprType); + const VectorType *vectorType = dynamic_cast(exprType); + llvm::Value *basePtr = NULL; + if (structType || vectorType) + basePtr = expr->GetLValue(ctx); + else { + const ReferenceType *referenceType = dynamic_cast(exprType); + // FIXME: store structType and vectorType as members, or do all + // this in a separate function? This code to figure out + // struct/vectorType is replicated a bunch of times in + // MemberExpr... + const Type *refTarget = (referenceType == NULL) ? 
NULL : + referenceType->GetReferenceTarget() ; + if ((structType = dynamic_cast(refTarget)) == NULL && + (vectorType = dynamic_cast(refTarget)) == NULL) { + // FIXME: again I think typechecking should have caught this + Error(pos, "Can't access member of non-struct/vector type \"%s\".", + exprType->GetString().c_str()); + return NULL; + } + basePtr = expr->GetValue(ctx); + } + if (!basePtr) + return NULL; + + int elementNumber = getElementNumber(); + if (elementNumber == -1) + return NULL; + + ctx->SetDebugPos(pos); + return ctx->GetElementPtrInst(basePtr, 0, elementNumber); +} + + +Expr * +MemberExpr::TypeCheck() { + if (expr) + expr = expr->TypeCheck(); + return expr ? this : NULL; +} + + +Expr * +MemberExpr::Optimize() { + if (expr) + expr = expr->Optimize(); + return expr ? this : NULL; +} + + +void +MemberExpr::Print() const { + if (!expr || !GetType()) + return; + + printf("[%s] member (", GetType()->GetString().c_str()); + expr->Print(); + printf(" . %s)", identifier.c_str()); + pos.Print(); +} + + +/** There is no structure member with the name we've got in "identifier". + Use the approximate string matching routine to see if the identifier is + a minor misspelling of one of the ones that is there. 
+ */ +std::string +MemberExpr::getCandidateNearMatches() const { + const StructType *structType = + dynamic_cast(expr->GetType()); + if (!structType) + return ""; + + std::vector elementNames; + for (int i = 0; i < structType->NumElements(); ++i) + elementNames.push_back(structType->GetElementName(i)); + std::vector alternates = MatchStrings(identifier, elementNames); + if (!alternates.size()) + return ""; + + std::string ret = " Did you mean "; + for (unsigned int i = 0; i < alternates.size(); ++i) { + ret += std::string("\"") + alternates[i] + std::string("\""); + if (i < alternates.size() - 1) ret += ", or "; + } + ret += "?"; + return ret; +} + + +/////////////////////////////////////////////////////////////////////////// +// ConstExpr + +ConstExpr::ConstExpr(const Type *t, int32_t i, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstInt32); + int32Val[0] = i; +} + + +ConstExpr::ConstExpr(const Type *t, int32_t *i, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstInt32 || + type == AtomicType::VaryingConstInt32); + for (int j = 0; j < Count(); ++j) + int32Val[j] = i[j]; +} + + +ConstExpr::ConstExpr(const Type *t, uint32_t u, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstUInt32); + uint32Val[0] = u; +} + + +ConstExpr::ConstExpr(const Type *t, uint32_t *u, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstUInt32 || + type == AtomicType::VaryingConstUInt32); + for (int j = 0; j < Count(); ++j) + uint32Val[j] = u[j]; +} + + +ConstExpr::ConstExpr(const Type *t, float f, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = 
type->GetAsConstType(); + assert(type == AtomicType::UniformConstFloat); + floatVal[0] = f; +} + + +ConstExpr::ConstExpr(const Type *t, float *f, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstFloat || + type == AtomicType::VaryingConstFloat); + for (int j = 0; j < Count(); ++j) + floatVal[j] = f[j]; +} + + +ConstExpr::ConstExpr(const Type *t, int64_t i, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstInt64); + int64Val[0] = i; +} + + +ConstExpr::ConstExpr(const Type *t, int64_t *i, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstInt64 || + type == AtomicType::VaryingConstInt64); + for (int j = 0; j < Count(); ++j) + int64Val[j] = i[j]; +} + + +ConstExpr::ConstExpr(const Type *t, uint64_t u, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformUInt64); + uint64Val[0] = u; +} + + +ConstExpr::ConstExpr(const Type *t, uint64_t *u, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstUInt64 || + type == AtomicType::VaryingConstUInt64); + for (int j = 0; j < Count(); ++j) + uint64Val[j] = u[j]; +} + + +ConstExpr::ConstExpr(const Type *t, double f, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstDouble); + doubleVal[0] = f; +} + + +ConstExpr::ConstExpr(const Type *t, double *f, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstDouble || + type == AtomicType::VaryingConstDouble); 
+ for (int j = 0; j < Count(); ++j) + doubleVal[j] = f[j]; +} + + +ConstExpr::ConstExpr(const Type *t, bool b, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstBool); + boolVal[0] = b; +} + + +ConstExpr::ConstExpr(const Type *t, bool *b, SourcePos p) + : Expr(p) { + type = dynamic_cast(t); + assert(type != NULL); + type = type->GetAsConstType(); + assert(type == AtomicType::UniformConstBool || + type == AtomicType::VaryingConstBool); + for (int j = 0; j < Count(); ++j) + boolVal[j] = b[j]; +} + + +ConstExpr::ConstExpr(ConstExpr *old, double *v) + : Expr(old->pos) { + type = old->type; + switch (type->basicType) { + case AtomicType::TYPE_BOOL: + for (int i = 0; i < Count(); ++i) + boolVal[i] = (v[i] != 0.); + break; + case AtomicType::TYPE_INT32: + for (int i = 0; i < Count(); ++i) + int32Val[i] = (int)v[i]; + break; + case AtomicType::TYPE_UINT32: + for (int i = 0; i < Count(); ++i) + uint32Val[i] = (unsigned int)v[i]; + break; + case AtomicType::TYPE_FLOAT: + for (int i = 0; i < Count(); ++i) + floatVal[i] = (float)v[i]; + break; + case AtomicType::TYPE_DOUBLE: + for (int i = 0; i < Count(); ++i) + doubleVal[i] = v[i]; + break; + case AtomicType::TYPE_INT64: + case AtomicType::TYPE_UINT64: + FATAL("fixme; we need another constructor so that we're not trying to pass " + "double values to init an int64 type..."); + default: + FATAL("unimplemented const type"); + } +} + + +const Type * +ConstExpr::GetType() const { + return type; +} + + +llvm::Value * +ConstExpr::GetValue(FunctionEmitContext *ctx) const { + ctx->SetDebugPos(pos); + bool isVarying = type->IsVaryingType(); + + // ConstExpr only represents atomic types; just dispatch out to the + // appropriate utility routine to get the llvm constant value of the + // type we need. + switch (type->basicType) { + case AtomicType::TYPE_BOOL: + if (isVarying) + return LLVMBoolVector(boolVal); + else + return boolVal[0] ? 
LLVMTrue : LLVMFalse; + case AtomicType::TYPE_INT32: + return isVarying ? LLVMInt32Vector(int32Val) : + LLVMInt32(int32Val[0]); + case AtomicType::TYPE_UINT32: + return isVarying ? LLVMUInt32Vector(uint32Val) : + LLVMUInt32(uint32Val[0]); + case AtomicType::TYPE_FLOAT: + return isVarying ? LLVMFloatVector(floatVal) : + LLVMFloat(floatVal[0]); + case AtomicType::TYPE_INT64: + return isVarying ? LLVMInt64Vector(int64Val) : + LLVMInt64(int64Val[0]); + case AtomicType::TYPE_UINT64: + return isVarying ? LLVMUInt64Vector(uint64Val) : + LLVMUInt64(uint64Val[0]); + case AtomicType::TYPE_DOUBLE: + return isVarying ? LLVMDoubleVector(doubleVal) : + LLVMDouble(doubleVal[0]); + default: + FATAL("unimplemented const type"); + return NULL; + } +} + + +/* Type conversion templates: take advantage of C++ function overloading + rules to get the one we want to match. */ + +/* First the most general case, just use C++ type conversion if nothing + else matches */ +template static inline void +lConvertElement(From from, To *to) { + *to = (To)from; +} + + +/** When converting from bool types to numeric types, make sure the result + is one or zero. + FIXME: this is a different rule than we use elsewhere, where we sign extend + the bool. We should fix the other case to just zero extend and then + patch up places in the stdlib that depend on sign extension to call a + routine to make that happen. + */ +template static inline void +lConvertElement(bool from, To *to) { + *to = from ? (To)1 : (To)0; +} + + +/** When converting numeric types to bool, compare to zero. (Do we + actually need this one??) 
*/ +template static inline void +lConvertElement(From from, bool *to) { + *to = (from != 0); +} + + +/** And bool -> bool is just assignment */ +static inline void +lConvertElement(bool from, bool *to) { + *to = from; +} + + +/** Type conversion utility function + */ +template static void +lConvert(const From *from, To *to, int count, bool forceVarying) { + for (int i = 0; i < count; ++i) + lConvertElement(from[i], &to[i]); + + if (forceVarying && count == 1) + for (int i = 1; i < g->target.vectorWidth; ++i) + to[i] = to[0]; +} + + +int +ConstExpr::AsInt64(int64_t *ip, bool forceVarying) const { + switch (type->basicType) { + case AtomicType::TYPE_BOOL: lConvert(boolVal, ip, Count(), forceVarying); break; + case AtomicType::TYPE_INT32: lConvert(int32Val, ip, Count(), forceVarying); break; + case AtomicType::TYPE_UINT32: lConvert(uint32Val, ip, Count(), forceVarying); break; + case AtomicType::TYPE_FLOAT: lConvert(floatVal, ip, Count(), forceVarying); break; + case AtomicType::TYPE_DOUBLE: lConvert(doubleVal, ip, Count(), forceVarying); break; + case AtomicType::TYPE_INT64: lConvert(int64Val, ip, Count(), forceVarying); break; + case AtomicType::TYPE_UINT64: lConvert(uint64Val, ip, Count(), forceVarying); break; + default: + FATAL("unimplemented const type"); + } + return Count(); +} + + +int +ConstExpr::AsUInt64(uint64_t *up, bool forceVarying) const { + switch (type->basicType) { + case AtomicType::TYPE_BOOL: lConvert(boolVal, up, Count(), forceVarying); break; + case AtomicType::TYPE_INT32: lConvert(int32Val, up, Count(), forceVarying); break; + case AtomicType::TYPE_UINT32: lConvert(uint32Val, up, Count(), forceVarying); break; + case AtomicType::TYPE_FLOAT: lConvert(floatVal, up, Count(), forceVarying); break; + case AtomicType::TYPE_DOUBLE: lConvert(doubleVal, up, Count(), forceVarying); break; + case AtomicType::TYPE_INT64: lConvert(int64Val, up, Count(), forceVarying); break; + case AtomicType::TYPE_UINT64: lConvert(uint64Val, up, Count(), forceVarying); 
break; + default: + FATAL("unimplemented const type"); + } + return Count(); +} + + +int +ConstExpr::AsDouble(double *d, bool forceVarying) const { + switch (type->basicType) { + case AtomicType::TYPE_BOOL: lConvert(boolVal, d, Count(), forceVarying); break; + case AtomicType::TYPE_INT32: lConvert(int32Val, d, Count(), forceVarying); break; + case AtomicType::TYPE_UINT32: lConvert(uint32Val, d, Count(), forceVarying); break; + case AtomicType::TYPE_FLOAT: lConvert(floatVal, d, Count(), forceVarying); break; + case AtomicType::TYPE_DOUBLE: lConvert(doubleVal, d, Count(), forceVarying); break; + case AtomicType::TYPE_INT64: lConvert(int64Val, d, Count(), forceVarying); break; + case AtomicType::TYPE_UINT64: lConvert(uint64Val, d, Count(), forceVarying); break; + default: + FATAL("unimplemented const type"); + } + return Count(); +} + + +int +ConstExpr::AsFloat(float *fp, bool forceVarying) const { + switch (type->basicType) { + case AtomicType::TYPE_BOOL: lConvert(boolVal, fp, Count(), forceVarying); break; + case AtomicType::TYPE_INT32: lConvert(int32Val, fp, Count(), forceVarying); break; + case AtomicType::TYPE_UINT32: lConvert(uint32Val, fp, Count(), forceVarying); break; + case AtomicType::TYPE_FLOAT: lConvert(floatVal, fp, Count(), forceVarying); break; + case AtomicType::TYPE_DOUBLE: lConvert(doubleVal, fp, Count(), forceVarying); break; + case AtomicType::TYPE_INT64: lConvert(int64Val, fp, Count(), forceVarying); break; + case AtomicType::TYPE_UINT64: lConvert(uint64Val, fp, Count(), forceVarying); break; + default: + FATAL("unimplemented const type"); + } + return Count(); +} + + +int +ConstExpr::AsBool(bool *b, bool forceVarying) const { + switch (type->basicType) { + case AtomicType::TYPE_BOOL: lConvert(boolVal, b, Count(), forceVarying); break; + case AtomicType::TYPE_INT32: lConvert(int32Val, b, Count(), forceVarying); break; + case AtomicType::TYPE_UINT32: lConvert(uint32Val, b, Count(), forceVarying); break; + case AtomicType::TYPE_FLOAT: 
lConvert(floatVal, b, Count(), forceVarying); break; + case AtomicType::TYPE_DOUBLE: lConvert(doubleVal, b, Count(), forceVarying); break; + case AtomicType::TYPE_INT64: lConvert(int64Val, b, Count(), forceVarying); break; + case AtomicType::TYPE_UINT64: lConvert(uint64Val, b, Count(), forceVarying); break; + default: + FATAL("unimplemented const type"); + } + return Count(); +} + + +int +ConstExpr::AsInt32(int32_t *ip, bool forceVarying) const { + switch (type->basicType) { + case AtomicType::TYPE_BOOL: lConvert(boolVal, ip, Count(), forceVarying); break; + case AtomicType::TYPE_INT32: lConvert(int32Val, ip, Count(), forceVarying); break; + case AtomicType::TYPE_UINT32: lConvert(uint32Val, ip, Count(), forceVarying); break; + case AtomicType::TYPE_FLOAT: lConvert(floatVal, ip, Count(), forceVarying); break; + case AtomicType::TYPE_DOUBLE: lConvert(doubleVal, ip, Count(), forceVarying); break; + case AtomicType::TYPE_INT64: lConvert(int64Val, ip, Count(), forceVarying); break; + case AtomicType::TYPE_UINT64: lConvert(uint64Val, ip, Count(), forceVarying); break; + default: + FATAL("unimplemented const type"); + } + return Count(); +} + + +int +ConstExpr::AsUInt32(uint32_t *up, bool forceVarying) const { + switch (type->basicType) { + case AtomicType::TYPE_BOOL: lConvert(boolVal, up, Count(), forceVarying); break; + case AtomicType::TYPE_INT32: lConvert(int32Val, up, Count(), forceVarying); break; + case AtomicType::TYPE_UINT32: lConvert(uint32Val, up, Count(), forceVarying); break; + case AtomicType::TYPE_FLOAT: lConvert(floatVal, up, Count(), forceVarying); break; + case AtomicType::TYPE_DOUBLE: lConvert(doubleVal, up, Count(), forceVarying); break; + case AtomicType::TYPE_INT64: lConvert(int64Val, up, Count(), forceVarying); break; + case AtomicType::TYPE_UINT64: lConvert(uint64Val, up, Count(), forceVarying); break; + default: + FATAL("unimplemented const type"); + } + return Count(); +} + + +int +ConstExpr::Count() const { + return GetType()->IsVaryingType() ? 
        g->target.vectorWidth : 1;
}


/** Return an llvm::Constant holding this ConstExpr's value(s), converted
    to the given (atomic) target type. */
llvm::Constant *
ConstExpr::GetConstant(const Type *type) const {
    // Caller shouldn't be trying to stuff a varying value here into a
    // constant type.
    if (type->IsUniformType())
        assert(Count() == 1);

    // Const-ness is irrelevant for the llvm constant value; compare
    // against the non-const variants of the atomic types below.
    type = type->GetAsNonConstType();
    if (type == AtomicType::UniformBool || type == AtomicType::VaryingBool) {
        bool bv[ISPC_MAX_NVEC];
        AsBool(bv, type->IsVaryingType());
        if (type->IsUniformType())
            return bv[0] ? LLVMTrue : LLVMFalse;
        else
            return LLVMBoolVector(bv);
    }
    else if (type == AtomicType::UniformInt32 || type == AtomicType::VaryingInt32) {
        int32_t iv[ISPC_MAX_NVEC];
        AsInt32(iv, type->IsVaryingType());
        if (type->IsUniformType())
            return LLVMInt32(iv[0]);
        else
            return LLVMInt32Vector(iv);
    }
    else if (type == AtomicType::UniformUInt32 || type == AtomicType::VaryingUInt32) {
        uint32_t uiv[ISPC_MAX_NVEC];
        AsUInt32(uiv, type->IsVaryingType());
        if (type->IsUniformType())
            return LLVMUInt32(uiv[0]);
        else
            return LLVMUInt32Vector(uiv);
    }
    else if (type == AtomicType::UniformFloat || type == AtomicType::VaryingFloat) {
        float fv[ISPC_MAX_NVEC];
        AsFloat(fv, type->IsVaryingType());
        if (type->IsUniformType())
            return LLVMFloat(fv[0]);
        else
            return LLVMFloatVector(fv);
    }
    else if (type == AtomicType::UniformInt64 || type == AtomicType::VaryingInt64) {
        int64_t iv[ISPC_MAX_NVEC];
        AsInt64(iv, type->IsVaryingType());
        if (type->IsUniformType())
            return LLVMInt64(iv[0]);
        else
            return LLVMInt64Vector(iv);
    }
    else if (type == AtomicType::UniformUInt64 || type == AtomicType::VaryingUInt64) {
        uint64_t uiv[ISPC_MAX_NVEC];
        AsUInt64(uiv, type->IsVaryingType());
        if (type->IsUniformType())
            return LLVMUInt64(uiv[0]);
        else
            return LLVMUInt64Vector(uiv);
    }
    else if (type == AtomicType::UniformDouble || type == AtomicType::VaryingDouble) {
        double dv[ISPC_MAX_NVEC];
        AsDouble(dv, type->IsVaryingType());
        if (type->IsUniformType())
            return LLVMDouble(dv[0]);
        else
            return LLVMDoubleVector(dv);
    }
    else {
        FATAL("unexpected type in ConstExpr::GetConstant()");
        return NULL;
    }
}


// A constant is already fully optimized.
Expr *
ConstExpr::Optimize() {
    return this;
}


// A constant always typechecks successfully as-is.
Expr *
ConstExpr::TypeCheck() {
    return this;
}



/** Print the constant's type and per-lane values for debugging. */
void
ConstExpr::Print() const {
    printf("[%s] (", GetType()->GetString().c_str());
    for (int i = 0; i < Count(); ++i) {
        switch (type->basicType) {
        case AtomicType::TYPE_BOOL:
            printf("%s", boolVal[i] ? "true" : "false");
            break;
        case AtomicType::TYPE_INT32:
            printf("%d", int32Val[i]);
            break;
        case AtomicType::TYPE_UINT32:
            printf("%u", uint32Val[i]);
            break;
        case AtomicType::TYPE_FLOAT:
            printf("%f", floatVal[i]);
            break;
        case AtomicType::TYPE_INT64:
            // 64-bit format specifiers differ between platforms.
#ifdef ISPC_IS_LINUX
            printf("%ld", int64Val[i]);
#else
            printf("%lld", int64Val[i]);
#endif
            break;
        case AtomicType::TYPE_UINT64:
#ifdef ISPC_IS_LINUX
            printf("%lu", uint64Val[i]);
#else
            printf("%llu", uint64Val[i]);
#endif
            break;
        case AtomicType::TYPE_DOUBLE:
            printf("%f", doubleVal[i]);
            break;
        default:
            FATAL("unimplemented const type");
        }
        if (i != Count() - 1)
            printf(", ");
    }
    printf(")");
    pos.Print();
}


///////////////////////////////////////////////////////////////////////////
// TypeCastExpr

TypeCastExpr::TypeCastExpr(const Type *t, Expr *e, SourcePos p)
  : Expr(p) {
    type = t;
    expr = e;
}


/** Handle all the grungy details of type conversion between atomic types.
    Given an input value in exprVal of type fromType, convert it to the
    llvm::Value with type toType.
 */
static llvm::Value *
lTypeConvAtomic(FunctionEmitContext *ctx, llvm::Value *exprVal,
                const AtomicType *toType, const AtomicType *fromType,
                SourcePos pos) {
    llvm::Value *cast = NULL;

    switch (toType->basicType) {
    case AtomicType::TYPE_FLOAT: {
        const llvm::Type *targetType =
            fromType->IsUniformType() ?
            LLVMTypes::FloatType :
            LLVMTypes::FloatVectorType;
        switch (fromType->basicType) {
        case AtomicType::TYPE_BOOL:
            if (fromType->IsVaryingType() &&
                LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
                // If we have a bool vector of i32 elements, first truncate
                // down to a single bit
                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
            // And then do an unsigned int->float cast
            cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int
                                 exprVal, targetType, "bool2float");
            break;
        case AtomicType::TYPE_INT32:
        case AtomicType::TYPE_INT64:
            cast = ctx->CastInst(llvm::Instruction::SIToFP, // signed int to float
                                 exprVal, targetType, "int2float");
            break;
        case AtomicType::TYPE_UINT32:
        case AtomicType::TYPE_UINT64:
            if (fromType->IsVaryingType())
                PerformanceWarning(pos, "Conversion from unsigned int to float is slow. "
                                   "Use \"int\" if possible");
            cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int to float
                                 exprVal, targetType, "uint2float");
            break;
        case AtomicType::TYPE_FLOAT:
            // No-op cast.
            cast = exprVal;
            break;
        case AtomicType::TYPE_DOUBLE:
            cast = ctx->FPCastInst(exprVal, targetType, "double2float");
            break;
        default:
            FATAL("unimplemented");
        }
        break;
    }
    case AtomicType::TYPE_DOUBLE: {
        const llvm::Type *targetType =
            fromType->IsUniformType() ?
            LLVMTypes::DoubleType :
            LLVMTypes::DoubleVectorType;
        switch (fromType->basicType) {
        case AtomicType::TYPE_BOOL:
            if (fromType->IsVaryingType() &&
                LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
                // truncate i32 bool vector values to i1s
                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
            cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int to double
                                 exprVal, targetType, "bool2double");
            break;
        case AtomicType::TYPE_INT32:
        case AtomicType::TYPE_INT64:
            cast = ctx->CastInst(llvm::Instruction::SIToFP, // signed int
                                 exprVal, targetType, "int2double");
            break;
        case AtomicType::TYPE_UINT32:
        case AtomicType::TYPE_UINT64:
            if (fromType->IsVaryingType())
                PerformanceWarning(pos, "Conversion from unsigned int64 to float is slow. "
                                   "Use \"int64\" if possible");
            cast = ctx->CastInst(llvm::Instruction::UIToFP, // unsigned int
                                 exprVal, targetType, "uint2double");
            break;
        case AtomicType::TYPE_FLOAT:
            cast = ctx->FPCastInst(exprVal, targetType, "float2double");
            break;
        case AtomicType::TYPE_DOUBLE:
            // No-op cast.
            cast = exprVal;
            break;
        default:
            FATAL("unimplemented");
        }
        break;
    }
    case AtomicType::TYPE_INT32: {
        const llvm::Type *targetType =
            fromType->IsUniformType() ? LLVMTypes::Int32Type :
                                        LLVMTypes::Int32VectorType;
        switch (fromType->basicType) {
        case AtomicType::TYPE_BOOL:
            if (fromType->IsVaryingType() &&
                LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
            // FIXME: we're currently doing sign extension rather than zero
            // extension here, which means that ints will have the value
            // 0xffffffff for 'true' bools (versus the value 1).  There is
            // some code in stdlib.ispc that depends on bool->int conversions
            // having this behavior that needs to be cleaned up (e.g. to
            // call a __sext() builtin to do bool->int conversions) if we
            // are going to fix this here.
            cast = ctx->SExtInst(exprVal, targetType, "bool2int");
            break;
        case AtomicType::TYPE_INT32:
        case AtomicType::TYPE_UINT32:
            // Same bit width and representation; no conversion needed.
            cast = exprVal;
            break;
        case AtomicType::TYPE_FLOAT:
            cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
                                 exprVal, targetType, "float2int");
            break;
        case AtomicType::TYPE_INT64:
        case AtomicType::TYPE_UINT64:
            cast = ctx->TruncInst(exprVal, targetType, "int64_to_int32");
            break;
        case AtomicType::TYPE_DOUBLE:
            cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
                                 exprVal, targetType, "double2int");
            break;
        default:
            FATAL("unimplemented");
        }
        break;
    }
    case AtomicType::TYPE_UINT32: {
        const llvm::Type *targetType =
            fromType->IsUniformType() ? LLVMTypes::Int32Type :
                                        LLVMTypes::Int32VectorType;
        switch (fromType->basicType) {
        case AtomicType::TYPE_BOOL:
            if (fromType->IsVaryingType() &&
                LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
            // FIXME: See comments above w.r.t. fixing this to be a
            // ZExtInst rather than an SExtInst...
            cast = ctx->SExtInst(exprVal, targetType, "bool2uint");
            break;
        case AtomicType::TYPE_INT32:
        case AtomicType::TYPE_UINT32:
            cast = exprVal;
            break;
        case AtomicType::TYPE_FLOAT:
            if (fromType->IsVaryingType())
                PerformanceWarning(pos, "Conversion from float to unsigned int is slow. "
                                   "Use \"int\" if possible");
            cast = ctx->CastInst(llvm::Instruction::FPToUI, // unsigned int
                                 exprVal, targetType, "float2uint");
            break;
        case AtomicType::TYPE_INT64:
        case AtomicType::TYPE_UINT64:
            cast = ctx->TruncInst(exprVal, targetType, "int64_to_uint32");
            break;
        case AtomicType::TYPE_DOUBLE:
            if (fromType->IsVaryingType())
                PerformanceWarning(pos, "Conversion from double to unsigned int is slow. "
                                   "Use \"int\" if possible");
            cast = ctx->CastInst(llvm::Instruction::FPToUI, // unsigned int
                                 exprVal, targetType, "double2uint");
            break;
        default:
            FATAL("unimplemented");
        }
        break;
    }
    case AtomicType::TYPE_INT64: {
        const llvm::Type *targetType =
            fromType->IsUniformType() ? LLVMTypes::Int64Type :
                                        LLVMTypes::Int64VectorType;
        switch (fromType->basicType) {
        case AtomicType::TYPE_BOOL:
            if (fromType->IsVaryingType() &&
                LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
            cast = ctx->SExtInst(exprVal, targetType, "bool2int64");
            break;
        case AtomicType::TYPE_INT32:
            cast = ctx->SExtInst(exprVal, targetType, "int32_to_int64");
            break;
        case AtomicType::TYPE_UINT32:
            cast = ctx->ZExtInst(exprVal, targetType, "uint32_to_int64");
            break;
        case AtomicType::TYPE_FLOAT:
            cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
                                 exprVal, targetType, "float2int64");
            break;
        case AtomicType::TYPE_INT64:
        case AtomicType::TYPE_UINT64:
            cast = exprVal;
            break;
        case AtomicType::TYPE_DOUBLE:
            cast = ctx->CastInst(llvm::Instruction::FPToSI, // signed int
                                 exprVal, targetType, "double2int");
            break;
        default:
            FATAL("unimplemented");
        }
        break;
    }
    case AtomicType::TYPE_UINT64: {
        const llvm::Type *targetType =
            fromType->IsUniformType() ?
            LLVMTypes::Int64Type :
            LLVMTypes::Int64VectorType;
        switch (fromType->basicType) {
        case AtomicType::TYPE_BOOL:
            if (fromType->IsVaryingType() &&
                LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
                exprVal = ctx->TruncInst(exprVal, LLVMTypes::Int1VectorType, "bool_to_i1");
            cast = ctx->SExtInst(exprVal, targetType, "bool2uint");
            break;
        case AtomicType::TYPE_INT32:
            cast = ctx->SExtInst(exprVal, targetType, "int32_to_uint64");
            break;
        case AtomicType::TYPE_UINT32:
            cast = ctx->ZExtInst(exprVal, targetType, "uint32_to_uint64");
            break;
        case AtomicType::TYPE_FLOAT:
            if (fromType->IsVaryingType())
                PerformanceWarning(pos, "Conversion from float to unsigned int64 is slow. "
                                   "Use \"int64\" if possible");
            cast = ctx->CastInst(llvm::Instruction::FPToUI, // unsigned int
                                 exprVal, targetType, "float2uint");
            break;
        case AtomicType::TYPE_INT64:
        case AtomicType::TYPE_UINT64:
            cast = exprVal;
            break;
        case AtomicType::TYPE_DOUBLE:
            if (fromType->IsVaryingType())
                PerformanceWarning(pos, "Conversion from double to unsigned int64 is slow. "
                                   "Use \"int64\" if possible");
            cast = ctx->CastInst(llvm::Instruction::FPToUI, // unsigned int
                                 exprVal, targetType, "double2uint");
            break;
        default:
            FATAL("unimplemented");
        }
        break;
    }
    case AtomicType::TYPE_BOOL: {
        // Conversions to bool compare the value against zero.
        switch (fromType->basicType) {
        case AtomicType::TYPE_BOOL:
            cast = exprVal;
            break;
        case AtomicType::TYPE_INT32:
        case AtomicType::TYPE_UINT32: {
            llvm::Value *zero = fromType->IsUniformType() ? (llvm::Value *)LLVMInt32(0) :
                (llvm::Value *)LLVMInt32Vector(0);
            cast = ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE,
                                exprVal, zero, "cmpi0");
            break;
        }
        case AtomicType::TYPE_FLOAT: {
            llvm::Value *zero = fromType->IsUniformType() ? (llvm::Value *)LLVMFloat(0.f) :
                (llvm::Value *)LLVMFloatVector(0.f);
            cast = ctx->CmpInst(llvm::Instruction::FCmp, llvm::CmpInst::FCMP_ONE,
                                exprVal, zero, "cmpf0");
            break;
        }
        case AtomicType::TYPE_INT64:
        case AtomicType::TYPE_UINT64: {
            llvm::Value *zero = fromType->IsUniformType() ? (llvm::Value *)LLVMInt64(0) :
                (llvm::Value *)LLVMInt64Vector((int64_t)0);
            cast = ctx->CmpInst(llvm::Instruction::ICmp, llvm::CmpInst::ICMP_NE,
                                exprVal, zero, "cmpi0");
            break;
        }
        case AtomicType::TYPE_DOUBLE: {
            llvm::Value *zero = fromType->IsUniformType() ? (llvm::Value *)LLVMDouble(0.) :
                (llvm::Value *)LLVMDoubleVector(0.);
            cast = ctx->CmpInst(llvm::Instruction::FCmp, llvm::CmpInst::FCMP_ONE,
                                exprVal, zero, "cmpd0");
            break;
        }
        default:
            FATAL("unimplemented");
        }

        if (fromType->IsUniformType()) {
            if (toType->IsVaryingType() &&
                LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) {
                // extend out to i32 bool values from i1 here.  then we'll
                // turn into a vector below, the way it does for everyone
                // else...
                cast = ctx->SExtInst(cast, LLVMTypes::BoolVectorType->getElementType(),
                                     "i1bool_to_i32bool");
            }
        }
        else
            // fromType->IsVaryingType())
            cast = ctx->I1VecToBoolVec(cast);

        break;
    }
    default:
        FATAL("unimplemented");
    }

    // If we also want to go from uniform to varying, replicate out the
    // value across the vector elements..
    if (toType->IsVaryingType() && fromType->IsUniformType()) {
        const llvm::Type *vtype = toType->LLVMType(g->ctx);
        llvm::Value *castVec = llvm::UndefValue::get(vtype);
        for (int i = 0; i < g->target.vectorWidth; ++i)
            castVec = ctx->InsertInst(castVec, cast, i, "smearinsert");
        return castVec;
    }
    else
        return cast;
}


/** Converts the given value of the given type to be the varying
    equivalent, returning the resulting value.
+ */ +static llvm::Value * +lUniformValueToVarying(FunctionEmitContext *ctx, llvm::Value *value, + const Type *type) { + // nothing to do if it's already varying + if (type->IsVaryingType()) + return value; + + const llvm::Type *llvmType = type->GetAsVaryingType()->LLVMType(g->ctx); + llvm::Value *retValue = llvm::UndefValue::get(llvmType); + + // for structs, just recursively make their elements varying (if + // needed) and populate the return struct + const StructType *structType = dynamic_cast(type); + if (structType != NULL) { + for (int i = 0; i < structType->NumElements(); ++i) { + llvm::Value *v = ctx->ExtractInst(value, i, "struct_element"); + v = lUniformValueToVarying(ctx, v, structType->GetMemberType(i)); + retValue = ctx->InsertInst(retValue, v, i, "set_struct_element"); + } + return retValue; + } + + // And similarly do the elements of arrays and vectors individually + const SequentialType *sequentialType = + dynamic_cast(type); + if (sequentialType != NULL) { + for (int i = 0; i < sequentialType->GetElementCount(); ++i) { + llvm::Value *v = ctx->ExtractInst(value, i, "get_element"); + v = lUniformValueToVarying(ctx, v, sequentialType->GetElementType()); + retValue = ctx->InsertInst(retValue, v, i, "set_element"); + } + return retValue; + } + + // Otherwise we must have a uniform AtomicType, so smear its value + // across the vector lanes. 
+ assert(dynamic_cast(type) != NULL); + for (int i = 0; i < g->target.vectorWidth; ++i) + retValue = ctx->InsertInst(retValue, value, i, "smearinsert"); + return retValue; +} + + + +llvm::Value * +TypeCastExpr::GetValue(FunctionEmitContext *ctx) const { + if (!expr) + return NULL; + + ctx->SetDebugPos(pos); + const Type *toType = GetType(), *fromType = expr->GetType(); + if (!toType || !fromType || toType == AtomicType::Void || + fromType == AtomicType::Void) + // an error should have been issued elsewhere in this case + return NULL; + + if (Type::Equal(toType->GetAsConstType(), fromType->GetAsConstType())) + // There's nothing to do, just return the value. (LLVM's type + // system doesn't worry about constiness.) + return expr->GetValue(ctx); + + // This also should be caught during typechecking + assert(!(toType->IsUniformType() && fromType->IsVaryingType())); + + const ReferenceType *toReference = dynamic_cast(toType); + const ReferenceType *fromReference = dynamic_cast(fromType); + if (toReference && fromReference) { + const Type *toTarget = toReference->GetReferenceTarget(); + const Type *fromTarget = fromReference->GetReferenceTarget(); + + const ArrayType *toArray = dynamic_cast(toTarget); + const ArrayType *fromArray = dynamic_cast(fromTarget); + if (toArray && fromArray) { + // cast array pointer from [n x foo] to [0 x foo] if needed to be able + // to pass to a function that takes an unsized array as a parameter + if(toArray->GetElementCount() != 0 && + (toArray->GetElementCount() != fromArray->GetElementCount())) + Warning(pos, "Type-converting array of length %d to length %d", + fromArray->GetElementCount(), toArray->GetElementCount()); + assert(Type::Equal(toArray->GetBaseType()->GetAsConstType(), + fromArray->GetBaseType()->GetAsConstType())); + llvm::Value *v = expr->GetValue(ctx); + const llvm::Type *ptype = toType->LLVMType(g->ctx); + return ctx->BitCastInst(v, ptype); //, "array_cast_0size"); + } + + assert(Type::Equal(toTarget, fromTarget) || + 
Type::Equal(toTarget, fromTarget->GetAsConstType())); + return expr->GetValue(ctx); + } + + const StructType *toStruct = dynamic_cast(toType); + const StructType *fromStruct = dynamic_cast(fromType); + if (toStruct && fromStruct) { + // The only legal type conversions for structs are to go from a + // uniform to a varying instance of the same struct type. + assert(toStruct->IsVaryingType() && fromStruct->IsUniformType() && + Type::Equal(toStruct, fromStruct->GetAsVaryingType())); + + llvm::Value *origValue = expr->GetValue(ctx); + if (!origValue) + return NULL; + return lUniformValueToVarying(ctx, origValue, fromType); + } + + const VectorType *toVector = dynamic_cast(toType); + const VectorType *fromVector = dynamic_cast(fromType); + if (toVector && fromVector) { + // this should be caught during typechecking + assert(toVector->GetElementCount() == fromVector->GetElementCount()); + + llvm::Value *exprVal = expr->GetValue(ctx); + if (!exprVal) + return NULL; + + // Emit instructions to do type conversion of each of the elements + // of the vector. + // FIXME: since uniform vectors are represented as + // llvm::VectorTypes, we should just be able to issue the + // corresponding vector type convert, which should be more + // efficient by avoiding serialization! + llvm::Value *cast = llvm::UndefValue::get(toType->LLVMType(g->ctx)); + for (int i = 0; i < toVector->GetElementCount(); ++i) { + llvm::Value *ei = ctx->ExtractInst(exprVal, i); + + llvm::Value *conv = lTypeConvAtomic(ctx, ei, toVector->GetElementType(), + fromVector->GetElementType(), pos); + if (!conv) + return NULL; + cast = ctx->InsertInst(cast, conv, i); + } + return cast; + } + + const AtomicType *fromAtomic = dynamic_cast(fromType); + // at this point, coming from an atomic type is all that's left... 
+ assert(fromAtomic != NULL); + + llvm::Value *exprVal = expr->GetValue(ctx); + if (!exprVal) + return NULL; + + if (toVector) { + // scalar -> short vector conversion + llvm::Value *conv = lTypeConvAtomic(ctx, exprVal, toVector->GetElementType(), + fromAtomic, pos); + if (!conv) + return NULL; + + llvm::Value *cast = llvm::UndefValue::get(toType->LLVMType(g->ctx)); + for (int i = 0; i < toVector->GetElementCount(); ++i) + cast = ctx->InsertInst(cast, conv, i); + return cast; + } + else { + const AtomicType *toAtomic = dynamic_cast(toType); + // typechecking should ensure this is the case + assert(toAtomic != NULL); + + return lTypeConvAtomic(ctx, exprVal, toAtomic, fromAtomic, pos); + } +} + + +const Type * +TypeCastExpr::GetType() const { + return type; +} + + +Expr * +TypeCastExpr::TypeCheck() { + if (expr != NULL) + expr = expr->TypeCheck(); + if (expr == NULL) + return NULL; + + const Type *toType = GetType(), *fromType = expr->GetType(); + if (toType == NULL || fromType == NULL) + return NULL; + + const char *toTypeString = toType->GetString().c_str(); + const char *fromTypeString = fromType->GetString().c_str(); + + // It's an error to attempt to convert from varying to uniform + if (toType->IsUniformType() && !fromType->IsUniformType()) { + Error(pos, "Can't assign 'varying' value to 'uniform' type \"%s\".", + toTypeString); + return NULL; + } + + // And any kind of void type in a type cast doesn't make sense + if (toType == AtomicType::Void || fromType == AtomicType::Void) { + Error(pos, "Void type illegal in type cast from type \"%s\" to " + "type \"%s\".", fromTypeString, toTypeString); + return NULL; + } + + // FIXME: do we need to worry more about references here? 
+ + if (dynamic_cast(fromType) != NULL) { + // Starting from a vector type; the result type must be a vector + // type as well + if (dynamic_cast(toType) == NULL) { + Error(pos, "Can't convert vector type \"%s\" to non-vector type \"%s\".", + fromTypeString, toTypeString); + return NULL; + } + + // And the two vectors must have the same number of elements + if (dynamic_cast(toType)->GetElementCount() != + dynamic_cast(fromType)->GetElementCount()) { + Error(pos, "Can't convert vector type \"%s\" to differently-sized " + "vector type \"%s\".", fromTypeString, toTypeString); + return NULL; + } + + // And we're ok; since vectors can only hold AtomicTypes, we know + // that type converting the elements will work. + return this; + } + else if (dynamic_cast(fromType) != NULL) { + FATAL("Shouldn't ever get here"); + return this; + } + else { + assert(dynamic_cast(fromType) != NULL); + // If we're going from an atomic type, the only possible result is + // another atomic type + if (dynamic_cast(toType) == NULL) { + Error(pos, "Can't convert from non-atomic type \"%s\" to \"%s\".", + fromTypeString, toTypeString); + return NULL; + } + + return this; + } + +} + + +Expr * +TypeCastExpr::Optimize() { + if (expr != NULL) + expr = expr->Optimize(); + if (expr == NULL) + return NULL; + + ConstExpr *constExpr = dynamic_cast(expr); + if (!constExpr) + // We can't do anything if this isn't a const expr + return this; + + const Type *toType = GetType(); + const AtomicType *toAtomic = dynamic_cast(toType); + // If we're not casting to an atomic type, we can't do anything here, + // since ConstExprs can only represent atomic types. (So e.g. we're + // casting from an int to an int<4>.) + if (toAtomic == NULL) + return this; + + bool forceVarying = toType->IsVaryingType(); + + // All of the type conversion smarts we need is already in the + // ConstExpr::AsBool(), etc., methods, so we just need to call the + // appropriate one for the type that this cast is converting to. 
+ switch (toAtomic->basicType) { + case AtomicType::TYPE_BOOL: { + bool bv[ISPC_MAX_NVEC]; + constExpr->AsBool(bv, forceVarying); + return new ConstExpr(toType, bv, pos); + } + case AtomicType::TYPE_INT32: { + int32_t iv[ISPC_MAX_NVEC]; + constExpr->AsInt32(iv, forceVarying); + return new ConstExpr(toType, iv, pos); + } + case AtomicType::TYPE_UINT32: { + uint32_t uv[ISPC_MAX_NVEC]; + constExpr->AsUInt32(uv, forceVarying); + return new ConstExpr(toType, uv, pos); + } + case AtomicType::TYPE_FLOAT: { + float fv[ISPC_MAX_NVEC]; + constExpr->AsFloat(fv, forceVarying); + return new ConstExpr(toType, fv, pos); + } + case AtomicType::TYPE_INT64: { + int64_t iv[ISPC_MAX_NVEC]; + constExpr->AsInt64(iv, forceVarying); + return new ConstExpr(toType, iv, pos); + } + case AtomicType::TYPE_UINT64: { + uint64_t uv[ISPC_MAX_NVEC]; + constExpr->AsUInt64(uv, forceVarying); + return new ConstExpr(toType, uv, pos); + } + case AtomicType::TYPE_DOUBLE: { + double dv[ISPC_MAX_NVEC]; + constExpr->AsDouble(dv, forceVarying); + return new ConstExpr(toType, dv, pos); + } + default: + FATAL("unimplemented"); + } + return this; + +} + + +void +TypeCastExpr::Print() const { + printf("[%s] type cast (", GetType()->GetString().c_str()); + expr->Print(); + printf(")"); + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// ReferenceExpr + +ReferenceExpr::ReferenceExpr(Expr *e, SourcePos p) + : Expr(p) { + expr = e; +} + + +llvm::Value * +ReferenceExpr::GetValue(FunctionEmitContext *ctx) const { + ctx->SetDebugPos(pos); + return expr ? expr->GetLValue(ctx) : NULL; +} + + +Symbol * +ReferenceExpr::GetBaseSymbol() const { + return expr ? 
expr->GetBaseSymbol() : NULL; +} + + +const Type * +ReferenceExpr::GetType() const { + if (!expr) + return NULL; + + const Type *type = expr->GetType(); + if (!type) + return NULL; + + return new ReferenceType(type, false); +} + + +Expr * +ReferenceExpr::Optimize() { + if (expr) + expr = expr->Optimize(); + if (expr == NULL) + return NULL; + + return this; +} + + +Expr * +ReferenceExpr::TypeCheck() { + if (expr != NULL) + expr = expr->TypeCheck(); + if (expr == NULL) + return NULL; + return this; +} + + +void +ReferenceExpr::Print() const { + if (expr == NULL || GetType() == NULL) + return; + + printf("[%s] &(", GetType()->GetString().c_str()); + expr->Print(); + printf(")"); + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// DereferenceExpr + +DereferenceExpr::DereferenceExpr(Expr *e, SourcePos p) + : Expr(p) { + expr = e; +} + + +llvm::Value * +DereferenceExpr::GetValue(FunctionEmitContext *ctx) const { + if (expr == NULL) + return NULL; + llvm::Value *ptr = expr->GetValue(ctx); + if (ptr == NULL) + return NULL; + const Type *type = GetType(); + if (type == NULL) + return NULL; + + ctx->SetDebugPos(pos); + return ctx->LoadInst(ptr, type, "reference_load"); +} + + +llvm::Value * +DereferenceExpr::GetLValue(FunctionEmitContext *ctx) const { + if (expr == NULL) + return NULL; + return expr->GetValue(ctx); +} + + +Symbol * +DereferenceExpr::GetBaseSymbol() const { + return expr ? expr->GetBaseSymbol() : NULL; +} + + +const Type * +DereferenceExpr::GetType() const { + return (expr && expr->GetType()) ? 
expr->GetType()->GetReferenceTarget() : + NULL; +} + + +Expr * +DereferenceExpr::TypeCheck() { + if (expr != NULL) + expr = expr->TypeCheck(); + if (expr == NULL) + return NULL; + return this; +} + + +Expr * +DereferenceExpr::Optimize() { + if (expr != NULL) + expr = expr->Optimize(); + if (expr == NULL) + return NULL; + return this; +} + + +void +DereferenceExpr::Print() const { + if (expr == NULL || GetType() == NULL) + return; + + printf("[%s] *(", GetType()->GetString().c_str()); + expr->Print(); + printf(")"); + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// SymbolExpr + +SymbolExpr::SymbolExpr(Symbol *s, SourcePos p) + : Expr(p) { + symbol = s; +} + + +llvm::Value * +SymbolExpr::GetValue(FunctionEmitContext *ctx) const { + // storagePtr may be NULL due to an earlier compilation error + if (!symbol || !symbol->storagePtr) + return NULL; + ctx->SetDebugPos(pos); + return ctx->LoadInst(symbol->storagePtr, GetType(), symbol->name.c_str()); +} + + +llvm::Value * +SymbolExpr::GetLValue(FunctionEmitContext *ctx) const { + if (symbol == NULL) + return NULL; + ctx->SetDebugPos(pos); + return symbol->storagePtr; +} + + +Symbol * +SymbolExpr::GetBaseSymbol() const { + return symbol; +} + + +const Type * +SymbolExpr::GetType() const { + return symbol ? 
symbol->type : NULL; +} + + +Expr * +SymbolExpr::TypeCheck() { + return this; +} + + +Expr * +SymbolExpr::Optimize() { + if (symbol == NULL) + return NULL; + else if (symbol->constValue != NULL) { + assert(GetType()->IsConstType()); + return symbol->constValue; + } + else + return this; +} + + +void +SymbolExpr::Print() const { + if (symbol == NULL || GetType() == NULL) + return; + + printf("[%s] sym: (%s)", GetType()->GetString().c_str(), + symbol->name.c_str()); + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// FunctionSymbolExpr + +FunctionSymbolExpr::FunctionSymbolExpr(std::vector *candidates, + SourcePos p) + : Expr(p) { + matchingFunc = NULL; + candidateFunctions = candidates; +} + + +const Type * +FunctionSymbolExpr::GetType() const { + return matchingFunc ? matchingFunc->type : NULL; +} + + +llvm::Value * +FunctionSymbolExpr::GetValue(FunctionEmitContext *ctx) const { + assert("!should not call FunctionSymbolExpr::GetValue()"); + return NULL; +} + + +Symbol * +FunctionSymbolExpr::GetBaseSymbol() const { + return matchingFunc; +} + + +Expr * +FunctionSymbolExpr::TypeCheck() { + return this; +} + + +Expr * +FunctionSymbolExpr::Optimize() { + return this; +} + + +void +FunctionSymbolExpr::Print() const { + if (!matchingFunc || !GetType()) + return; + + printf("[%s] fun sym (%s)", GetType()->GetString().c_str(), + matchingFunc->name.c_str()); + pos.Print(); +} + + +/////////////////////////////////////////////////////////////////////////// +// SyncExpr + +const Type * +SyncExpr::GetType() const { + return AtomicType::Void; +} + + +llvm::Value * +SyncExpr::GetValue(FunctionEmitContext *ctx) const { + ctx->SetDebugPos(pos); + std::vector noArg; + llvm::Function *fsync = m->module->getFunction("ISPCSync"); + if (fsync == NULL) { + FATAL("Couldn't find ISPCSync declaration?!"); + return NULL; + } + + return ctx->CallInst(fsync, noArg, ""); +} + + +void +SyncExpr::Print() const { + printf("sync"); + pos.Print(); 
+} + + +Expr * +SyncExpr::TypeCheck() { + return this; +} + + +Expr * +SyncExpr::Optimize() { + return this; +} diff --git a/expr.h b/expr.h new file mode 100644 index 00000000..ae59b101 --- /dev/null +++ b/expr.h @@ -0,0 +1,543 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +/** @file expr.h + @brief Expr abstract base class and expression implementations +*/ + +#ifndef ISPC_EXPR_H +#define ISPC_EXPR_H 1 + +#include "ispc.h" + +class FunctionSymbolExpr; + +/** @brief Expr is the abstract base class that defines the interface that + all expression types must implement. + */ +class Expr : public ASTNode { +public: + Expr(SourcePos p) : ASTNode(p) { } + + /** This is the main method for Expr implementations to implement. It + should call methods in the FunctionEmitContext to emit LLVM IR + instructions to the current basic block in order to generate an + llvm::Value that represents the expression's value. */ + virtual llvm::Value *GetValue(FunctionEmitContext *ctx) const = 0; + + /** For expressions that can provide an lvalue (e.g. array indexing), + this function should emit IR that computes the expression's lvalue + and returns the corresponding llvm::Value. Expressions that can't + provide an lvalue should leave this unimplemented; the default + implementation returns NULL. */ + virtual llvm::Value *GetLValue(FunctionEmitContext *ctx) const; + + /** Returns the Type of the expression. */ + virtual const Type *GetType() const = 0; + + /** For expressions that have values based on a symbol (e.g. regular + symbol references, array indexing, etc.), this returns a pointer to + that symbol. */ + virtual Symbol *GetBaseSymbol() const; + + /** If this is a constant expression that can be converted to a + constant of the given type, this method should return the + corresponding llvm::Constant value. Otherwise it should return + NULL. */ + virtual llvm::Constant *GetConstant(const Type *type) const; + + /** This method should perform early optimizations of the expression + (constant folding, etc.) and return a pointer to the resulting + expression. If an error is encountered during optimization, NULL + should be returned. 
*/ + virtual Expr *Optimize() = 0; + + /** This method should perform type checking of the expression and + return a pointer to the resulting expression. If an error is + encountered, NULL should be returned. */ + virtual Expr *TypeCheck() = 0; + + /** Prints the expression to standard output (used for debugging). */ + virtual void Print() const = 0; + + /** This method tries to convert the expression to the given type. In + the event of failure, if the failureOk parameter is true, then no + error is issued. If failureOk is false, then an error is printed + that incorporates the given error message string. In either + failure case, NULL is returned. */ + Expr *TypeConv(const Type *type, const char *errorMsgBase = NULL, + bool failureOk = false); +}; + + +/** @brief Unary expression */ +class UnaryExpr : public Expr { +public: + enum Op { + PreInc, ///< Pre-increment + PreDec, ///< Pre-decrement + PostInc, ///< Post-increment + PostDec, ///< Post-decrement + Negate, ///< Negation + LogicalNot, ///< Logical not + BitNot, ///< Bit not + }; + + UnaryExpr(Op op, Expr *expr, SourcePos pos); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + void Print() const; + Expr *Optimize(); + Expr *TypeCheck(); + +private: + const Op op; + Expr *expr; +}; + + +/** @brief Binary expression */ +class BinaryExpr : public Expr { +public: + enum Op { + Add, ///< Addition + Sub, ///< Subtraction + Mul, ///< Multiplication + Div, ///< Division + Mod, ///< Modulus + Shl, ///< Shift left + Shr, ///< Shift right + + Lt, ///< Less than + Gt, ///< Greater than + Le, ///< Less than or equal + Ge, ///< Greater than or equal + Equal, ///< Equal + NotEqual, ///< Not equal + + BitAnd, ///< Bitwise AND + BitXor, ///< Bitwise XOR + BitOr, ///< Bitwise OR + LogicalAnd, ///< Logical AND + LogicalOr, ///< Logical OR + + Comma, ///< Comma operator + }; + + BinaryExpr(Op o, Expr *a, Expr *b, SourcePos p); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + 
const Type *GetType() const; + void Print() const; + + Expr *Optimize(); + Expr *TypeCheck(); + +private: + const Op op; + Expr *arg0, *arg1; +}; + + +/** @brief Assignment expression */ +class AssignExpr : public Expr { +public: + enum Op { + Assign, ///< Regular assignment + MulAssign, ///< *= assignment + DivAssign, ///< /= assignment + ModAssign, ///< %= assignment + AddAssign, ///< += assignment + SubAssign, ///< -= assignment + ShlAssign, ///< <<= assignment + ShrAssign, ///< >>= assignment + AndAssign, ///< &= assignment + XorAssign, ///< ^= assignment + OrAssign, ///< |= assignment + }; + + AssignExpr(Op o, Expr *a, Expr *b, SourcePos p); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + void Print() const; + + Expr *Optimize(); + Expr *TypeCheck(); + +private: + const Op op; + Expr *lvalue, *rvalue; +}; + + +/** @brief Selection expression, corresponding to "test ? a : b". + + Returns the value of "a" or "b", depending on the value of "test". +*/ +class SelectExpr : public Expr { +public: + SelectExpr(Expr *test, Expr *a, Expr *b, SourcePos p); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + void Print() const; + + Expr *Optimize(); + Expr *TypeCheck(); + +private: + Expr *test, *expr1, *expr2; +}; + + +/** @brief A list of expressions. + + These are mostly used for representing curly-brace delimited + initializers for initializers for complex types and for representing + the arguments passed to a function call. + */ +class ExprList : public Expr { +public: + ExprList(SourcePos p) : Expr(p) { } + ExprList(Expr *e, SourcePos p) : Expr(p) { exprs.push_back(e); } + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + void Print() const; + llvm::Constant *GetConstant(const Type *type) const; + ExprList *Optimize(); + ExprList *TypeCheck(); + + std::vector exprs; +}; + + +/** @brief Expression representing a function call. 
+ */ +class FunctionCallExpr : public Expr { +public: + FunctionCallExpr(Expr *func, ExprList *args, SourcePos p, bool isLaunch); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + void Print() const; + + Expr *Optimize(); + Expr *TypeCheck(); + +private: + Expr *func; + ExprList *args; + bool isLaunch; + + void resolveFunctionOverloads(); + bool tryResolve(bool (*matchFunc)(Expr *, const Type *)); +}; + + +/** @brief Expression representing indexing into something with an integer + offset. + + This is used for both array indexing and indexing into VectorTypes. +*/ +class IndexExpr : public Expr { +public: + IndexExpr(Expr *arrayOrVector, Expr *index, SourcePos p); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + llvm::Value *GetLValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + Symbol *GetBaseSymbol() const; + void Print() const; + + Expr *Optimize(); + Expr *TypeCheck(); + +private: + Expr *arrayOrVector, *index; +}; + + +/** @brief Expression representing member selection ("foo.bar"). + */ +class MemberExpr : public Expr { +public: + MemberExpr(Expr *expr, const char *identifier, SourcePos pos, + SourcePos identifierPos); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + llvm::Value *GetLValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + Symbol *GetBaseSymbol() const; + void Print() const; + Expr *Optimize(); + Expr *TypeCheck(); + +private: + std::string getCandidateNearMatches() const; + int getElementNumber() const; + + Expr *expr; + std::string identifier; + const SourcePos identifierPos; +}; + + +/** @brief Expression representing a compile-time constant value. + + This class can currently represent compile-time constants of anything + that is an AtomicType; for anything more complex, we don't currently + have a representation of a compile-time constant that can be further + reasoned about. 
+ */ +class ConstExpr : public Expr { +public: + /** Create a ConstExpr from a uniform int32 value */ + ConstExpr(const Type *t, int32_t i, SourcePos p); + /** Create a ConstExpr from a varying int32 value */ + ConstExpr(const Type *t, int32_t *i, SourcePos p); + /** Create a ConstExpr from a uniform uint32 value */ + ConstExpr(const Type *t, uint32_t u, SourcePos p); + /** Create a ConstExpr from a varying uint32 value */ + ConstExpr(const Type *t, uint32_t *u, SourcePos p); + /** Create a ConstExpr from a uniform float value */ + ConstExpr(const Type *t, float f, SourcePos p); + /** Create a ConstExpr from a varying float value */ + ConstExpr(const Type *t, float *f, SourcePos p); + /** Create a ConstExpr from a uniform double value */ + ConstExpr(const Type *t, double d, SourcePos p); + /** Create a ConstExpr from a varying double value */ + ConstExpr(const Type *t, double *d, SourcePos p); + /** Create a ConstExpr from a uniform int64 value */ + ConstExpr(const Type *t, int64_t i, SourcePos p); + /** Create a ConstExpr from a varying int64 value */ + ConstExpr(const Type *t, int64_t *i, SourcePos p); + /** Create a ConstExpr from a uniform uint64 value */ + ConstExpr(const Type *t, uint64_t i, SourcePos p); + /** Create a ConstExpr from a varying uint64 value */ + ConstExpr(const Type *t, uint64_t *i, SourcePos p); + /** Create a ConstExpr from a uniform bool value */ + ConstExpr(const Type *t, bool b, SourcePos p); + /** Create a ConstExpr from a varying bool value */ + ConstExpr(const Type *t, bool *b, SourcePos p); + /** Create a ConstExpr of the same type as the given old ConstExpr, + with values given by the "vales" parameter. 
*/ + ConstExpr(ConstExpr *old, double *values); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + void Print() const; + llvm::Constant *GetConstant(const Type *type) const; + + Expr *TypeCheck(); + Expr *Optimize(); + + /** Return the ConstExpr's values as booleans, doing type conversion + from the actual type if needed. If forceVarying is true, then type + convert to 'varying' so as to always return a number of values + equal to the target vector width into the given pointer. */ + int AsBool(bool *, bool forceVarying = false) const; + + /** Return the ConstExpr's values as int32s, doing type conversion + from the actual type if needed. If forceVarying is true, then type + convert to 'varying' so as to always return a number of values + equal to the target vector width into the given pointer. */ + int AsInt32(int32_t *, bool forceVarying = false) const; + + /** Return the ConstExpr's values as uint32s, doing type conversion + from the actual type if needed. If forceVarying is true, then type + convert to 'varying' so as to always return a number of values + equal to the target vector width into the given pointer. */ + int AsUInt32(uint32_t *, bool forceVarying = false) const; + + /** Return the ConstExpr's values as floats, doing type conversion + from the actual type if needed. If forceVarying is true, then type + convert to 'varying' so as to always return a number of values + equal to the target vector width into the given pointer. */ + int AsFloat(float *, bool forceVarying = false) const; + + /** Return the ConstExpr's values as int64s, doing type conversion + from the actual type if needed. If forceVarying is true, then type + convert to 'varying' so as to always return a number of values + equal to the target vector width into the given pointer. */ + int AsInt64(int64_t *, bool forceVarying = false) const; + + /** Return the ConstExpr's values as uint64s, doing type conversion + from the actual type if needed. 
If forceVarying is true, then type + convert to 'varying' so as to always return a number of values + equal to the target vector width into the given pointer. */ + int AsUInt64(uint64_t *, bool forceVarying = false) const; + + /** Return the ConstExpr's values as doubles, doing type conversion + from the actual type if needed. If forceVarying is true, then type + convert to 'varying' so as to always return a number of values + equal to the target vector width into the given pointer. */ + int AsDouble(double *, bool forceVarying = false) const; + + /** Return the number of values in the ConstExpr; should be either 1, + if it has uniform type, or the target's vector width if it's + varying. */ + int Count() const; + +private: + const AtomicType *type; + union { + int32_t int32Val[ISPC_MAX_NVEC]; + uint32_t uint32Val[ISPC_MAX_NVEC]; + bool boolVal[ISPC_MAX_NVEC]; + float floatVal[ISPC_MAX_NVEC]; + double doubleVal[ISPC_MAX_NVEC]; + int64_t int64Val[ISPC_MAX_NVEC]; + uint64_t uint64Val[ISPC_MAX_NVEC]; + }; +}; + + +/** @brief Expression representing a type cast of the given expression to a + probably-different type. */ +class TypeCastExpr : public Expr { +public: + TypeCastExpr(const Type *t, Expr *e, SourcePos p); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + void Print() const; + Expr *TypeCheck(); + Expr *Optimize(); + +private: + const Type *type; + Expr *expr; +}; + + +/** @brief Expression that represents taking a reference of a (non-reference) + variable. */ +class ReferenceExpr : public Expr { +public: + ReferenceExpr(Expr *e, SourcePos p); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + Symbol *GetBaseSymbol() const; + void Print() const; + Expr *TypeCheck(); + Expr *Optimize(); + +private: + Expr *expr; +}; + + +/** @brief Expression that represents dereferencing a reference to get its + value. 
*/ +class DereferenceExpr : public Expr { +public: + DereferenceExpr(Expr *e, SourcePos p); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + llvm::Value *GetLValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + Symbol *GetBaseSymbol() const; + void Print() const; + Expr *TypeCheck(); + Expr *Optimize(); + +private: + Expr *expr; +}; + + +/** @brief Expression representing a symbol reference in the program */ +class SymbolExpr : public Expr { +public: + SymbolExpr(Symbol *s, SourcePos p); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + llvm::Value *GetLValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + Symbol *GetBaseSymbol() const; + Expr *TypeCheck(); + Expr *Optimize(); + void Print() const; + +private: + Symbol *symbol; +}; + + +/** @brief Expression representing a function symbol in the program (generally + used for a function call). + */ +class FunctionSymbolExpr : public Expr { +public: + FunctionSymbolExpr(std::vector *candidateFunctions, + SourcePos pos); + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + Symbol *GetBaseSymbol() const; + Expr *TypeCheck(); + Expr *Optimize(); + void Print() const; + +private: + friend class FunctionCallExpr; + + /** All of the functions with the name given in the function call; + there may be more then one, in which case we need to resolve which + overload is the best match. */ + std::vector *candidateFunctions; + + /** The actual matching function found after overload resolution; this + value is set by FunctionCallExpr::resolveFunctionOverloads() */ + Symbol *matchingFunc; +}; + + +/** @brief A sync statement in the program (waits for all launched tasks before + proceeding). 
*/ +class SyncExpr : public Expr { +public: + SyncExpr(SourcePos p) : Expr(p) { } + + llvm::Value *GetValue(FunctionEmitContext *ctx) const; + const Type *GetType() const; + Expr *TypeCheck(); + Expr *Optimize(); + void Print() const; +}; + +#endif // ISPC_EXPR_H diff --git a/failing_tests/max-uint-1.ispc b/failing_tests/max-uint-1.ispc new file mode 100644 index 00000000..d86126e6 --- /dev/null +++ b/failing_tests/max-uint-1.ispc @@ -0,0 +1,19 @@ +static float float4(uniform float a, uniform float b, uniform float c, + uniform float d) { + float ret = 0; + for (uniform int i = 0; i < programCount; i += 4) { + ret = insert(ret, i + 0, a); + ret = insert(ret, i + 1, b); + ret = insert(ret, i + 2, c); + ret = insert(ret, i + 3, d); + } + return ret; +} + +export float f_f(float a) { + unsigned int i = (unsigned int)a; + return max((unsigned int)2, i); +} + +export float result() { return float4(2,2,3,4); } + diff --git a/failing_tests/max-uint.ispc b/failing_tests/max-uint.ispc new file mode 100644 index 00000000..145aa707 --- /dev/null +++ b/failing_tests/max-uint.ispc @@ -0,0 +1,8 @@ + +export float f_f(float a) { + unsigned int i = (unsigned int)a; + return max((unsigned int)10, i); +} + +export float result() { return 10; } + diff --git a/failing_tests/min-uint-1.ispc b/failing_tests/min-uint-1.ispc new file mode 100644 index 00000000..018b20d6 --- /dev/null +++ b/failing_tests/min-uint-1.ispc @@ -0,0 +1,19 @@ +static float float4(uniform float a, uniform float b, uniform float c, + uniform float d) { + float ret = 0; + for (uniform int i = 0; i < programCount; i += 4) { + ret = insert(ret, i + 0, a); + ret = insert(ret, i + 1, b); + ret = insert(ret, i + 2, c); + ret = insert(ret, i + 3, d); + } + return ret; +} + +export float f_f(float a) { + unsigned int i = (unsigned int)a; + return min((unsigned int)2, i); +} + +export float result() { return float4(1,2,2,2); } + diff --git a/failing_tests/min-uint-2.ispc b/failing_tests/min-uint-2.ispc new file mode 100644 
index 00000000..5b5f0539 --- /dev/null +++ b/failing_tests/min-uint-2.ispc @@ -0,0 +1,19 @@ +static float float4(uniform float a, uniform float b, uniform float c, + uniform float d) { + float ret = 0; + for (uniform int i = 0; i < programCount; i += 4) { + ret = insert(ret, i + 0, a); + ret = insert(ret, i + 1, b); + ret = insert(ret, i + 2, c); + ret = insert(ret, i + 3, d); + } + return ret; +} + +export float f_f(float a) { + unsigned int i = (unsigned int)a; + return min((unsigned int)20, i); +} + +export float result() { return float4(1,2,3,4); } + diff --git a/failing_tests/struct-array-assign.ispc b/failing_tests/struct-array-assign.ispc new file mode 100644 index 00000000..8dc09543 --- /dev/null +++ b/failing_tests/struct-array-assign.ispc @@ -0,0 +1,11 @@ + +struct Foo { + float f; +}; + + +export float foo(Foo f[], int i, uniform int j) { + Foo x = f[i]; + return x.f; +} + diff --git a/ispc.cpp b/ispc.cpp new file mode 100644 index 00000000..506846f0 --- /dev/null +++ b/ispc.cpp @@ -0,0 +1,137 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/** @file ispc.cpp + @brief ispc global definitions +*/ + +#include "ispc.h" +#include "module.h" +#include "util.h" +#include +#ifdef ISPC_IS_WINDOWS +#include +#include +#endif +#include +#include +#ifndef LLVM_2_8 +#include +#endif +#include +#include + +Globals *g; +Module *m; + +/////////////////////////////////////////////////////////////////////////// +// Target + +Target::Target() { + arch = "x86-64"; + cpu = "nehalem"; + isa = SSE4; + nativeVectorWidth = 4; + vectorWidth = 4; +} + +/////////////////////////////////////////////////////////////////////////// +// Opt + +Opt::Opt() { + level = 1; + fastMath = false; + disableBlendedMaskedStores = false; + disableCoherentControlFlow = false; + disableUniformControlFlow = false; + disableGatherScatterOptimizations = false; + disableMaskedStoreToStore = false; + disableGatherScatterFlattening = false; + disableUniformMemoryOptimizations = false; + disableMaskedStoreOptimizations = false; +} + +/////////////////////////////////////////////////////////////////////////// +// Globals + +Globals::Globals() { + mathLib = Globals::Math_ISPC; + + includeStdlib = true; + runCPP = true; + debugPrint = false; + disableWarnings = false; + 
emitPerfWarnings = true; + emitInstrumentation = false; + generateDebuggingSymbols = false; + + ctx = new llvm::LLVMContext; + +#ifdef ISPC_IS_WINDOWS + _getcwd(currentDirectory, sizeof(currentDirectory)); +#else + getcwd(currentDirectory, sizeof(currentDirectory)); +#endif +} + +/////////////////////////////////////////////////////////////////////////// +// ASTNode + +ASTNode::~ASTNode() { +} + +/////////////////////////////////////////////////////////////////////////// +// SourcePos + +SourcePos::SourcePos(const char *n, int l, int c) { + name = n ? n : m->module->getModuleIdentifier().c_str(); + first_line = last_line = l; + first_column = last_column = c; +} + +llvm::DIFile SourcePos::GetDIFile() const { +#ifdef LLVM_2_8 + return llvm::DIFile(); +#else + std::string directory, filename; + GetDirectoryAndFileName(g->currentDirectory, name, &directory, &filename); + return m->diBuilder->createFile(filename, directory); +#endif // LLVM_2_8 +} + + +void +SourcePos::Print() const { + printf(" @ [%s:%d.%d - %d.%d] ", name, first_line, first_column, + last_line, last_column); +} diff --git a/ispc.h b/ispc.h new file mode 100644 index 00000000..2c4ec158 --- /dev/null +++ b/ispc.h @@ -0,0 +1,313 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+ + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/** @file ispc.h + @brief Main ispc.header file +*/ + +#ifndef ISPC_H +#define ISPC_H + +#if defined(_WIN32) || defined(_WIN64) +#define ISPC_IS_WINDOWS +#elif defined(__linux__) +#define ISPC_IS_LINUX +#elif defined(__APPLE__) +#define ISPC_IS_APPLE +#endif + +#include +#include +#include +#include + +/** @def ISPC_MAX_NVEC maximum vector size of any of the compliation + targets. + */ +#define ISPC_MAX_NVEC 16 + +// Forward declarations of a number of widely-used LLVM types +namespace llvm { + class BasicBlock; + class Constant; + class ConstantValue; + class DIBuilder; + class DIDescriptor; + class DIFile; + class DIType; + class Function; + class FunctionType; + class LLVMContext; + class Module; + class Type; + class Value; +} + +class ArrayType; +class AtomicType; +class DeclSpecs; +class Declaration; +class Declarator; +class FunctionEmitContext; +class Expr; +class ExprList; +class FunctionType; +class GatherBuffer; +class Module; +class Stmt; +class Symbol; +class SymbolTable; +class Type; + +/** @brief Representation of a range of positions in a source file. + + This class represents a range of characters in a source file + (e.g. 
those that span a token's definition), from starting line and + column to ending line and column. (These values are tracked by the + lexing code). Both lines and columns are counted starting from one. + */ +struct SourcePos { + SourcePos(const char *n = NULL, int l = 0, int c = 0); + + const char *name; + int first_line; + int first_column; + int last_line; + int last_column; + + /** Prints the filename and line/column range to standard output. */ + void Print() const; + + /** Returns a LLVM DIFile object that represents the SourcePos's file */ + llvm::DIFile GetDIFile() const; +}; + + +/** @brief Abstract base class for nodes in the abstract syntax tree (AST). + + This class defines a basic interface that all abstract syntax tree + (AST) nodes must implement. The base classes for both expressions + (Expr) and statements (Stmt) inherit from this class. +*/ +class ASTNode { +public: + ASTNode(SourcePos p) : pos(p) { } + virtual ~ASTNode(); + + /** The Optimize() method should perform any appropriate early-stage + optimizations on the node (e.g. constant folding). The caller + should use the returned ASTNode * in place of the original node. + This method may return NULL if an error is encountered during + optimization. */ + virtual ASTNode *Optimize() = 0; + + /** Type checking should be performed by the node when this method is + called. In the event of an error, a NULL value may be returned. + As with ASTNode::Optimize(), the caller should store the returned + pointer in place of the original ASTNode *. */ + virtual ASTNode *TypeCheck() = 0; + + /** All AST nodes must track the file position where they are + defined. */ + const SourcePos pos; +}; + +/** @brief Structure that defines a compilation target + + This structure defines a compilation target for the ispc compiler. +*/ +struct Target { + Target(); + + /** Enumerant giving the instruction sets that the compiler can + target. */ + enum ISA { SSE2, SSE4, AVX }; + + /** Instruction set being compiled to. 
*/ + ISA isa; + + /** Target system architecture. (e.g. "x86-64", "x86"). */ + std::string arch; + + /** Target CPU. (e.g. "corei7", "corei7-avx", ..) */ + std::string cpu; + + /** Native vector width of the vector instruction set. Note that this + value is directly derived from the ISA Being used (e.g. it's 4 for + SSE, 8 for AVX, etc.) */ + int nativeVectorWidth; + + /** Actual vector width currently being compiled to. This may be an + integer multiple of the native vector width, for example if we're + "doubling up" and compiling 8-wide on a 4-wide SSE system. */ + int vectorWidth; +}; + +/** @brief Structure that collects optimization options + + This structure collects all of the options related to optimization of + generated code. +*/ +struct Opt { + Opt(); + + /** Optimization level. Currently, the only valid values are 0, + indicating essentially no optimization, and 1, indicating as much + optimization as possible. */ + int level; + + /** Indicates whether "fast and loose" numerically unsafe optimizations + should be performed. This is false by default. */ + bool fastMath; + + /** On targets that don't have a masked store instruction but do have a + blending instruction, by default, we simulate masked stores by + loading the old value, blending, and storing the result. This can + potentially be unsafe in multi-threaded code, in that it writes to + locations that aren't supposed to be written to. Setting this + value to true disables this work-around, and instead implements + masked stores by 'scalarizing' them, so that we iterate over the + ISIMD lanes and do a scalar write for the ones that are running. */ + bool disableBlendedMaskedStores; + + /** Disables the 'coherent control flow' constructs in the + language. (e.g. this causes "cif" statements to be demoted to "if" + statements.) This is likely only useful for measuring the impact + of coherent control flow. 
*/ + bool disableCoherentControlFlow; + + /** Disables uniform control flow optimizations (e.g. this changes an + "if" statement with a uniform condition to have a varying + condition). This is likely only useful for measuring the impact of + uniform control flow. */ + bool disableUniformControlFlow; + + /** Disables the backend optimizations related to gather/scatter + (e.g. transforming gather from sequential locations to an unaligned + load, etc.) This is likely only useful for measuring the impact of + these optimizations. */ + bool disableGatherScatterOptimizations; + + /** Disables the optimization that demotes masked stores to regular + stores when the store is happening at the same control flow level + where the variable was declared. This is likely only useful for + measuring the impact of this optimization. */ + bool disableMaskedStoreToStore; + + /** Disables the optimization that detects when the execution mask is + all on and emits code for gathers and scatters that doesn't loop + over the SIMD lanes but just does the scalar loads and stores + directly. */ + bool disableGatherScatterFlattening; + + /** Disables the optimizations that detect when arrays are being + indexed with 'uniform' values and issue scalar loads/stores rather + than gathers/scatters. This is likely only useful for measuring + the impact of this optimization. */ + bool disableUniformMemoryOptimizations; + + /** Disables optimizations for masked stores: masked stores with the + mask all on are transformed to regular stores, and masked stores + with the mask are all off are removed (which in turn can allow + eliminating additional dead code related to computing the value + stored). This is likely only useful for measuring the impact of + this optimization. */ + bool disableMaskedStoreOptimizations; +}; + +/** @brief This structure collects together a number of global variables. 
+ + This structure collects a number of global variables that mostly + represent parameter settings for this compilation run. In particular, + none of these values should change after compilation befins; their + values are all set during command-line argument processing or very + early during the compiler's execution, before any files are parsed. + */ +struct Globals { + Globals(); + + /** Optimization option settings */ + Opt opt; + /** Compilation target information */ + Target target; + + /** There are a number of math libraries that can be used for + transcendentals and the like during program compilation. */ + enum MathLib { Math_ISPC, Math_ISPCFast, Math_SVML, Math_System }; + MathLib mathLib; + + /** Records whether the ispc standard library should be made available + to the program during compilations. (Default is true.) */ + bool includeStdlib; + + /** Indicates whether the C pre-processor should be run over the + program source before compiling it. (Default is true.) */ + bool runCPP; + + /** When \c true, voluminous debugging output will be printed during + ispc's execution. */ + bool debugPrint; + + /** Indicates whether all warning messages should be surpressed. */ + bool disableWarnings; + + /** Indicates whether additional warnings should be issued about + possible performance pitfalls. */ + bool emitPerfWarnings; + + /** Indicates whether calls should be emitted in the program to an + externally-defined program instrumentation function. (See the + "Instrumenting your ispc programs" section in the user's + manual.) */ + bool emitInstrumentation; + + /** Indicates whether ispc should generate debugging symbols for the + program in its output. */ + bool generateDebuggingSymbols; + + /** Global LLVMContext object */ + llvm::LLVMContext *ctx; + + /** Current working directory when the ispc compiler starts + execution. 
*/ + char currentDirectory[1024]; + + /** Arguments to pass along to the C pre-processor, if it is run on the + program before compilation. */ + std::vector cppArgs; +}; + +extern Globals *g; +extern Module *m; + +#endif // ISPC_H diff --git a/ispc.sln b/ispc.sln new file mode 100755 index 00000000..0f48203a --- /dev/null +++ b/ispc.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 11.00 +# Visual Studio 2010 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ispc", "ispc.vcxproj", "{9861F490-F516-480C-B63C-D62A77AFA9D5}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ispc_test", "ispc_test.vcxproj", "{92547BA8-BE86-4E78-8799-1D72A70E5831}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Win32 = Debug|Win32 + Release|Win32 = Release|Win32 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {9861F490-F516-480C-B63C-D62A77AFA9D5}.Debug|Win32.ActiveCfg = Debug|Win32 + {9861F490-F516-480C-B63C-D62A77AFA9D5}.Debug|Win32.Build.0 = Debug|Win32 + {9861F490-F516-480C-B63C-D62A77AFA9D5}.Release|Win32.ActiveCfg = Release|Win32 + {9861F490-F516-480C-B63C-D62A77AFA9D5}.Release|Win32.Build.0 = Release|Win32 + {92547BA8-BE86-4E78-8799-1D72A70E5831}.Debug|Win32.ActiveCfg = Debug|Win32 + {92547BA8-BE86-4E78-8799-1D72A70E5831}.Debug|Win32.Build.0 = Debug|Win32 + {92547BA8-BE86-4E78-8799-1D72A70E5831}.Release|Win32.ActiveCfg = Release|Win32 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/ispc.vcxproj b/ispc.vcxproj new file mode 100755 index 00000000..e06e1eff --- /dev/null +++ b/ispc.vcxproj @@ -0,0 +1,216 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + + + + + + + + + + + + + + + + + + + + %LLVM_INSTALL_DIR%\bin\clang -emit-llvm stdlib-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py stdlib-c.c > gen-bitcode-c.cpp + clang stdlib-c.c + 
%LLVM_INSTALL_DIR%\bin\clang -emit-llvm stdlib-c.c -c -o - | %LLVM_INSTALL_DIR%\bin\llvm-dis - | python bitcode2cpp.py stdlib-c.c > gen-bitcode-c.cpp + clang stdlib-c.c + gen-bitcode-c.cpp + gen-bitcode-c.cpp + + + + + + + + + + + + + + + + + + + + + + + + Document + cl /EP /TP %(Filename).ispc /DISPC=1 /DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp + gen-stdlib.cpp + cl /EP /TP %(Filename).ispc /DISPC=1 /DPI=3.1415926535 | python stdlib2cpp.py > gen-stdlib.cpp + gen-stdlib.cpp + Building gen-stdlib.cpp + Building gen-stdlib.cpp + + + + + Document + m4 stdlib.m4 stdlib-sse4.ll | python bitcode2cpp.py stdlib-sse4.ll > gen-bitcode-sse4.cpp + gen-bitcode-sse4.cpp + stdlib.m4;stdlib-sse.ll + m4 stdlib.m4 stdlib-sse4.ll | python bitcode2cpp.py stdlib-sse4.ll > gen-bitcode-sse4.cpp + gen-bitcode-sse4.cpp + stdlib.m4;stdlib-sse.ll + Building gen-bitcode-sse4.cpp + Building gen-bitcode-sse4.cpp + + + + + Document + m4 stdlib.m4 stdlib-sse4x2.ll | python bitcode2cpp.py stdlib-sse4x2.ll > gen-bitcode-sse4x2.cpp + gen-bitcode-sse4x2.cpp + stdlib.m4;stdlib-sse.ll + m4 stdlib.m4 stdlib-sse4x2.ll | python bitcode2cpp.py stdlib-sse4x2.ll > gen-bitcode-sse4x2.cpp + gen-bitcode-sse4x2.cpp + stdlib.m4;stdlib-sse.ll + Building gen-bitcode-sse4x2.cpp + Building gen-bitcode-sse4x2.cpp + + + + + Document + m4 stdlib.m4 stdlib-sse2.ll | python bitcode2cpp.py stdlib-sse2.ll > gen-bitcode-sse2.cpp + gen-bitcode-sse2.cpp + stdlib.m4;stdlib-sse.ll + m4 stdlib.m4 stdlib-sse2.ll | python bitcode2cpp.py stdlib-sse2.ll > gen-bitcode-sse2.cpp + gen-bitcode-sse2.cpp + stdlib.m4;stdlib-sse.ll + Building gen-bitcode-sse2.cpp + Building gen-bitcode-sse2.cpp + + + + + Document + m4 stdlib.m4 stdlib-avx.ll | python bitcode2cpp.py stdlib-avx.ll > gen-bitcode-avx.cpp + gen-bitcode-avx.cpp + stdlib.m4;stdlib-sse.ll + m4 stdlib.m4 stdlib-avx.ll | python bitcode2cpp.py stdlib-avx.ll > gen-bitcode-avx.cpp + gen-bitcode-avx.cpp + stdlib.m4;stdlib-sse.ll + Building gen-bitcode-avx.cpp + Building 
gen-bitcode-avx.cpp + + + + + Document + flex -t lex.ll > lex.cc + lex.cc + flex -t lex.ll > lex.cc + lex.cc + ispc.h;decl.h;parse.hh;sym.h + ispc.h;decl.h;parse.hh;sym.h + + + Document + bison -d -v -t -o parse.cc parse.yy + parse.cc;parse.h + bison -d -v -t -o parse.cc parse.yy + parse.cc;parse.h + ispc.h;type.h;decl.h;expr.h;sym.h;stmt.h + ispc.h;type.h;decl.h;expr.h;sym.h;stmt.h + Running bison on parse.yy + Running bison on parse.yy + + + + {9861F490-F516-480C-B63C-D62A77AFA9D5} + Win32Proj + ispc + + + + Application + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + true + + + false + + + + NotUsing + Level3 + Disabled + NOMINMAX + $(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories) + 4146;4800;4996;4355;4624 + + + Console + true + $(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories) + LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies) + + + + + Level3 + NotUsing + MaxSpeed + true + true + NOMINMAX + $(LLVM_INSTALL_DIR)\include;.;.\winstuff;%(AdditionalIncludeDirectories) + 4146;4800;4996;4355;4624 + + + Console + true + true + true + $(LLVM_INSTALL_DIR)\lib;%(AdditionalLibraryDirectories) + 
LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies) + + + + + + \ No newline at end of file diff --git a/ispc_test.cpp b/ispc_test.cpp new file mode 100644 index 00000000..3665aa42 --- /dev/null +++ b/ispc_test.cpp @@ -0,0 +1,313 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#define _CRT_SECURE_NO_WARNINGS + +#include +#include + +#ifdef ISPC_HAVE_SVML +#include +extern "C" { + extern __m128 __svml_sinf4(__m128); + extern __m128 __svml_cosf4(__m128); + extern __m128 __svml_sincosf4(__m128 *,__m128); + extern __m128 __svml_tanf4(__m128); + extern __m128 __svml_atanf4(__m128); + extern __m128 __svml_atan2f4(__m128, __m128); + extern __m128 __svml_expf4(__m128); + extern __m128 __svml_logf4(__m128); + extern __m128 __svml_powf4(__m128, __m128); +} +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef LLVM_2_8 +#include +#endif + +extern "C" { + void ISPCLaunch(void *, void *); + void ISPCSync(); +} + +void ISPCLaunch(void *func, void *data) { + typedef void (*TaskFuncType)(void *, int, int); + TaskFuncType tft = (TaskFuncType)(func); + tft(data, 0, 1); +} + + +void ISPCSync() { +} + +static void usage(int ret) { + fprintf(stderr, "usage: ispc_test\n"); + fprintf(stderr, "\t[-h/--help]\tprint help\n"); + fprintf(stderr, "\t\n"); + exit(ret); +} + +static void svml_missing() { + fprintf(stderr, "Program called unavailable SVML function!\n"); + exit(1); +} + +static bool lRunTest(const char *fn) { + llvm::LLVMContext *ctx = new llvm::LLVMContext; + +#ifdef LLVM_2_8 + std::string err; + llvm::MemoryBuffer *buf = llvm::MemoryBuffer::getFileOrSTDIN(fn, &err); + if 
(!buf) { + fprintf(stderr, "Unable to open file \"%s\": %s\n", fn, err.c_str()); + delete ctx; + return false; + } + std::string bcErr; + llvm::Module *module = llvm::ParseBitcodeFile(buf, *ctx, &bcErr); +#else + llvm::OwningPtr buf; + llvm::error_code err = llvm::MemoryBuffer::getFileOrSTDIN(fn, buf); + if (err) { + fprintf(stderr, "Unable to open file \"%s\": %s\n", fn, err.message().c_str()); + delete ctx; + return false; + } + std::string bcErr; + llvm::Module *module = llvm::ParseBitcodeFile(buf.get(), *ctx, &bcErr); +#endif + + if (!module) { + fprintf(stderr, "Bitcode reader failed for \"%s\": %s\n", fn, bcErr.c_str()); + delete ctx; + return false; + } + + std::string eeError; + llvm::ExecutionEngine *ee = llvm::ExecutionEngine::createJIT(module, &eeError); + if (!ee) { + fprintf(stderr, "Unable to create ExecutionEngine: %s\n", eeError.c_str()); + return false; + } + + llvm::Function *func; + if ((func = module->getFunction("ISPCLaunch")) != NULL) + ee->addGlobalMapping(func, (void *)ISPCLaunch); + if ((func = module->getFunction("ISPCSync")) != NULL) + ee->addGlobalMapping(func, (void *)ISPCSync); + if ((func = module->getFunction("putchar")) != NULL) + ee->addGlobalMapping(func, (void *)putchar); + if ((func = module->getFunction("printf")) != NULL) + ee->addGlobalMapping(func, (void *)printf); + if ((func = module->getFunction("fflush")) != NULL) + ee->addGlobalMapping(func, (void *)fflush); + if ((func = module->getFunction("sinf")) != NULL) + ee->addGlobalMapping(func, (void *)sinf); + if ((func = module->getFunction("cosf")) != NULL) + ee->addGlobalMapping(func, (void *)cosf); + if ((func = module->getFunction("tanf")) != NULL) + ee->addGlobalMapping(func, (void *)tanf); + if ((func = module->getFunction("atanf")) != NULL) + ee->addGlobalMapping(func, (void *)atanf); + if ((func = module->getFunction("atan2f")) != NULL) + ee->addGlobalMapping(func, (void *)atan2f); + if ((func = module->getFunction("powf")) != NULL) + ee->addGlobalMapping(func, (void 
*)powf); + if ((func = module->getFunction("expf")) != NULL) + ee->addGlobalMapping(func, (void *)expf); + if ((func = module->getFunction("logf")) != NULL) + ee->addGlobalMapping(func, (void *)logf); + +#ifdef ISPC_HAVE_SVML +#define DO_SVML(FUNC ,FUNCNAME) \ + if ((func = module->getFunction(FUNCNAME)) != NULL) \ + ee->addGlobalMapping(func, (void *)FUNC) +#else +#define DO_SVML(FUNC, FUNCNAME) \ + if ((func = module->getFunction(FUNCNAME)) != NULL) \ + ee->addGlobalMapping(func, (void *)svml_missing) +#endif + + DO_SVML(__svml_sinf4, "__svml_sinf4"); + DO_SVML(__svml_cosf4, "__svml_cosf4"); + DO_SVML(__svml_sincosf4, "__svml_sincosf4"); + DO_SVML(__svml_tanf4, "__svml_tanf4"); + DO_SVML(__svml_atanf4, "__svml_atanf4"); + DO_SVML(__svml_atan2f4, "__svml_atan2f4"); + DO_SVML(__svml_expf4, "__svml_expf4"); + DO_SVML(__svml_logf4, "__svml_logf4"); + DO_SVML(__svml_powf4, "__svml_powf4"); + + // figure out the vector width in the compiled code + func = module->getFunction("width"); + if (!func) { + fprintf(stderr, "No width() function found!\n"); + return false; + } + int width; + { + typedef int (*PFN)(); + PFN pfn = reinterpret_cast(ee->getPointerToFunction(func)); + width = pfn(); + assert(width == 4 || width == 8 || width == 12 || width == 16); + } + + // find the value that returns the desired result + func = module->getFunction("result"); + bool foundResult = (func != NULL); + float result[16]; + for (int i = 0; i < 16; ++i) + result[i] = 0; + bool ok = true; + if (foundResult) { + typedef void (*PFN)(float *); + PFN pfn = reinterpret_cast(ee->getPointerToFunction(func)); + pfn(result); + } + else + fprintf(stderr, "Warning: no result() function found.\n"); + + // try to find a function to run + float returned[16]; + for (int i = 0; i < 16; ++i) + returned[i] = 0; + float vfloat[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; + double vdouble[16] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}; + int vint[16] = { 2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32 }; + 
int vint2[16] = { 5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20}; + + if ((func = module->getFunction("f_v")) != NULL) { + typedef void (*PFN)(float *); + PFN pfn = reinterpret_cast(ee->getPointerToFunction(func)); + pfn(returned); + } + else if ((func = module->getFunction("f_f")) != NULL) { + typedef void (*PFN)(float *, float *); + PFN pfn = reinterpret_cast(ee->getPointerToFunction(func)); + llvm::verifyFunction(*func); + pfn(returned, vfloat); + } + else if ((func = module->getFunction("f_fu")) != NULL) { + typedef void (*PFN)(float *, float *, float fu); + PFN pfn = reinterpret_cast(ee->getPointerToFunction(func)); + llvm::verifyFunction(*func); + pfn(returned, vfloat, 5.); + } + else if ((func = module->getFunction("f_fi")) != NULL) { + typedef void (*PFN)(float *, float *, int *); + PFN pfn = reinterpret_cast(ee->getPointerToFunction(func)); + pfn(returned, vfloat, vint); + } + else if ((func = module->getFunction("f_du")) != NULL) { + typedef void (*PFN)(float *, double *, double); + PFN pfn = reinterpret_cast(ee->getPointerToFunction(func)); + pfn(returned, vdouble, 5.); + } + else if ((func = module->getFunction("f_duf")) != NULL) { + typedef void (*PFN)(float *, double *, float); + PFN pfn = reinterpret_cast(ee->getPointerToFunction(func)); + pfn(returned, vdouble, 5.f); + } + else if ((func = module->getFunction("f_di")) != NULL) { + typedef void (*PFN)(float *, double *, int *); + PFN pfn = reinterpret_cast(ee->getPointerToFunction(func)); + pfn(returned, vdouble, vint2); + } + else { + fprintf(stderr, "Unable to find runnable function in file \"%s\"\n", fn); + ok = false; + } + + // see if we got the right result + if (ok) { + if (foundResult) { + for (int i = 0; i < width; ++i) + if (returned[i] != result[i]) { + ok = false; + fprintf(stderr, "Test \"%s\" RETURNED %d: %g / %a EXPECTED %g / %a\n", + fn, i, returned[i], returned[i], result[i], result[i]); + } + } + else { + for (int i = 0; i < width; ++i) + fprintf(stderr, "Test \"%s\" returned %d: %g / 
%a\n", + fn, i, returned[i], returned[i]); + } + } + + delete ee; + delete ctx; + + return ok && foundResult; +} + +int main(int argc, char *argv[]) { + llvm::InitializeNativeTarget(); + + std::vector files; + for (int i = 1; i < argc; ++i) { + if (!strcmp(argv[i], "--help") || !strcmp(argv[i], "-h")) + usage(0); + else + files.push_back(argv[i]); + } + + int passes = 0, fails = 0; + for (unsigned int i = 0; i < files.size(); ++i) { + if (lRunTest(files[i])) ++passes; + else ++fails; + } + + if (fails > 0) + fprintf(stderr, "%d/%d tests passed\n", passes, passes+fails); + return fails > 0; +} diff --git a/ispc_test.vcxproj b/ispc_test.vcxproj new file mode 100755 index 00000000..bd7a6407 --- /dev/null +++ b/ispc_test.vcxproj @@ -0,0 +1,88 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + + + + + {92547BA8-BE86-4E78-8799-1D72A70E5831} + Win32Proj + ispc_test + + + + Application + true + Unicode + + + Application + false + true + Unicode + + + + + + + + + + + + + true + + + false + + + + + + Level3 + Disabled + _DEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(LLVM_INSTALL_DIR)/include + + + Console + true + $(LLVM_INSTALL_DIR)/lib + LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies) + + + + + Level3 + + + MaxSpeed + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + $(LLVM_INSTALL_DIR)/include + + + Console + true + true + true + $(LLVM_INSTALL_DIR)/lib + 
LLVMAnalysis.lib;LLVMArchive.lib;LLVMAsmPrinter.lib;LLVMBitReader.lib;LLVMBitWriter.lib;LLVMCodeGen.lib;LLVMCore.lib;LLVMExecutionEngine.lib;LLVMInstCombine.lib;LLVMInstrumentation.lib;LLVMipa.lib;LLVMipo.lib;LLVMJIT.lib;LLVMLinker.lib;LLVMMC.lib;LLVMMCParser.lib;LLVMObject.lib;LLVMScalarOpts.lib;LLVMSelectionDAG.lib;LLVMSupport.lib;LLVMTarget.lib;LLVMTransformUtils.lib;LLVMX86ASMPrinter.lib;LLVMX86ASMParser.lib;LLVMX86Utils.lib;LLVMX86CodeGen.lib;LLVMX86Disassembler.lib;LLVMX86Info.lib;%(AdditionalDependencies) + + + + + + \ No newline at end of file diff --git a/lex.ll b/lex.ll new file mode 100644 index 00000000..327ac144 --- /dev/null +++ b/lex.ll @@ -0,0 +1,426 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +%{ + +#include "ispc.h" +#include "decl.h" +#include "parse.hh" +#include "sym.h" +#include "util.h" +#include "module.h" + +static uint32_t lParseBinary(const char *ptr, SourcePos pos); +static void lCComment(SourcePos *); +static void lCppComment(SourcePos *); +static void lHandleCppHash(SourcePos *); +static void lStringConst(YYSTYPE *, SourcePos *); + +#define YY_USER_ACTION \ + yylloc->first_line = yylloc->last_line; \ + yylloc->first_column = yylloc->last_column; \ + yylloc->last_column += yyleng; + +#ifdef ISPC_IS_WINDOWS +inline int isatty(int) { return 0; } +#endif // ISPC_IS_WINDOWS + +%} + +%option nounput +%option noyywrap +%option bison-bridge +%option bison-locations +%option nounistd + +WHITESPACE [ \t\r]+ +INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+)) +FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?)|([-]?0x[01]\.?[0-9a-fA-F]+p[-+]?[0-9]+[fF]?) 
+ +IDENT [a-zA-Z_][a-zA-Z_0-9]* + +%% +"/*" { lCComment(yylloc); } +"//" { lCppComment(yylloc); } + +bool { return TOKEN_BOOL; } +break { return TOKEN_BREAK; } +case { return TOKEN_CASE; } +cbreak { return TOKEN_CBREAK; } +ccontinue { return TOKEN_CCONTINUE; } +cdo { return TOKEN_CDO; } +cfor { return TOKEN_CFOR; } +char { return TOKEN_CHAR; } +cif { return TOKEN_CIF; } +cwhile { return TOKEN_CWHILE; } +const { return TOKEN_CONST; } +continue { return TOKEN_CONTINUE; } +creturn { return TOKEN_CRETURN; } +default { return TOKEN_DEFAULT; } +do { return TOKEN_DO; } +double { return TOKEN_DOUBLE; } +else { return TOKEN_ELSE; } +enum { return TOKEN_ENUM; } +export { return TOKEN_EXPORT; } +extern { return TOKEN_EXTERN; } +false { return TOKEN_FALSE; } +float { return TOKEN_FLOAT; } +for { return TOKEN_FOR; } +goto { return TOKEN_GOTO; } +if { return TOKEN_IF; } +inline { return TOKEN_INLINE; } +int { return TOKEN_INT; } +int32 { return TOKEN_INT; } +int64 { return TOKEN_INT64; } +launch { return TOKEN_LAUNCH; } +print { return TOKEN_PRINT; } +reference { return TOKEN_REFERENCE; } +return { return TOKEN_RETURN; } +soa { return TOKEN_SOA; } +static { return TOKEN_STATIC; } +struct { return TOKEN_STRUCT; } +switch { return TOKEN_SWITCH; } +sync { return TOKEN_SYNC; } +task { return TOKEN_TASK; } +true { return TOKEN_TRUE; } +typedef { return TOKEN_TYPEDEF; } +uniform { return TOKEN_UNIFORM; } +unsigned { return TOKEN_UNSIGNED; } +varying { return TOKEN_VARYING; } +void { return TOKEN_VOID; } +while { return TOKEN_WHILE; } + +L?\"(\\.|[^\\"])*\" { lStringConst(yylval, yylloc); return TOKEN_STRING_LITERAL; } + +{IDENT} { + /* We have an identifier--is it a type name or an identifier? + The symbol table will straighten us out... 
*/ + yylval->stringVal = new std::string(yytext); + if (m->symbolTable->LookupType(yytext) != NULL) + return TOKEN_TYPE_NAME; + else + return TOKEN_IDENTIFIER; +} + +{INT_NUMBER} { + char *endPtr = NULL; +#ifdef ISPC_IS_WINDOWS + unsigned long val; +#else + unsigned long long val; +#endif + + if (yytext[0] == '0' && yytext[1] == 'b') + val = lParseBinary(yytext+2, *yylloc); + else { +#ifdef ISPC_IS_WINDOWS + val = strtoul(yytext, &endPtr, 0); +#else + val = strtoull(yytext, &endPtr, 0); +#endif + } + yylval->int32Val = (int32_t)val; + if (val != (unsigned int)yylval->int32Val) + Warning(*yylloc, "32-bit integer has insufficient bits to represent value %s (%x %llx)", + yytext, yylval->int32Val, (unsigned long long)val); + return TOKEN_INT_CONSTANT; +} + +{INT_NUMBER}[uU] { + char *endPtr = NULL; +#ifdef ISPC_IS_WINDOWS + unsigned long val; +#else + unsigned long long val; +#endif + + if (yytext[0] == '0' && yytext[1] == 'b') + val = lParseBinary(yytext+2, *yylloc); + else { +#ifdef ISPC_IS_WINDOWS + val = strtoul(yytext, &endPtr, 0); +#else + val = strtoull(yytext, &endPtr, 0); +#endif + } + + yylval->int32Val = (int32_t)val; + if (val != (unsigned int)yylval->int32Val) + Warning(*yylloc, "32-bit integer has insufficient bits to represent value %s (%x %llx)", + yytext, yylval->int32Val, (unsigned long long)val); + return TOKEN_UINT_CONSTANT; +} + +{FLOAT_NUMBER} { + /* FIXME: need to implement a hex float constant parser so that we can + support them on Windows (which doesn't handle them in its atof() + implementation... 
*/ + yylval->floatVal = atof(yytext); + return TOKEN_FLOAT_CONSTANT; +} + +"++" { return TOKEN_INC_OP; } +"--" { return TOKEN_DEC_OP; } +"<<" { return TOKEN_LEFT_OP; } +">>" { return TOKEN_RIGHT_OP; } +"<=" { return TOKEN_LE_OP; } +">=" { return TOKEN_GE_OP; } +"==" { return TOKEN_EQ_OP; } +"!=" { return TOKEN_NE_OP; } +"&&" { return TOKEN_AND_OP; } +"||" { return TOKEN_OR_OP; } +"*=" { return TOKEN_MUL_ASSIGN; } +"/=" { return TOKEN_DIV_ASSIGN; } +"%=" { return TOKEN_MOD_ASSIGN; } +"+=" { return TOKEN_ADD_ASSIGN; } +"-=" { return TOKEN_SUB_ASSIGN; } +"<<=" { return TOKEN_LEFT_ASSIGN; } +">>=" { return TOKEN_RIGHT_ASSIGN; } +"&=" { return TOKEN_AND_ASSIGN; } +"^=" { return TOKEN_XOR_ASSIGN; } +"|=" { return TOKEN_OR_ASSIGN; } +";" { return ';'; } +("{"|"<%") { return '{'; } +("}"|"%>") { return '}'; } +"," { return ','; } +":" { return ':'; } +"=" { return '='; } +"(" { return '('; } +")" { return ')'; } +("["|"<:") { return '['; } +("]"|":>") { return ']'; } +"." { return '.'; } +"&" { return '&'; } +"!" { return '!'; } +"~" { return '~'; } +"-" { return '-'; } +"+" { return '+'; } +"*" { return '*'; } +"/" { return '/'; } +"%" { return '%'; } +"<" { return '<'; } +">" { return '>'; } +"^" { return '^'; } +"|" { return '|'; } +"?" { return '?'; } + +{WHITESPACE} { } + +\n { + yylloc->last_line++; + yylloc->last_column = 1; +} + +#(line)?[ ][0-9]+[ ]\"(\\.|[^\\"])*\"[^\n]* { + lHandleCppHash(yylloc); +} + +. { + Error(*yylloc, "Illegal character: %c (0x%x)", yytext[0], int(yytext[0])); + YY_USER_ACTION +} + +%% + +/*sizeof { return TOKEN_SIZEOF; }*/ +/*"->" { return TOKEN_PTR_OP; }*/ +/*short { return TOKEN_SHORT; }*/ +/*long { return TOKEN_LONG; }*/ +/*signed { return TOKEN_SIGNED; }*/ +/*volatile { return TOKEN_VOLATILE; }*/ +/*"long"[ \t\v\f\n]+"long" { return TOKEN_LONGLONG; }*/ +/*union { return TOKEN_UNION; }*/ +/*"..." { return TOKEN_ELLIPSIS; }*/ + +/** Return the integer version of a binary constant from a string. 
+ */ +static uint32_t +lParseBinary(const char *ptr, SourcePos pos) { + uint32_t val = 0; + bool warned = false; + + while (*ptr != '\0') { + /* if this hits, the regexp for 0b... constants is broken */ + assert(*ptr == '0' || *ptr == '1'); + + if ((val & (1<<31)) && warned == false) { + // We're about to shift out a set bit + // FIXME: 64-bit int constants... + Warning(pos, "Can't represent binary constant with 32-bit integer type"); + warned = true; + } + + val = (val << 1) | (*ptr == '0' ? 0 : 1); + ++ptr; + } + return val; +} + + +/** Handle a C-style comment in the source. + */ +static void +lCComment(SourcePos *pos) { + char c, prev = 0; + + while ((c = yyinput()) != 0) { + if (c == '\n') { + pos->last_line++; + pos->last_column = 1; + } + if (c == '/' && prev == '*') + return; + prev = c; + } + Error(*pos, "unterminated comment"); +} + +/** Handle a C++-style comment--eat everything up until the end of the line. + */ +static void +lCppComment(SourcePos *pos) { + char c; + do { + c = yyinput(); + } while (c != 0 && c != '\n'); + if (c == '\n') { + pos->last_line++; + pos->last_column = 1; + } +} + +/** Handle a line that starts with a # character; this should be something + left behind by the preprocessor indicating the source file/line + that our current position corresponds to. + */ +static void lHandleCppHash(SourcePos *pos) { + char *ptr, *src; + + // Advance past the opening stuff on the line. + assert(yytext[0] == '#'); + if (yytext[1] == ' ') + // On Linux/OSX, the preprocessor gives us lines like + // # 1234 "foo.c" + ptr = yytext + 2; + else { + // On windows, cl.exe's preprocessor gives us lines of the form: + // #line 1234 "foo.c" + assert(!strncmp(yytext+1, "line ", 5)); + ptr = yytext + 6; + } + + // Now we can set the line number based on the integer in the string + // that ptr is pointing at. 
+ pos->last_line = strtol(ptr, &src, 10) - 1; + pos->last_column = 1; + // Make sure that the character after the integer is a space and that + // then we have open quotes + assert(src != ptr && src[0] == ' ' && src[1] == '"'); + src += 2; + + // And the filename is everything up until the closing quotes + std::string filename; + while (*src != '"') { + assert(*src && *src != '\n'); + filename.push_back(*src); + ++src; + } + pos->name = strdup(filename.c_str()); +} + + +/** Given a pointer to a position in a string, return the character that it + represents, accounting for the escape characters supported in string + constants. (i.e. given the literal string "\\", return the character + '/'). The return value is the new position in the string and the + decoded character is returned in *pChar. +*/ +static char * +lEscapeChar(char *str, char *pChar, SourcePos *pos) +{ + if (*str != '\\') { + *pChar = *str; + } + else { + char *tail; + ++str; + switch (*str) { + case '\'': *pChar = '\''; break; + case '\"': *pChar = '\"'; break; + case '?': *pChar = '\?'; break; + case '\\': *pChar = '\\'; break; + case 'a': *pChar = '\a'; break; + case 'b': *pChar = '\b'; break; + case 'f': *pChar = '\f'; break; + case 'n': *pChar = '\n'; break; + case 'r': *pChar = '\r'; break; + case 't': *pChar = '\t'; break; + case 'v': *pChar = '\v'; break; + // octal constants \012 + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': + *pChar = strtol(str, &tail, 8); + str = tail - 1; + break; + // hexidecimal constant \xff + case 'x': + *pChar = strtol(str, &tail, 16); + str = tail - 1; + break; + default: + Error(*pos, "Bad character escape sequence: '%s'\n.", str); + break; + } + } + ++str; + return str; +} + + +/** Parse a string constant in the source file. For each character in the + string, handle any escaped characters with lEscapeChar() and keep eating + characters until we come to the closing quote. 
+*/ +static void +lStringConst(YYSTYPE *yylval, SourcePos *pos) +{ + char *p; + std::string str; + p = strchr(yytext, '"') + 1; + while (*p != '\"') { + char cval; + p = lEscapeChar(p, &cval, pos); + str.push_back(cval); + } + yylval->stringVal = new std::string(str); +} diff --git a/llvmutil.cpp b/llvmutil.cpp new file mode 100644 index 00000000..e0fc4511 --- /dev/null +++ b/llvmutil.cpp @@ -0,0 +1,329 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +/** @file llvmutil.cpp + @brief Implementations of various LLVM utility types and classes. +*/ + +#include "llvmutil.h" +#include "type.h" + +const llvm::Type *LLVMTypes::VoidType = NULL; +const llvm::PointerType *LLVMTypes::VoidPointerType = NULL; +const llvm::Type *LLVMTypes::BoolType = NULL; +const llvm::Type *LLVMTypes::Int8Type = NULL; +const llvm::Type *LLVMTypes::Int16Type = NULL; +const llvm::Type *LLVMTypes::Int32Type = NULL; +const llvm::Type *LLVMTypes::Int32PointerType = NULL; +const llvm::Type *LLVMTypes::Int64Type = NULL; +const llvm::Type *LLVMTypes::Int64PointerType = NULL; +const llvm::Type *LLVMTypes::FloatType = NULL; +const llvm::Type *LLVMTypes::FloatPointerType = NULL; +const llvm::Type *LLVMTypes::DoubleType = NULL; + +const llvm::VectorType *LLVMTypes::MaskType = NULL; +const llvm::VectorType *LLVMTypes::BoolVectorType = NULL; +const llvm::VectorType *LLVMTypes::Int1VectorType = NULL; +const llvm::VectorType *LLVMTypes::Int32VectorType = NULL; +const llvm::Type *LLVMTypes::Int32VectorPointerType = NULL; +const llvm::VectorType *LLVMTypes::Int64VectorType = NULL; +const llvm::Type *LLVMTypes::Int64VectorPointerType = NULL; +const llvm::VectorType *LLVMTypes::FloatVectorType = NULL; +const llvm::Type *LLVMTypes::FloatVectorPointerType = NULL; +const llvm::VectorType *LLVMTypes::DoubleVectorType = NULL; +const llvm::ArrayType *LLVMTypes::VoidPointerVectorType = NULL; + +llvm::Constant *LLVMTrue = NULL; +llvm::Constant *LLVMFalse = NULL; +llvm::Constant *LLVMMaskAllOn = NULL; +llvm::Constant *LLVMMaskAllOff = NULL; + + +void +InitLLVMUtil(llvm::LLVMContext *ctx, Target target) { + LLVMTypes::VoidType = llvm::Type::getVoidTy(*ctx); + LLVMTypes::VoidPointerType = llvm::PointerType::get(llvm::Type::getInt8Ty(*ctx), 0); + LLVMTypes::BoolType = llvm::Type::getInt1Ty(*ctx); + LLVMTypes::Int8Type = llvm::Type::getInt8Ty(*ctx); + LLVMTypes::Int16Type = llvm::Type::getInt16Ty(*ctx); + LLVMTypes::Int32Type = llvm::Type::getInt32Ty(*ctx); + 
LLVMTypes::Int32PointerType = llvm::PointerType::get(LLVMTypes::Int32Type, 0); + LLVMTypes::Int64Type = llvm::Type::getInt64Ty(*ctx); + LLVMTypes::Int64PointerType = llvm::PointerType::get(LLVMTypes::Int64Type, 0); + LLVMTypes::FloatType = llvm::Type::getFloatTy(*ctx); + LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0); + LLVMTypes::DoubleType = llvm::Type::getDoubleTy(*ctx); + + // Note that both the mask and bool vectors are vector of int32s + // (not i1s). LLVM ends up generating much better SSE code with + // this representation. + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.vectorWidth); + + LLVMTypes::Int1VectorType = + llvm::VectorType::get(llvm::Type::getInt1Ty(*ctx), target.vectorWidth); + LLVMTypes::Int32VectorType = + llvm::VectorType::get(LLVMTypes::Int32Type, target.vectorWidth); + LLVMTypes::Int32VectorPointerType = llvm::PointerType::get(LLVMTypes::Int32VectorType, 0); + LLVMTypes::Int64VectorType = + llvm::VectorType::get(LLVMTypes::Int64Type, target.vectorWidth); + LLVMTypes::Int64VectorPointerType = llvm::PointerType::get(LLVMTypes::Int64VectorType, 0); + LLVMTypes::FloatVectorType = + llvm::VectorType::get(LLVMTypes::FloatType, target.vectorWidth); + LLVMTypes::FloatVectorPointerType = llvm::PointerType::get(LLVMTypes::FloatVectorType, 0); + LLVMTypes::DoubleVectorType = + llvm::VectorType::get(LLVMTypes::DoubleType, target.vectorWidth); + LLVMTypes::VoidPointerVectorType = + llvm::ArrayType::get(LLVMTypes::VoidPointerType, target.vectorWidth); + + LLVMTrue = llvm::ConstantInt::getTrue(*ctx); + LLVMFalse = llvm::ConstantInt::getFalse(*ctx); + + std::vector maskOnes; + llvm::Constant *onMask = NULL; + onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1, + true /*signed*/); // 0xffffffff + + for (int i = 0; i < target.vectorWidth; ++i) + maskOnes.push_back(onMask); + LLVMMaskAllOn = llvm::ConstantVector::get(LLVMTypes::MaskType, maskOnes); + 
+ std::vector maskZeros; + llvm::Constant *offMask = NULL; + offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0, + true /*signed*/); + + for (int i = 0; i < target.vectorWidth; ++i) + maskZeros.push_back(offMask); + LLVMMaskAllOff = llvm::ConstantVector::get(LLVMTypes::MaskType, maskZeros); +} + + +llvm::ConstantInt *LLVMInt32(int32_t ival) { + return llvm::ConstantInt::get(llvm::Type::getInt32Ty(*g->ctx), ival, + true /*signed*/); +} + + +llvm::ConstantInt * +LLVMUInt32(uint32_t ival) { + return llvm::ConstantInt::get(llvm::Type::getInt32Ty(*g->ctx), ival, + false /*unsigned*/); +} + + +llvm::ConstantInt * +LLVMInt64(int64_t ival) { + return llvm::ConstantInt::get(llvm::Type::getInt64Ty(*g->ctx), ival, + true /*signed*/); +} + + +llvm::ConstantInt * +LLVMUInt64(uint64_t ival) { + return llvm::ConstantInt::get(llvm::Type::getInt64Ty(*g->ctx), ival, + false /*unsigned*/); +} + + +llvm::Constant * +LLVMFloat(float fval) { + return llvm::ConstantFP::get(llvm::Type::getFloatTy(*g->ctx), fval); +} + + +llvm::Constant * +LLVMDouble(double dval) { + return llvm::ConstantFP::get(llvm::Type::getDoubleTy(*g->ctx), dval); +} + + +llvm::Constant * +LLVMInt32Vector(int32_t ival) { + llvm::Constant *v = LLVMInt32(ival); + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(v); + return llvm::ConstantVector::get(LLVMTypes::Int32VectorType, vals); +} + + +llvm::Constant * +LLVMInt32Vector(const int32_t *ivec) { + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(LLVMInt32(ivec[i])); + return llvm::ConstantVector::get(LLVMTypes::Int32VectorType, vals); +} + + +llvm::Constant * +LLVMUInt32Vector(uint32_t ival) { + llvm::Constant *v = LLVMUInt32(ival); + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(v); + return llvm::ConstantVector::get(LLVMTypes::Int32VectorType, vals); +} + + +llvm::Constant * +LLVMUInt32Vector(const uint32_t *ivec) { + std::vector vals; + for 
(int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(LLVMUInt32(ivec[i])); + return llvm::ConstantVector::get(LLVMTypes::Int32VectorType, vals); +} + + +llvm::Constant * +LLVMFloatVector(float fval) { + llvm::Constant *v = LLVMFloat(fval); + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(v); + return llvm::ConstantVector::get(LLVMTypes::FloatVectorType, vals); +} + + +llvm::Constant * +LLVMFloatVector(const float *fvec) { + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(LLVMFloat(fvec[i])); + return llvm::ConstantVector::get(LLVMTypes::FloatVectorType, vals); +} + + +llvm::Constant * +LLVMDoubleVector(double dval) { + llvm::Constant *v = LLVMDouble(dval); + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(v); + return llvm::ConstantVector::get(LLVMTypes::DoubleVectorType, vals); +} + + +llvm::Constant * +LLVMDoubleVector(const double *dvec) { + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(LLVMDouble(dvec[i])); + return llvm::ConstantVector::get(LLVMTypes::DoubleVectorType, vals); +} + + +llvm::Constant * +LLVMInt64Vector(int64_t ival) { + llvm::Constant *v = LLVMInt64(ival); + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(v); + return llvm::ConstantVector::get(LLVMTypes::Int64VectorType, vals); +} + + +llvm::Constant * +LLVMInt64Vector(const int64_t *ivec) { + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(LLVMInt64(ivec[i])); + return llvm::ConstantVector::get(LLVMTypes::Int64VectorType, vals); +} + + +llvm::Constant * +LLVMUInt64Vector(uint64_t ival) { + llvm::Constant *v = LLVMUInt64(ival); + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(v); + return llvm::ConstantVector::get(LLVMTypes::Int64VectorType, vals); +} + + +llvm::Constant * +LLVMUInt64Vector(const uint64_t *ivec) { + std::vector 
vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(LLVMUInt64(ivec[i])); + return llvm::ConstantVector::get(LLVMTypes::Int64VectorType, vals); +} + + +llvm::Constant * +LLVMBoolVector(bool b) { + llvm::Constant *v; + if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int32Type, b ? 0xffffffff : 0, + false /*unsigned*/); + else { + assert(LLVMTypes::BoolVectorType->getElementType() == + llvm::Type::getInt1Ty(*g->ctx)); + v = b ? LLVMTrue : LLVMFalse; + } + + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) + vals.push_back(v); + return llvm::ConstantVector::get(LLVMTypes::BoolVectorType, vals); +} + + +llvm::Constant * +LLVMBoolVector(const bool *bvec) { + std::vector vals; + for (int i = 0; i < g->target.vectorWidth; ++i) { + llvm::Constant *v; + if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int32Type, bvec[i] ? 0xffffffff : 0, + false /*unsigned*/); + else { + assert(LLVMTypes::BoolVectorType->getElementType() == + llvm::Type::getInt1Ty(*g->ctx)); + v = bvec[i] ? LLVMTrue : LLVMFalse; + } + + vals.push_back(v); + } + return llvm::ConstantVector::get(LLVMTypes::BoolVectorType, vals); +} + + +const llvm::ArrayType * +LLVMPointerVectorType(const llvm::Type *t) { + // NOTE: ArrayType, not VectorType + return llvm::ArrayType::get(llvm::PointerType::get(t, 0), + g->target.vectorWidth); +} diff --git a/llvmutil.h b/llvmutil.h new file mode 100644 index 00000000..3a5a4e4c --- /dev/null +++ b/llvmutil.h @@ -0,0 +1,157 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/** @file llvmutil.h + @brief Header file with declarations for various LLVM utility stuff +*/ + +#ifndef ISPC_LLVMUTIL_H +#define ISPC_LLVMUTIL_H 1 + +#include "ispc.h" +#include +#include +#include +#include + +/** This structure holds pointers to a variety of LLVM types; code + elsewhere can use them from here, ratherthan needing to make more + verbose LLVM API calls. 
+ */ +struct LLVMTypes { + static const llvm::Type *VoidType; + static const llvm::PointerType *VoidPointerType; + static const llvm::Type *BoolType; + static const llvm::Type *Int8Type; + static const llvm::Type *Int16Type; + static const llvm::Type *Int32Type; + static const llvm::Type *Int32PointerType; + static const llvm::Type *Int64Type; + static const llvm::Type *Int64PointerType; + static const llvm::Type *FloatType; + static const llvm::Type *FloatPointerType; + static const llvm::Type *DoubleType; + + static const llvm::VectorType *MaskType; + static const llvm::VectorType *BoolVectorType; + static const llvm::VectorType *Int1VectorType; + static const llvm::VectorType *Int32VectorType; + static const llvm::Type *Int32VectorPointerType; + static const llvm::VectorType *Int64VectorType; + static const llvm::Type *Int64VectorPointerType; + static const llvm::VectorType *FloatVectorType; + static const llvm::Type *FloatVectorPointerType; + static const llvm::VectorType *DoubleVectorType; + static const llvm::ArrayType *VoidPointerVectorType; +}; + +/** These variables hold the corresponding LLVM constant values as a + convenience to code elsewhere in the system. + */ +extern llvm::Constant *LLVMTrue, *LLVMFalse; + +/** This should be called early in initialization to initialize the members + of LLVMTypes and the LLVMTrue/LLVMFalse constants. However, it can't + be called until the compilation target is known. 
+ */ +extern void InitLLVMUtil(llvm::LLVMContext *ctx, Target target); + +/** Returns an LLVM i32 constant of the given value */ +extern llvm::ConstantInt *LLVMInt32(int32_t i); +/** Returns an LLVM i32 constant of the given value */ +extern llvm::ConstantInt *LLVMUInt32(uint32_t i); +/** Returns an LLVM i64 constant of the given value */ +extern llvm::ConstantInt *LLVMInt64(int64_t i); +/** Returns an LLVM i64 constant of the given value */ +extern llvm::ConstantInt *LLVMUInt64(uint64_t i); +/** Returns an LLVM float constant of the given value */ +extern llvm::Constant *LLVMFloat(float f); +/** Returns an LLVM double constant of the given value */ +extern llvm::Constant *LLVMDouble(double f); + +/** Returns an LLVM boolean vector constant of the given value smeared + across all elements */ +extern llvm::Constant *LLVMBoolVector(bool v); +/** Returns an LLVM i32 vector constant of the given value smeared + across all elements */ +extern llvm::Constant *LLVMInt32Vector(int32_t i); +/** Returns an LLVM i32 vector constant of the given value smeared + across all elements */ +extern llvm::Constant *LLVMUInt32Vector(uint32_t i); +/** Returns an LLVM i64 vector constant of the given value smeared + across all elements */ +extern llvm::Constant *LLVMInt64Vector(int64_t i); +/** Returns an LLVM i64 vector constant of the given value smeared + across all elements */ +extern llvm::Constant *LLVMUInt64Vector(uint64_t i); +/** Returns an LLVM float vector constant of the given value smeared + across all elements */ +extern llvm::Constant *LLVMFloatVector(float f); +/** Returns an LLVM double vector constant of the given value smeared + across all elements */ +extern llvm::Constant *LLVMDoubleVector(double f); + +/** Returns an LLVM boolean vector based on the given array of values. + The array should have g->target.vectorWidth elements. */ +extern llvm::Constant *LLVMBoolVector(const bool *v); +/** Returns an LLVM i32 vector based on the given array of values. 
+ The array should have g->target.vectorWidth elements. */ +extern llvm::Constant *LLVMInt32Vector(const int32_t *i); +/** Returns an LLVM i32 vector based on the given array of values. + The array should have g->target.vectorWidth elements. */ +extern llvm::Constant *LLVMUInt32Vector(const uint32_t *i); +/** Returns an LLVM i64 vector based on the given array of values. + The array should have g->target.vectorWidth elements. */ +extern llvm::Constant *LLVMInt64Vector(const int64_t *i); +/** Returns an LLVM i64 vector based on the given array of values. + The array should have g->target.vectorWidth elements. */ +extern llvm::Constant *LLVMUInt64Vector(const uint64_t *i); +/** Returns an LLVM float vector based on the given array of values. + The array should have g->target.vectorWidth elements. */ +extern llvm::Constant *LLVMFloatVector(const float *f); +/** Returns an LLVM double vector based on the given array of values. + The array should have g->target.vectorWidth elements. */ +extern llvm::Constant *LLVMDoubleVector(const double *f); + +/** LLVM constant value representing an 'all on' SIMD lane mask */ +extern llvm::Constant *LLVMMaskAllOn; +/** LLVM constant value representing an 'all off' SIMD lane mask */ +extern llvm::Constant *LLVMMaskAllOff; + +/** Given an LLVM type, returns the corresponding type for a vector of + pointers to that type. (In practice, an array of pointers, since LLVM + prohibits vectors of pointers. + */ +extern const llvm::ArrayType *LLVMPointerVectorType(const llvm::Type *t); + +#endif // ISPC_LLVMUTIL_H diff --git a/main.cpp b/main.cpp new file mode 100644 index 00000000..09034828 --- /dev/null +++ b/main.cpp @@ -0,0 +1,330 @@ +/* + Copyright (c) 2010-2011, Intel Corporation + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +/** @file main.cpp + @brief main() entrypoint implementation for ispc +*/ + +#include "ispc.h" +#include "module.h" +#include +#include +#ifdef LLVM_2_8 +#include +#else +#include +#endif + +#ifdef ISPC_IS_WINDOWS +#define strcasecmp stricmp +#define BUILD_DATE __DATE__ +#define BUILD_VERSION "" +#endif // ISPC_IS_WINDOWS + +static void usage(int ret) { + printf("This is the Intel(r) SPMD Program Compiler (ispc), build %s (%s)\n\n", BUILD_DATE, BUILD_VERSION); + printf("usage: ispc\n"); + printf(" [--arch={x86,x86-64}]\t\tSelect target architecture\n"); + printf(" [--cpu=]\t\t\tSelect target CPU type\n"); + printf(" (atom, barcelona, core2, corei7, corei7-avx, istanbul, nocona,\n"); + printf(" penryn, westmere)\n"); +#ifndef ISPC_IS_WINDOWS + printf(" [-D]\t\t\t\t#define value when running preprocessor\n"); +#endif + printf(" [--debug]\t\t\t\tPrint information useful for debugging ispc\n"); + printf(" [--emit-asm]\t\t\tGenerate assembly language file as output\n"); + printf(" [--emit-llvm]\t\t\tEmit LLVM bitode file as output\n"); + printf(" [--emit-obj]\t\t\tGenerate object file file as output\n"); + printf(" [--fast-math]\t\t\tPerform non-IEEE-compliant optimizations of numeric expressions\n"); + printf(" [-g]\t\t\t\tGenerate debugging information\n"); + printf(" [--help]\t\t\t\tPrint help\n"); + printf(" [-h] \t\t\t\tOutput filename for header\n"); + printf(" [--instrument]\t\t\tEmit instrumentation to gather performance data\n"); + printf(" [--math-lib=