Merge branch 'master' into arm

Conflicts:
	Makefile
	builtins.cpp
	ispc.cpp
	ispc.h
	ispc.vcxproj
	opt.cpp
This commit is contained in:
Matt Pharr
2013-08-06 17:09:48 -07:00
15 changed files with 298 additions and 123 deletions

View File

@@ -39,6 +39,10 @@
LLVM_CONFIG=$(shell which llvm-config)
CLANG_INCLUDE=$(shell $(LLVM_CONFIG) --includedir)
# Enable ARM by request
# To enable: make ARM_ENABLED=1
ARM_ENABLED=0
# Add llvm bin to the path so any scripts run will go to the right llvm-config
LLVM_BIN= $(shell $(LLVM_CONFIG) --bindir)
export PATH:=$(LLVM_BIN):$(PATH)
@@ -55,12 +59,15 @@ LLVM_CXXFLAGS=$(shell $(LLVM_CONFIG) --cppflags)
LLVM_VERSION=LLVM_$(shell $(LLVM_CONFIG) --version | sed -e s/\\./_/ -e s/svn//)
LLVM_VERSION_DEF=-D$(LLVM_VERSION)
LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker arm
LLVM_COMPONENTS = engine ipo bitreader bitwriter instrumentation linker
# Component "option" was introduced in 3.3 and starting with 3.4 it is required for the link step.
# We check if it's available before adding it (to not break 3.2 and earlier).
ifeq ($(shell $(LLVM_CONFIG) --components |grep -c option), 1)
LLVM_COMPONENTS+=option
endif
ifneq ($(ARM_ENABLED), 0)
LLVM_COMPONENTS+=arm
endif
LLVM_LIBS=$(shell $(LLVM_CONFIG) --libs $(LLVM_COMPONENTS))
CLANG=clang
@@ -104,6 +111,9 @@ OPT=-O2
CXXFLAGS=$(OPT) $(LLVM_CXXFLAGS) -I. -Iobjs/ -I$(CLANG_INCLUDE) \
-Wall $(LLVM_VERSION_DEF) \
-DBUILD_DATE="\"$(BUILD_DATE)\"" -DBUILD_VERSION="\"$(BUILD_VERSION)\""
ifneq ($(ARM_ENABLED), 0)
CXXFLAGS+=-DISPC_ARM_ENABLED
endif
LDFLAGS=
ifeq ($(ARCH_OS),Linux)
@@ -122,10 +132,12 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
type.cpp util.cpp
HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
opt.h stmt.h sym.h type.h util.h
TARGETS=neon-32 neon-16 neon-8 \
avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
sse2 sse2-x2 sse4 sse4-x2 sse4-8 sse4-16 \
generic-1 generic-4 generic-8 generic-16 generic-32 generic-64
TARGETS=avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \
generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
ifneq ($(ARM_ENABLED), 0)
TARGETS+=neon-32 neon-16 neon-8
endif
# These files need to be compiled in two versions - 32 and 64 bits.
BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS)))
# These are files to be compiled in single version.
@@ -139,7 +151,7 @@ BISON_SRC=parse.yy
FLEX_SRC=lex.ll
OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \
stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \
stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \
$(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))
default: ispc
@@ -264,4 +276,3 @@ objs/stdlib_mask32_ispc.cpp: stdlib.ispc
@echo Creating C++ source from $< for mask32
@$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926536 $< -o - | \
python stdlib2cpp.py mask32 > $@

View File

@@ -631,7 +631,7 @@ AddBitcodeToModule(const unsigned char *bitcode, int length,
llvm::Triple bcTriple(bcModule->getTargetTriple());
Debug(SourcePos(), "module triple: %s\nbitcode triple: %s\n",
mTriple.str().c_str(), bcTriple.str().c_str());
#ifndef __arm__
#if defined(ISPC_ARM_ENABLED) && !defined(__arm__)
// FIXME: More ugly and dangerous stuff. We really haven't set up
// proper build and runtime infrastructure for ispc to do
// cross-compilation, yet it's at minimum useful to be able to emit
@@ -812,6 +812,8 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
// Next, add the target's custom implementations of the various needed
// builtin functions (e.g. __masked_store_32(), etc).
switch (g->target->getISA()) {
#ifdef ISPC_ARM_ENABLED
case Target::NEON8: {
if (runtime32) {
EXPORT_MODULE(builtins_bitcode_neon_8_32bit);
@@ -839,6 +841,7 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
}
break;
}
#endif
case Target::SSE2: {
switch (g->target->getVectorWidth()) {
case 4:

View File

@@ -1,7 +1,7 @@
EXAMPLE=mandelbrot
CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp
ISPC_SRC=mandelbrot.ispc
EXAMPLE=mandelbrot_tasks
CPP_SRC=mandelbrot_tasks.cpp mandelbrot_tasks_serial.cpp
ISPC_SRC=mandelbrot_tasks.ispc
ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2
ISPC_ARM_TARGETS=neon

View File

@@ -42,7 +42,7 @@
#include <algorithm>
#include <string.h>
#include "../timing.h"
#include "mandelbrot_ispc.h"
#include "mandelbrot_tasks_ispc.h"
using namespace ispc;
extern void mandelbrot_serial(float x0, float y0, float x1, float y1,

View File

@@ -21,7 +21,7 @@
<PropertyGroup Label="Globals">
<ProjectGuid>{E80DA7D4-AB22-4648-A068-327307156BE6}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>mandelbrot</RootNamespace>
<RootNamespace>mandelbrot_tasks</RootNamespace>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
@@ -65,22 +65,22 @@
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
<TargetName>mandelbrot</TargetName>
<TargetName>mandelbrot_tasks</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
<TargetName>mandelbrot</TargetName>
<TargetName>mandelbrot_tasks</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
<TargetName>mandelbrot</TargetName>
<TargetName>mandelbrot_tasks</TargetName>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ProjectDir)..\..;$(ExecutablePath)</ExecutablePath>
<TargetName>mandelbrot</TargetName>
<TargetName>mandelbrot_tasks</TargetName>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
@@ -153,12 +153,12 @@
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="mandelbrot.cpp" />
<ClCompile Include="mandelbrot_serial.cpp" />
<ClCompile Include="mandelbrot_tasks.cpp" />
<ClCompile Include="mandelbrot_tasks_serial.cpp" />
<ClCompile Include="../tasksys.cpp" />
</ItemGroup>
<ItemGroup>
<CustomBuild Include="mandelbrot.ispc">
<CustomBuild Include="mandelbrot_tasks.ispc">
<FileType>Document</FileType>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">ispc -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=sse2,sse4-x2,avx-x2
</Command>

View File

@@ -22,7 +22,7 @@ mandelbrot
#***
Mandelbrot Set
mandelbrot_tasks
mandelbrot
mandelbrot_tasks
^
#***
Perlin Noise Function

View File

@@ -73,10 +73,19 @@ def cpu_get():
#returns cpu_usage
def cpu_check():
if is_windows == False:
cpu1 = cpu_get()
time.sleep(1)
cpu2 = cpu_get()
cpu_percent = (float(cpu1[0] - cpu2[0])/float(cpu1[1] - cpu2[1]))*100
if is_mac == False:
cpu1 = cpu_get()
time.sleep(1)
cpu2 = cpu_get()
cpu_percent = (float(cpu1[0] - cpu2[0])/float(cpu1[1] - cpu2[1]))*100
else:
os.system("sysctl -n vm.loadavg > cpu_temp")
c = open("cpu_temp", 'r')
c_line = c.readline()
c.close
os.remove("cpu_temp")
R = c_line.split(' ')
cpu_percent = float(R[1]) * 3
else:
os.system("wmic cpu get loadpercentage /value > cpu_temp")
c = open("cpu_temp", 'r')
@@ -143,6 +152,8 @@ parser.add_option('-p', '--path', dest='path',
global is_windows
is_windows = (platform.system() == 'Windows' or
'CYGWIN_NT' in platform.system())
global is_mac
is_mac = (platform.system() == 'Darwin')
# save current path
pwd = os.getcwd()

View File

@@ -365,7 +365,7 @@ lAtomicCompareAndSwap32(volatile int32_t *v, int32_t newValue, int32_t oldValue)
static inline int32_t
lAtomicAdd(volatile int32_t *v, int32_t delta) {
#ifdef ISPC_IS_WINDOWS
return InterlockedAdd((volatile LONG *)v, delta);
return InterlockedExchangeAdd((volatile LONG *)v, delta)+delta;
#else
return __sync_fetch_and_add(v, delta);
#endif

View File

@@ -141,10 +141,12 @@ lGetSystemISA() {
static const char *supportedCPUs[] = {
#ifdef ISPC_ARM_ENABLED
// FIXME: LLVM supports a ton of different ARM CPU variants--not just
// cortex-a9 and a15. We should be able to handle any of them that also
// have NEON support.
"cortex-a9", "cortex-a15",
#endif
"atom", "penryn", "core2", "corei7", "corei7-avx"
#if !defined(LLVM_3_1)
, "core-avx-i", "core-avx2"
@@ -185,9 +187,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
// possible ISA based on that.
if (!strcmp(cpu, "core-avx2"))
isa = "avx2";
#ifdef ISPC_ARM_ENABLED
else if (!strcmp(cpu, "cortex-a9") ||
!strcmp(cpu, "cortex-a15"))
isa = "neon-32";
#endif
else if (!strcmp(cpu, "core-avx-i"))
isa = "avx1.1";
else if (!strcmp(cpu, "sandybridge") ||
@@ -211,7 +215,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
}
}
#if !defined(__arm__)
#if defined(ISPC_ARM_ENABLED) && !defined(__arm__)
if (cpu == NULL && !strncmp(isa, "neon", 4))
// If we're compiling NEON on an x86 host and the CPU wasn't
// supplied, don't go and set the CPU based on the host...
@@ -246,9 +250,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
this->m_cpu = cpu;
if (arch == NULL) {
#ifdef ISPC_ARM_ENABLED
if (!strncmp(isa, "neon", 4))
arch = "arm";
else
#endif
arch = "x86-64";
}
@@ -461,6 +467,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
this->m_hasGather = true;
#endif
}
#ifdef ISPC_ARM_ENABLED
else if (!strcasecmp(isa, "neon-8")) {
this->m_isa = Target::NEON8;
this->m_nativeVectorWidth = 16;
@@ -488,6 +495,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
this->m_maskingIsFree = false;
this->m_maskBitCount = 32;
}
#endif
else {
fprintf(stderr, "Target ISA \"%s\" is unknown. Choices are: %s\n",
isa, SupportedTargetISAs());
@@ -502,9 +510,11 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
llvm::Reloc::Default;
std::string featuresString = m_attributes;
llvm::TargetOptions options;
#ifdef ISPC_ARM_ENABLED
if (m_isa == Target::NEON8 || m_isa == Target::NEON16 ||
m_isa == Target::NEON32)
options.FloatABIType = llvm::FloatABI::Hard;
#endif
#if !defined(LLVM_3_1)
if (g->opt.disableFMA == false)
options.AllowFPOpFusion = llvm::FPOpFusion::Fast;
@@ -596,13 +606,21 @@ Target::SupportedTargetCPUs() {
const char *
Target::SupportedTargetArchs() {
return "arm, x86, x86-64";
return
#ifdef ISPC_ARM_ENABLED
"arm, "
#endif
"x86, x86-64";
}
const char *
Target::SupportedTargetISAs() {
return "neon-8, neon-16, neon-32, sse2, sse2-x2, sse4, sse4-8, sse4-16, sse4-x2, "
return
#ifdef ISPC_ARM_ENABLED
"neon-8, neon-16, neon-32, "
#endif
"sse2, sse2-x2, sse4, sse4-8, sse4-16, sse4-x2, "
"avx, avx-x2, avx1.1, avx1.1-x2, avx2, avx2-x2, "
"generic-1, generic-4, generic-8, generic-16, generic-32";
}
@@ -611,10 +629,13 @@ Target::SupportedTargetISAs() {
std::string
Target::GetTripleString() const {
llvm::Triple triple;
#ifdef ISPC_ARM_ENABLED
if (m_arch == "arm") {
triple.setTriple("armv7-eabi");
}
else {
else
#endif
{
// Start with the host triple as the default
triple.setTriple(llvm::sys::getDefaultTargetTriple());
@@ -637,12 +658,14 @@ Target::GetTripleString() const {
const char *
Target::ISAToString(ISA isa) {
switch (isa) {
#ifdef ISPC_ARM_ENABLED
case Target::NEON8:
return "neon-8";
case Target::NEON16:
return "neon-16";
case Target::NEON32:
return "neon-32";
#endif
case Target::SSE2:
return "sse2";
case Target::SSE4:
@@ -813,6 +836,7 @@ Globals::Globals() {
includeStdlib = true;
runCPP = true;
debugPrint = false;
debugIR = -1;
disableWarnings = false;
warningsAsErrors = false;
quiet = false;

20
ispc.h
View File

@@ -59,6 +59,7 @@
#include <stdlib.h>
#include <stdio.h>
#include <vector>
#include <set>
#include <string>
/** @def ISPC_MAX_NVEC maximum vector size of any of the compilation
@@ -66,6 +67,9 @@
*/
#define ISPC_MAX_NVEC 64
// Number of the final optimization phase
#define LAST_OPT_NUMBER 1000
// Forward declarations of a number of widely-used LLVM types
namespace llvm {
class AttributeSet;
@@ -175,7 +179,11 @@ public:
flexible/performant of them will appear last in the enumerant. Note
also that __best_available_isa() needs to be updated if ISAs are
added or the enumerant values are reordered. */
enum ISA { NEON32, NEON16, NEON8, SSE2, SSE4, AVX, AVX11, AVX2, GENERIC,
enum ISA {
#ifdef ISPC_ARM_ENABLED
NEON32, NEON16, NEON8,
#endif
SSE2, SSE4, AVX, AVX11, AVX2, GENERIC,
NUM_ISAS };
/** Initializes the given Target pointer for a target of the given
@@ -495,6 +503,16 @@ struct Globals {
ispc's execution. */
bool debugPrint;
/** Indicates which stages of optimization we want to dump. */
std::set<int> debug_stages;
/** Indicates after which optimization we want to generate
DebugIR information. */
int debugIR;
/** Indicates which phases of optimization we want to switch off. */
std::set<int> off_stages;
/** Indicates whether all warning messages should be suppressed. */
bool disableWarnings;

View File

@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
@@ -45,12 +45,6 @@
<ClCompile Include="$(Configuration)\gen-bitcode-generic-32-64bit.cpp" />
<ClCompile Include="$(Configuration)\gen-bitcode-generic-64-32bit.cpp" />
<ClCompile Include="$(Configuration)\gen-bitcode-generic-64-64bit.cpp" />
<ClCompile Include="$(Configuration)\gen-bitcode-neon-8-32bit.cpp" />
<ClCompile Include="$(Configuration)\gen-bitcode-neon-8-64bit.cpp" />
<ClCompile Include="$(Configuration)\gen-bitcode-neon-16-32bit.cpp" />
<ClCompile Include="$(Configuration)\gen-bitcode-neon-16-64bit.cpp" />
<ClCompile Include="$(Configuration)\gen-bitcode-neon-32-32bit.cpp" />
<ClCompile Include="$(Configuration)\gen-bitcode-neon-32-64bit.cpp" />
<ClCompile Include="$(Configuration)\gen-bitcode-sse2-32bit.cpp" />
<ClCompile Include="$(Configuration)\gen-bitcode-sse2-64bit.cpp" />
<ClCompile Include="$(Configuration)\gen-bitcode-sse2-x2-32bit.cpp" />
@@ -192,59 +186,24 @@
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-neon-8.ll">
<CustomBuild Include="builtins\target-avx1.ll">
<FileType>Document</FileType>
<Command>m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-8.ll | python bitcode2cpp.py builtins\target-neon-8.ll 32bit &gt; $(Configuration)/gen-bitcode-neon-8-32bit.cpp</Command>
<Outputs>$(Configuration)/gen-bitcode-neon-8-32bit.cpp</Outputs>
<AdditionalInputs>builtins\util.m4;builtins\target-neon-common.ll</AdditionalInputs>
<Message>Building gen-bitcode-neon-8-32bit.cpp</Message>
<Command>m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit &gt; $(Configuration)/gen-bitcode-avx1-32bit.cpp</Command>
<Outputs>$(Configuration)/gen-bitcode-avx1-32bit.cpp</Outputs>
<AdditionalInputs>builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
<Message>Building gen-bitcode-avx1-32bit.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-neon-8.ll">
<CustomBuild Include="builtins\target-avx1.ll">
<FileType>Document</FileType>
<Command>m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-8.ll | python bitcode2cpp.py builtins\target-neon-8.ll 64bit &gt; $(Configuration)/gen-bitcode-neon-8-64bit.cpp</Command>
<Outputs>$(Configuration)/gen-bitcode-neon-8-64bit.cpp</Outputs>
<AdditionalInputs>builtins\util.m4;builtins\target-neon-common.ll</AdditionalInputs>
<Message>Building gen-bitcode-neon-8-64bit.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-neon-16.ll">
<FileType>Document</FileType>
<Command>m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-16.ll | python bitcode2cpp.py builtins\target-neon-16.ll 32bit &gt; $(Configuration)/gen-bitcode-neon-16-32bit.cpp</Command>
<Outputs>$(Configuration)/gen-bitcode-neon-16-32bit.cpp</Outputs>
<AdditionalInputs>builtins\util.m4;builtins\target-neon-common.ll</AdditionalInputs>
<Message>Building gen-bitcode-neon-16-32bit.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-neon-16.ll">
<FileType>Document</FileType>
<Command>m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-16.ll | python bitcode2cpp.py builtins\target-neon-16.ll 64bit &gt; $(Configuration)/gen-bitcode-neon-16-64bit.cpp</Command>
<Outputs>$(Configuration)/gen-bitcode-neon-16-64bit.cpp</Outputs>
<AdditionalInputs>builtins\util.m4;builtins\target-neon-common.ll</AdditionalInputs>
<Message>Building gen-bitcode-neon-16-64bit.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-neon-32.ll">
<FileType>Document</FileType>
<Command>m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-32.ll | python bitcode2cpp.py builtins\target-neon-32.ll 32bit &gt; $(Configuration)/gen-bitcode-neon-32-32bit.cpp</Command>
<Outputs>$(Configuration)/gen-bitcode-neon-32-32bit.cpp</Outputs>
<AdditionalInputs>builtins\util.m4;builtins\target-neon-common.ll</AdditionalInputs>
<Message>Building gen-bitcode-neon-32-32bit.cpp</Message>
</CustomBuild>
</ItemGroup>
<ItemGroup>
<CustomBuild Include="builtins\target-neon-32.ll">
<FileType>Document</FileType>
<Command>m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-32.ll | python bitcode2cpp.py builtins\target-neon-32.ll 64bit &gt; $(Configuration)/gen-bitcode-neon-32-64bit.cpp</Command>
<Outputs>$(Configuration)/gen-bitcode-neon-32-64bit.cpp</Outputs>
<AdditionalInputs>builtins\util.m4;builtins\target-neon-common.ll</AdditionalInputs>
<Message>Building gen-bitcode-neon-32-64bit.cpp</Message>
<Command>m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit &gt; $(Configuration)/gen-bitcode-avx1-64bit.cpp</Command>
<Outputs>$(Configuration)/gen-bitcode-avx1-64bit.cpp</Outputs>
<AdditionalInputs>builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll</AdditionalInputs>
<Message>Building gen-bitcode-avx1-64bit.cpp</Message>
</CustomBuild>
</ItemGroup>
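<!-- NOTE(review): a stray merge-conflict separator was removed here. The avx1 item groups that follow repeat the two directly above, and the closing conflict marker further down is still present; both need cleaning up. -->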
<ItemGroup>
<CustomBuild Include="builtins\target-avx1.ll">
<FileType>Document</FileType>
@@ -263,6 +222,7 @@
<Message>Building gen-bitcode-avx1-64bit.cpp</Message>
</CustomBuild>
</ItemGroup>
>>>>>>> master
<ItemGroup>
<CustomBuild Include="builtins\target-avx1-x2.ll">
<FileType>Document</FileType>

View File

@@ -155,6 +155,11 @@ devUsage(int ret) {
printf(" disable-uniform-control-flow\t\tDisable uniform control flow optimizations\n");
printf(" disable-uniform-memory-optimizations\tDisable uniform-based coherent memory access\n");
printf(" [--yydebug]\t\t\t\tPrint debugging information during parsing\n");
printf(" [--debug-phase=<value>]\t\tSet optimization phases to dump. --debug-phase=first,210:220,300,305,310:last\n");
#ifdef LLVM_3_4
printf(" [--debug-ir=<value>]\t\tSet optimization phase to generate debugIR after it\n");
#endif
printf(" [--off-phase=<value>]\t\tSwitch off optimization phases. --off-phase=first,210:220,300,305,310:last\n");
exit(ret);
}
@@ -211,6 +216,47 @@ lSignal(void *) {
}
/** Converts the phase token at the start of @p stage into a phase number.

    Only a prefix of the string is examined, so the token may be followed
    by further text (for example the ':' or ',' separators of a phase
    list):
      - a string starting with "first" yields 0;
      - a string starting with "last" yields LAST_OPT_NUMBER;
      - anything else is read with atoi(), which stops at the first
        character that is not part of a number and yields 0 when no
        number is present.

    @param stage  NUL-terminated text positioned at the token to decode.
    @return       The phase number, in the range [0, LAST_OPT_NUMBER].

    A number outside that range is reported on stderr and terminates the
    process with a non-zero exit status.
 */
static int ParsingPhaseName(char * stage) {
    if (strncmp(stage, "first", 5) == 0)
        return 0;
    if (strncmp(stage, "last", 4) == 0)
        return LAST_OPT_NUMBER;

    int t = atoi(stage);
    if (t < 0 || t > LAST_OPT_NUMBER) {
        fprintf(stderr, "Phases must be from 0 to %d. %s is incorrect.\n", LAST_OPT_NUMBER, stage);
        // Invalid command-line input is a failure, so do not report
        // success to the calling shell or build script.
        exit(1);
    }
    return t;
}
/** Expands a phase-list argument into the set of phase numbers it names.

    The list is a comma-separated sequence of entries, each either a
    single phase or a "begin:end" range whose bounds are both included,
    e.g. "first,210:220,300,305,310:last".  Individual tokens are decoded
    by ParsingPhaseName().

    @param stages  NUL-terminated list text (the part after the '=' of the
                   command-line option).
    @return        Every phase number covered by the list; empty when
                   @p stages is the empty string.
 */
static std::set<int> ParsingPhases(char * stages) {
    std::set<int> phases;

    // The string is never modified below, so measure it once instead of
    // on every loop iteration.
    const size_t length = strlen(stages);

    int begin = ParsingPhaseName(stages);
    int end = begin;

    for (size_t i = 0; i < length; i++) {
        const bool atLastChar = (i == length - 1);

        if (stages[i] == ',' || atLastChar) {
            // The current entry is complete: record begin..end inclusive.
            for (int phase = begin; phase <= end; phase++)
                phases.insert(phase);

            // Start the next entry right after this character.  On the
            // final character this decodes the terminating empty string;
            // the result is never used because the loop ends.
            begin = ParsingPhaseName(stages + i + 1);
            end = begin;
        }
        else if (stages[i] == ':') {
            // Range separator: what follows is the upper bound.
            end = ParsingPhaseName(stages + i + 1);
        }
    }

    return phases;
}
static void
lParseInclude(const char *path) {
#ifdef ISPC_IS_WINDOWS
@@ -253,6 +299,8 @@ int main(int Argc, char *Argv[]) {
LLVMInitializeX86Disassembler();
LLVMInitializeX86TargetMC();
#endif // !__ARM__
#ifdef ISPC_ARM_ENABLED
// Generating ARM from x86 is more likely to be useful, though.
LLVMInitializeARMTargetInfo();
LLVMInitializeARMTarget();
@@ -260,6 +308,7 @@ int main(int Argc, char *Argv[]) {
LLVMInitializeARMAsmParser();
LLVMInitializeARMDisassembler();
LLVMInitializeARMTargetMC();
#endif
char *file = NULL;
const char *headerFileName = NULL;
@@ -486,6 +535,20 @@ int main(int Argc, char *Argv[]) {
}
hostStubFileName = argv[i];
}
else if (strncmp(argv[i], "--debug-phase=", 14) == 0) {
    // Adjacent string literals are joined verbatim, so every piece but
    // the last must carry its own trailing space; otherwise the words
    // at the seams run together in the printed warning.
    fprintf(stderr, "WARNING: Adding debug phases may change the way PassManager "
                    "handles the phases and it may possibly make some bugs go "
                    "away or introduce the new ones.\n");
    // Everything after the '=' is the list of phases to dump.
    g->debug_stages = ParsingPhases(argv[i] + strlen("--debug-phase="));
}
#ifdef LLVM_3_4
else if (strncmp(argv[i], "--debug-ir=", 11) == 0) {
g->debugIR = ParsingPhaseName(argv[i] + strlen("--debug-ir="));
}
#endif
else if (strncmp(argv[i], "--off-phase=", 12) == 0) {
g->off_stages = ParsingPhases(argv[i] + strlen("--off-phase="));
}
else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) {
lPrintVersion();
return 0;

145
opt.cpp
View File

@@ -63,6 +63,9 @@
#include <llvm/IR/BasicBlock.h>
#include <llvm/IR/Constants.h>
#endif
#if defined (LLVM_3_4)
#include <llvm/Transforms/Instrumentation.h>
#endif
#include <llvm/PassManager.h>
#include <llvm/PassRegistry.h>
#include <llvm/Assembly/PrintModulePass.h>
@@ -119,6 +122,8 @@ static llvm::Pass *CreateReplacePseudoMemoryOpsPass();
static llvm::Pass *CreateIsCompileTimeConstantPass(bool isLastTry);
static llvm::Pass *CreateMakeInternalFuncsStaticPass();
static llvm::Pass *CreateDebugPass(char * output);
#define DEBUG_START_PASS(NAME) \
if (g->debugPrint && \
(getenv("FUNC") == NULL || \
@@ -395,6 +400,54 @@ lGetMaskStatus(llvm::Value *mask, int vecWidth = -1) {
}
///////////////////////////////////////////////////////////////////////////
/** Thin wrapper around llvm::PassManager.

    run() is forwarded unchanged to the wrapped manager, while add()
    assigns every pass a phase number and uses it to decide:
      - whether the pass is skipped (its number is in g->off_stages);
      - whether a dump of the LLVM IR is scheduled right after it (its
        number is in g->debug_stages);
      - whether debug-IR generation for gdb is scheduled right after it
        (its number equals g->debugIR; LLVM 3.4 builds only).
 */
class DebugPassManager {
public:
    /** Starts the phase counter at 0. */
    DebugPassManager() : number(0) { }

    /** Registers pass P under phase number `stage`; see the definition
        for how the number is chosen when none is given. */
    void add(llvm::Pass * P, int stage);

    /** Runs all registered passes on M and returns the wrapped
        manager's result. */
    bool run(llvm::Module& M) {
        return PM.run(M);
    }

    /** Gives direct access to the wrapped manager, for APIs that need a
        real llvm::PassManager. */
    llvm::PassManager& getPM() {
        return PM;
    }

private:
    /** The pass manager that actually owns and runs the passes. */
    llvm::PassManager PM;
    /** Phase number given to the most recently added pass. */
    int number;
};
/** Registers pass P with the wrapped PassManager under a phase number.

    @param P      Pass to register.
    @param stage  Explicit phase number for P, or -1 (the default) to use
                  the number following the previously added pass.

    Nothing is registered when the phase number is listed in
    g->off_stages.  Otherwise P is added, followed by an IR-dump pass if
    the number is listed in g->debug_stages and, in LLVM 3.4 builds, by a
    DebugIR pass if the number equals g->debugIR.
 */
void
DebugPassManager::add(llvm::Pass * P, int stage = -1) {
    // An explicit stage resets the counter; -1 continues counting from
    // the previous pass.
    if (stage == -1)
        number++;
    else
        number = stage;

    // Phase switched off on the command line: skip the pass together
    // with any debug output that would have followed it.
    // NOTE(review): P is not handed to the PassManager on this path, so
    // presumably nothing frees it -- confirm whether it should be deleted.
    if (g->off_stages.find(number) != g->off_stages.end())
        return;

    PM.add(P);

    if (g->debug_stages.find(number) != g->debug_stages.end()) {
        // Schedule a dump of the LLVM IR right after this pass.  The pass
        // name has no fixed length, so bound the write to the buffer; an
        // over-long banner is truncated instead of overrunning the stack.
        // NOTE(review): snprintf is unavailable in MSVC before VS2015 --
        // confirm against the Windows toolchain in use.
        char buf[100];
        snprintf(buf, sizeof(buf), "\n\n*****LLVM IR after phase %d: %s*****\n\n",
                 number, P->getPassName());
        PM.add(CreateDebugPass(buf));
    }

#ifdef LLVM_3_4
    if (g->debugIR == number) {
        // Schedule generation of debug IR right after this pass.
        char buf[100];
        snprintf(buf, sizeof(buf), "Debug_IR_after_%d_phase.bc", number);
        PM.add(llvm::createDebugIRPass(true, true, ".", buf));
    }
#endif
}
///////////////////////////////////////////////////////////////////////////
void
@@ -403,14 +456,8 @@ Optimize(llvm::Module *module, int optLevel) {
printf("*** Code going into optimization ***\n");
module->dump();
}
llvm::PassManager optPM;
optPM.add(llvm::createVerifierPass());
#if 0
std::string err;
optPM.add(llvm::createPrintModulePass(new llvm::raw_fd_ostream("-", err)));
#endif
DebugPassManager optPM;
optPM.add(llvm::createVerifierPass(),0);
llvm::TargetLibraryInfo *targetLibraryInfo =
new llvm::TargetLibraryInfo(llvm::Triple(module->getTargetTriple()));
@@ -427,7 +474,7 @@ Optimize(llvm::Module *module, int optLevel) {
optPM.add(new llvm::TargetTransformInfo(targetMachine->getScalarTargetTransformInfo(),
targetMachine->getVectorTargetTransformInfo()));
#else // LLVM 3.3+
targetMachine->addAnalysisPasses(optPM);
targetMachine->addAnalysisPasses(optPM.getPM());
#endif
#endif
@@ -439,11 +486,11 @@ Optimize(llvm::Module *module, int optLevel) {
// run absolutely no optimizations, since the front-end needs us to
// take the various __pseudo_* functions it has emitted and turn
// them into something that can actually execute.
optPM.add(CreateImproveMemoryOpsPass());
optPM.add(CreateImproveMemoryOpsPass(), 100);
if (g->opt.disableHandlePseudoMemoryOps == false)
optPM.add(CreateReplacePseudoMemoryOpsPass());
optPM.add(CreateIntrinsicsOptPass());
optPM.add(CreateIntrinsicsOptPass(), 102);
optPM.add(CreateIsCompileTimeConstantPass(true));
optPM.add(llvm::createFunctionInliningPass());
optPM.add(CreateMakeInternalFuncsStaticPass());
@@ -462,7 +509,7 @@ Optimize(llvm::Module *module, int optLevel) {
llvm::initializeInstrumentation(*registry);
llvm::initializeTarget(*registry);
optPM.add(llvm::createGlobalDCEPass());
optPM.add(llvm::createGlobalDCEPass(), 200);
// Early optimizations to try to reduce the total amount of code to
// work with if we can
@@ -476,14 +523,14 @@ Optimize(llvm::Module *module, int optLevel) {
if (g->opt.disableGatherScatterOptimizations == false &&
g->target->getVectorWidth() > 1) {
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createInstructionCombiningPass(), 210);
optPM.add(CreateImproveMemoryOpsPass());
}
if (!g->opt.disableMaskAllOnOptimizations) {
optPM.add(CreateIntrinsicsOptPass());
optPM.add(CreateIntrinsicsOptPass(), 215);
optPM.add(CreateInstructionSimplifyPass());
}
optPM.add(llvm::createDeadInstEliminationPass());
optPM.add(llvm::createDeadInstEliminationPass(), 220);
// Max struct size threshold for scalar replacement is
// 1) 4 fields (r,g,b,w)
@@ -513,10 +560,10 @@ Optimize(llvm::Module *module, int optLevel) {
#if defined(LLVM_3_1) || defined(LLVM_3_2) || defined(LLVM_3_3)
// Starting from 3.4 this functionality was moved to
// InstructionCombiningPass. See r184459 for details.
optPM.add(llvm::createSimplifyLibCallsPass());
optPM.add(llvm::createSimplifyLibCallsPass(), 240);
#endif
optPM.add(llvm::createAggressiveDCEPass());
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createInstructionCombiningPass(), 241);
optPM.add(llvm::createJumpThreadingPass());
optPM.add(llvm::createCFGSimplificationPass());
optPM.add(llvm::createScalarReplAggregatesPass(sr_threshold));
@@ -524,44 +571,45 @@ Optimize(llvm::Module *module, int optLevel) {
optPM.add(llvm::createTailCallEliminationPass());
if (!g->opt.disableMaskAllOnOptimizations) {
optPM.add(CreateIntrinsicsOptPass());
optPM.add(CreateIntrinsicsOptPass(), 250);
optPM.add(CreateInstructionSimplifyPass());
}
if (g->opt.disableGatherScatterOptimizations == false &&
g->target->getVectorWidth() > 1) {
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createInstructionCombiningPass(), 255);
optPM.add(CreateImproveMemoryOpsPass());
if (g->opt.disableCoalescing == false &&
g->target->getISA() != Target::GENERIC) {
// It is important to run this here to make it easier to
// finding matching gathers we can coalesce..
optPM.add(llvm::createEarlyCSEPass());
optPM.add(llvm::createEarlyCSEPass(), 260);
optPM.add(CreateGatherCoalescePass());
}
}
optPM.add(llvm::createFunctionInliningPass());
optPM.add(llvm::createFunctionInliningPass(), 265);
optPM.add(llvm::createConstantPropagationPass());
optPM.add(CreateIntrinsicsOptPass());
optPM.add(CreateInstructionSimplifyPass());
if (g->opt.disableGatherScatterOptimizations == false &&
g->target->getVectorWidth() > 1) {
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createInstructionCombiningPass(), 270);
optPM.add(CreateImproveMemoryOpsPass());
}
optPM.add(llvm::createIPSCCPPass());
optPM.add(llvm::createIPSCCPPass(), 275);
optPM.add(llvm::createDeadArgEliminationPass());
optPM.add(llvm::createAggressiveDCEPass());
optPM.add(llvm::createInstructionCombiningPass());
optPM.add(llvm::createCFGSimplificationPass());
if (g->opt.disableHandlePseudoMemoryOps == false)
optPM.add(CreateReplacePseudoMemoryOpsPass());
optPM.add(CreateIntrinsicsOptPass());
if (g->opt.disableHandlePseudoMemoryOps == false) {
optPM.add(CreateReplacePseudoMemoryOpsPass(),280);
}
optPM.add(CreateIntrinsicsOptPass(),281);
optPM.add(CreateInstructionSimplifyPass());
optPM.add(llvm::createFunctionInliningPass());
@@ -579,9 +627,10 @@ Optimize(llvm::Module *module, int optLevel) {
optPM.add(llvm::createIndVarSimplifyPass());
optPM.add(llvm::createLoopIdiomPass());
optPM.add(llvm::createLoopDeletionPass());
if (g->opt.unrollLoops)
optPM.add(llvm::createLoopUnrollPass());
optPM.add(llvm::createGVNPass());
if (g->opt.unrollLoops) {
optPM.add(llvm::createLoopUnrollPass(), 300);
}
optPM.add(llvm::createGVNPass(), 301);
optPM.add(CreateIsCompileTimeConstantPass(true));
optPM.add(CreateIntrinsicsOptPass());
@@ -609,7 +658,7 @@ Optimize(llvm::Module *module, int optLevel) {
// Finish up by making sure we didn't mess anything up in the IR along
// the way.
optPM.add(llvm::createVerifierPass());
optPM.add(llvm::createVerifierPass(), LAST_OPT_NUMBER);
optPM.run(*module);
if (g->debugPrint) {
@@ -4330,6 +4379,42 @@ CreateIsCompileTimeConstantPass(bool isLastTry) {
return new IsCompileTimeConstantPass(isLastTry);
}
//////////////////////////////////////////////////////////////////////////
// DebugPass
/** This pass is inserted into the pass list right after an optimization
    we want to debug.  When run, it prints the banner text it was created
    with (which names the preceding optimization and its number) to
    stderr and then dumps the module's LLVM IR.
 */
class DebugPass : public llvm::ModulePass {
public:
    static char ID;

    /** @param output  NUL-terminated banner printed before the IR dump.
                       The text is copied, so the caller's buffer need not
                       outlive the pass; NULL is treated as an empty
                       banner. */
    DebugPass(char * output)
        : ModulePass(ID), str_output(output != NULL ? output : "") {
    }

    const char *getPassName() const { return "Dump LLVM IR"; }
    bool runOnModule(llvm::Module &m);

private:
    // Owned copy of the banner.  A std::string has no fixed capacity, so
    // a long banner can no longer overrun a fixed-size array.
    std::string str_output;
};

char DebugPass::ID = 0;

/** Prints the banner, then the module dump.  Always returns true, i.e.
    the module is reported to the pass manager as modified. */
bool
DebugPass::runOnModule(llvm::Module &module) {
    fprintf(stderr, "%s", str_output.c_str());
    // Push the banner out before the dump so the two stay in order.
    fflush(stderr);
    module.dump();
    return true;
}
/** Factory for the IR-dumping debug pass.

    @param output  Banner text handed to the DebugPass constructor.
    @return        A newly allocated DebugPass.
 */
static llvm::Pass *
CreateDebugPass(char * output) {
    llvm::Pass *dumpPass = new DebugPass(output);
    return dumpPass;
}
///////////////////////////////////////////////////////////////////////////
// MakeInternalFuncsStaticPass