Add support for ARM NEON targets.
Initial support for ARM NEON on Cortex-A9 and A15 CPUs. All but ~10 tests
pass, and all examples compile and run correctly. Most of the examples
show a ~2x speedup on a single A15 core versus scalar code.
Current open issues/TODOs:
- Code quality looks decent, but hasn't been carefully examined. Known
issues/opportunities for improvement include:
- fp32 vector divide is done as a series of scalar divides rather than
a vector divide (which I believe exists, but I may be mistaken.)
This is particularly harmful to examples/rt, which only runs ~1.5x
faster with ispc, likely due to long chains of scalar divides.
- The compiler isn't generating a vmin.f32 for e.g. the final scalar
min in reduce_min(); instead it's generating a compare and then a
select instruction (and similarly elsewhere).
- There are some additional FIXMEs in builtins/target-neon.ll that
include both a few pieces of missing functionality (e.g. rounding
doubles) as well as places that deserve attention for possible
code quality improvements.
- Currently only the "cortex-a9" and "cortex-a15" CPU targets are
supported; LLVM supports many other ARM CPUs and ispc should provide
access to all of the ones that have NEON support (and aren't too
obscure.)
- ~5 of the reduce-* tests hit an assertion inside LLVM (unfortunately
only when the compiler runs on an ARM host, though).
- The Windows build hasn't been tested (though I've tried to update
ispc.vcxproj appropriately). It may just work, but will more likely
have various small issues.
- Anything related to 64-bit ARM has seen no attention.
This commit is contained in:
47
ispc.cpp
47
ispc.cpp
@@ -85,7 +85,7 @@ Module *m;
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Target
|
||||
|
||||
#ifndef ISPC_IS_WINDOWS
|
||||
#if !defined(ISPC_IS_WINDOWS) && !defined(__arm__)
|
||||
static void __cpuid(int info[4], int infoType) {
|
||||
__asm__ __volatile__ ("cpuid"
|
||||
: "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
|
||||
@@ -100,11 +100,14 @@ static void __cpuidex(int info[4], int level, int count) {
|
||||
: "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3])
|
||||
: "0" (level), "2" (count));
|
||||
}
|
||||
#endif // ISPC_IS_WINDOWS
|
||||
#endif // !ISPC_IS_WINDOWS && !__arm__
|
||||
|
||||
|
||||
static const char *
|
||||
lGetSystemISA() {
|
||||
#ifdef __arm__
|
||||
return "neon";
|
||||
#else
|
||||
int info[4];
|
||||
__cpuid(info, 1);
|
||||
|
||||
@@ -133,10 +136,15 @@ lGetSystemISA() {
|
||||
fprintf(stderr, "Unable to detect supported SSE/AVX ISA. Exiting.\n");
|
||||
exit(1);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
static const char *supportedCPUs[] = {
|
||||
// FIXME: LLVM supports a ton of different ARM CPU variants--not just
|
||||
// cortex-a9 and a15. We should be able to handle any of them that also
|
||||
// have NEON support.
|
||||
"cortex-a9", "cortex-a15",
|
||||
"atom", "penryn", "core2", "corei7", "corei7-avx"
|
||||
#if !defined(LLVM_3_1)
|
||||
, "core-avx-i", "core-avx2"
|
||||
@@ -177,6 +185,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
||||
// possible ISA based on that.
|
||||
if (!strcmp(cpu, "core-avx2"))
|
||||
isa = "avx2";
|
||||
else if (!strcmp(cpu, "cortex-a9") ||
|
||||
!strcmp(cpu, "cortex-a15"))
|
||||
isa = "neon";
|
||||
else if (!strcmp(cpu, "core-avx-i"))
|
||||
isa = "avx1.1";
|
||||
else if (!strcmp(cpu, "sandybridge") ||
|
||||
@@ -200,6 +211,13 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__arm__)
|
||||
if (cpu == NULL && !strcmp(isa, "neon"))
|
||||
// If we're compiling NEON on an x86 host and the CPU wasn't
|
||||
// supplied, don't go and set the CPU based on the host...
|
||||
cpu = "cortex-a9";
|
||||
#endif
|
||||
|
||||
if (cpu == NULL) {
|
||||
std::string hostCPU = llvm::sys::getHostCPUName();
|
||||
if (hostCPU.size() > 0)
|
||||
@@ -227,8 +245,12 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
||||
|
||||
this->m_cpu = cpu;
|
||||
|
||||
if (arch == NULL)
|
||||
arch = "x86-64";
|
||||
if (arch == NULL) {
|
||||
if (!strcmp(isa, "neon"))
|
||||
arch = "arm";
|
||||
else
|
||||
arch = "x86-64";
|
||||
}
|
||||
|
||||
bool error = false;
|
||||
|
||||
@@ -423,6 +445,15 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
||||
this->m_hasGather = true;
|
||||
#endif
|
||||
}
|
||||
else if (!strcasecmp(isa, "neon")) {
|
||||
this->m_isa = Target::NEON;
|
||||
this->m_nativeVectorWidth = 4;
|
||||
this->m_vectorWidth = 4;
|
||||
this->m_attributes = "+neon,+fp16";
|
||||
this->m_hasHalf = true; // ??
|
||||
this->m_maskingIsFree = false;
|
||||
this->m_maskBitCount = 32;
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "Target ISA \"%s\" is unknown. Choices are: %s\n",
|
||||
isa, SupportedTargetISAs());
|
||||
@@ -437,6 +468,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
|
||||
llvm::Reloc::Default;
|
||||
std::string featuresString = m_attributes;
|
||||
llvm::TargetOptions options;
|
||||
if (m_isa == Target::NEON)
|
||||
options.FloatABIType = llvm::FloatABI::Hard;
|
||||
#if !defined(LLVM_3_1)
|
||||
if (g->opt.disableFMA == false)
|
||||
options.AllowFPOpFusion = llvm::FPOpFusion::Fast;
|
||||
@@ -528,13 +561,13 @@ Target::SupportedTargetCPUs() {
|
||||
|
||||
const char *
|
||||
Target::SupportedTargetArchs() {
|
||||
return "x86, x86-64";
|
||||
return "arm, x86, x86-64";
|
||||
}
|
||||
|
||||
|
||||
const char *
|
||||
Target::SupportedTargetISAs() {
|
||||
return "sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2"
|
||||
return "neon, sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2"
|
||||
", avx1.1, avx1.1-x2, avx2, avx2-x2"
|
||||
", generic-1, generic-4, generic-8, generic-16, generic-32";
|
||||
}
|
||||
@@ -565,6 +598,8 @@ Target::GetTripleString() const {
|
||||
const char *
|
||||
Target::ISAToString(ISA isa) {
|
||||
switch (isa) {
|
||||
case Target::NEON:
|
||||
return "neon";
|
||||
case Target::SSE2:
|
||||
return "sse2";
|
||||
case Target::SSE4:
|
||||
|
||||
Reference in New Issue
Block a user