Add support for ARM NEON targets.

Initial support for ARM NEON on Cortex-A9 and A15 CPUs. All but ~10 tests pass, and all examples compile and run correctly. Most of the examples show a ~2x speedup on a single A15 core versus scalar code. Current open issues/TODOs: - Code quality looks decent, but hasn't been carefully examined. Known issues/opportunities for improvement include: - fp32 vector divide is done as a series of scalar divides rather than a vector divide (which I believe exists, but I may be mistaken). This is particularly harmful to examples/rt, which only runs ~1.5x faster with ispc, likely due to long chains of scalar divides. - The compiler isn't generating a vmin.f32 for e.g. the final scalar min in reduce_min(); instead it's generating a compare and then a select instruction (and similarly elsewhere). - There are some additional FIXMEs in builtins/target-neon.ll that include both a few pieces of missing functionality (e.g. rounding doubles) as well as places that deserve attention for possible code quality improvements. - Currently only the "cortex-a9" and "cortex-a15" CPU targets are supported; LLVM supports many other ARM CPUs and ispc should provide access to all of the ones that have NEON support (and aren't too obscure). - ~5 of the reduce-* tests hit an assertion inside LLVM (unfortunately only when the compiler runs on an ARM host, though). - The Windows build hasn't been tested (though I've tried to update ispc.vcxproj appropriately). It may just work, but will more likely have various small issues. - Anything related to 64-bit ARM has seen no attention.
2013-07-19 11:06:11 -07:00
parent b007bba59f
commit d7b0c5794e
22 changed files with 914 additions and 67 deletions
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -85,7 +85,7 @@ Module *m;
 ///////////////////////////////////////////////////////////////////////////
 // Target

-#ifndef ISPC_IS_WINDOWS
+#if !defined(ISPC_IS_WINDOWS) && !defined(__arm__)
 static void __cpuid(int info[4], int infoType) {
    __asm__ __volatile__ ("cpuid"
                          : "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3])
@@ -100,11 +100,14 @@ static void __cpuidex(int info[4], int level, int count) {
                        : "=a" (info[0]), "=r" (info[1]), "=c" (info[2]), "=d" (info[3])
                        : "0" (level), "2" (count));
 }
-#endif // ISPC_IS_WINDOWS
+#endif // !ISPC_IS_WINDOWS && !__ARM__


 static const char *
 lGetSystemISA() {
+#ifdef __arm__
+    return "neon";
+#else
    int info[4];
    __cpuid(info, 1);

@@ -133,10 +136,15 @@ lGetSystemISA() {
        fprintf(stderr, "Unable to detect supported SSE/AVX ISA.  Exiting.\n");
        exit(1);
    }
+#endif
 }


 static const char *supportedCPUs[] = {
+    // FIXME: LLVM supports a ton of different ARM CPU variants--not just
+    // cortex-a9 and a15.  We should be able to handle any of them that also
+    // have NEON support.
+    "cortex-a9", "cortex-a15",
    "atom", "penryn", "core2", "corei7", "corei7-avx"
 #if !defined(LLVM_3_1)
    , "core-avx-i", "core-avx2"
@@ -177,6 +185,9 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
            // possible ISA based on that.
            if (!strcmp(cpu, "core-avx2"))
                isa = "avx2";
+            else if (!strcmp(cpu, "cortex-a9") ||
+                     !strcmp(cpu, "cortex-a15"))
+                isa = "neon";
            else if (!strcmp(cpu, "core-avx-i"))
                isa = "avx1.1";
            else if (!strcmp(cpu, "sandybridge") ||
@@ -200,6 +211,13 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
        }
    }

+#if !defined(__arm__)
+    if (cpu == NULL && !strcmp(isa, "neon"))
+        // If we're compiling NEON on an x86 host and the CPU wasn't
+        // supplied, don't go and set the CPU based on the host...
+        cpu = "cortex-a9";
+#endif
+
    if (cpu == NULL) {
        std::string hostCPU = llvm::sys::getHostCPUName();
        if (hostCPU.size() > 0)
@@ -227,8 +245,12 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :

    this->m_cpu = cpu;

-    if (arch == NULL)
-        arch = "x86-64";
+    if (arch == NULL) {
+        if (!strcmp(isa, "neon"))
+            arch = "arm";
+        else
+            arch = "x86-64";
+    }

    bool error = false;

@@ -423,6 +445,15 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
        this->m_hasGather = true;
 #endif
    }
+    else if (!strcasecmp(isa, "neon")) {
+        this->m_isa = Target::NEON;
+        this->m_nativeVectorWidth = 4;
+        this->m_vectorWidth = 4;
+        this->m_attributes = "+neon,+fp16";
+        this->m_hasHalf = true; // ??
+        this->m_maskingIsFree = false;
+        this->m_maskBitCount = 32;
+    }
    else {
        fprintf(stderr, "Target ISA \"%s\" is unknown.  Choices are: %s\n",
                isa, SupportedTargetISAs());
@@ -437,6 +468,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
            llvm::Reloc::Default;
        std::string featuresString = m_attributes;
        llvm::TargetOptions options;
+        if (m_isa == Target::NEON)
+            options.FloatABIType = llvm::FloatABI::Hard;
 #if !defined(LLVM_3_1)
        if (g->opt.disableFMA == false)
            options.AllowFPOpFusion = llvm::FPOpFusion::Fast;
@@ -528,13 +561,13 @@ Target::SupportedTargetCPUs() {

 const char *
 Target::SupportedTargetArchs() {
-    return "x86, x86-64";
+    return "arm, x86, x86-64";
 }


 const char *
 Target::SupportedTargetISAs() {
-    return "sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2"
+    return "neon, sse2, sse2-x2, sse4, sse4-x2, avx, avx-x2"
        ", avx1.1, avx1.1-x2, avx2, avx2-x2"
        ", generic-1, generic-4, generic-8, generic-16, generic-32";
 }
@@ -565,6 +598,8 @@ Target::GetTripleString() const {
 const char *
 Target::ISAToString(ISA isa) {
    switch (isa) {
+    case Target::NEON:
+        return "neon";
    case Target::SSE2:
        return "sse2";
    case Target::SSE4: