Add support for ARM NEON targets.
Initial support for ARM NEON on Cortex-A9 and A15 CPUs. All but ~10 tests
pass, and all examples compile and run correctly. Most of the examples
show a ~2x speedup on a single A15 core versus scalar code.
Current open issues/TODOs
- Code quality looks decent, but hasn't been carefully examined. Known
issues/opportunities for improvement include:
- fp32 vector divide is done as a series of scalar divides rather than
a vector divide (which I believe exists, but I may be mistaken.)
This is particularly harmful to examples/rt, which only runs ~1.5x
faster with ispc, likely due to long chains of scalar divides.
- The compiler isn't generating a vmin.f32 for e.g. the final scalar
min in reduce_min(); instead it's generating a compare and then a
select instruction (and similarly elsewhere).
- There are some additional FIXMEs in builtins/target-neon.ll that
include both a few pieces of missing functionality (e.g. rounding
doubles) as well as places that deserve attention for possible
code quality improvements.
- Currently only the "cortex-a9" and "cortex-a15" CPU targets are
supported; LLVM supports many other ARM CPUs and ispc should provide
access to all of the ones that have NEON support (and aren't too
obscure.)
- ~5 of the reduce-* tests hit an assertion inside LLVM (unfortunately
only when the compiler runs on an ARM host, though).
- The Windows build hasn't been tested (though I've tried to update
ispc.vcxproj appropriately). It may just work, but will more likely
have various small issues.
- Anything related to 64-bit ARM has seen no attention.
This commit is contained in:
@@ -2,6 +2,7 @@
|
||||
EXAMPLE=ao
|
||||
CPP_SRC=ao.cpp ao_serial.cpp
|
||||
ISPC_SRC=ao.ispc
|
||||
ISPC_TARGETS=sse2,sse4,avx
|
||||
ISPC_IA_TARGETS=sse2,sse4,avx
|
||||
ISPC_ARM_TARGETS=neon
|
||||
|
||||
include ../common.mk
|
||||
|
||||
@@ -4,16 +4,30 @@ TASK_LIB=-lpthread
|
||||
TASK_OBJ=objs/tasksys.o
|
||||
|
||||
CXX=g++
|
||||
CXXFLAGS=-Iobjs/ -O2 -m64
|
||||
CXXFLAGS=-Iobjs/ -O2
|
||||
CC=gcc
|
||||
CCFLAGS=-Iobjs/ -O2 -m64
|
||||
CCFLAGS=-Iobjs/ -O2
|
||||
|
||||
LIBS=-lm $(TASK_LIB) -lstdc++
|
||||
ISPC=ispc -O2 --arch=x86-64 $(ISPC_FLAGS)
|
||||
ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \
|
||||
$(ISPC_SRC:.ispc=)_ispc_sse4.o $(ISPC_SRC:.ispc=)_ispc_avx.o)
|
||||
ISPC=ispc -O2 $(ISPC_FLAGS)
|
||||
ISPC_HEADER=objs/$(ISPC_SRC:.ispc=_ispc.h)
|
||||
|
||||
ARCH:=$(shell uname -m | sed -e s/x86_64/x86/ -e s/arm.*/arm/ -e s/sa110/arm/)
|
||||
|
||||
ifeq ($(ARCH),x86)
|
||||
ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=)_ispc.o $(ISPC_SRC:.ispc=)_ispc_sse2.o \
|
||||
$(ISPC_SRC:.ispc=)_ispc_sse4.o $(ISPC_SRC:.ispc=)_ispc_avx.o)
|
||||
ISPC_TARGETS=$(ISPC_IA_TARGETS)
|
||||
ISPC_FLAGS += --arch=x86-64
|
||||
CXXFLAGS += -m64
|
||||
CCFLAGS += -m64
|
||||
else ifeq ($(ARCH),arm)
|
||||
ISPC_OBJS=$(addprefix objs/, $(ISPC_SRC:.ispc=_ispc.o))
|
||||
ISPC_TARGETS=$(ISPC_ARM_TARGETS)
|
||||
else
|
||||
$(error Unknown architecture $(ARCH) from uname -m)
|
||||
endif
|
||||
|
||||
CPP_OBJS=$(addprefix objs/, $(CPP_SRC:.cpp=.o))
|
||||
CC_OBJS=$(addprefix objs/, $(CC_SRC:.c=.o))
|
||||
OBJS=$(CPP_OBJS) $(CC_OBJS) $(TASK_OBJ) $(ISPC_OBJS)
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
EXAMPLE=deferred_shading
|
||||
CPP_SRC=common.cpp main.cpp dynamic_c.cpp dynamic_cilk.cpp
|
||||
ISPC_SRC=kernels.ispc
|
||||
ISPC_TARGETS=sse2,sse4-x2,avx-x2
|
||||
ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2
|
||||
ISPC_ARM_TARGETS=neon
|
||||
ISPC_FLAGS=--opt=fast-math
|
||||
|
||||
include ../common.mk
|
||||
|
||||
@@ -3,6 +3,7 @@ EXAMPLE=gmres
|
||||
CPP_SRC=algorithm.cpp main.cpp matrix.cpp
|
||||
CC_SRC=mmio.c
|
||||
ISPC_SRC=matrix.ispc
|
||||
ISPC_TARGETS=sse2,sse4-x2,avx-x2
|
||||
ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2
|
||||
ISPC_ARM_TARGETS=neon
|
||||
|
||||
include ../common.mk
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
EXAMPLE=mandelbrot
|
||||
CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp
|
||||
ISPC_SRC=mandelbrot.ispc
|
||||
ISPC_TARGETS=sse2,sse4-x2,avx-x2
|
||||
ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2
|
||||
ISPC_ARM_TARGETS=neon
|
||||
|
||||
include ../common.mk
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
EXAMPLE=mandelbrot
|
||||
CPP_SRC=mandelbrot.cpp mandelbrot_serial.cpp
|
||||
ISPC_SRC=mandelbrot.ispc
|
||||
ISPC_TARGETS=sse2,sse4-x2,avx-x2
|
||||
ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2
|
||||
ISPC_ARM_TARGETS=neon
|
||||
|
||||
include ../common.mk
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
EXAMPLE=noise
|
||||
CPP_SRC=$(EXAMPLE).cpp $(EXAMPLE)_serial.cpp
|
||||
ISPC_SRC=noise.ispc
|
||||
ISPC_TARGETS=sse2,sse4,avx-x2
|
||||
ISPC_IA_TARGETS=sse2,sse4,avx-x2
|
||||
ISPC_ARM_TARGETS=neon
|
||||
|
||||
include ../common.mk
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
EXAMPLE=options
|
||||
CPP_SRC=options.cpp options_serial.cpp
|
||||
ISPC_SRC=options.ispc
|
||||
ISPC_TARGETS=sse2,sse4-x2,avx-x2
|
||||
ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2
|
||||
ISPC_ARM_TARGETS=neon
|
||||
|
||||
include ../common.mk
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
EXAMPLE=perbench
|
||||
CPP_SRC=perfbench.cpp perfbench_serial.cpp
|
||||
ISPC_SRC=perfbench.ispc
|
||||
ISPC_TARGETS=sse2,sse4,avx
|
||||
ISPC_IA_TARGETS=sse2,sse4,avx
|
||||
ISPC_ARM_TARGETS=neon
|
||||
|
||||
include ../common.mk
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
EXAMPLE=rt
|
||||
CPP_SRC=rt.cpp rt_serial.cpp
|
||||
ISPC_SRC=rt.ispc
|
||||
ISPC_TARGETS=sse2,sse4-x2,avx
|
||||
ISPC_IA_TARGETS=sse2,sse4-x2,avx
|
||||
ISPC_ARM_TARGETS=neon
|
||||
|
||||
include ../common.mk
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
EXAMPLE=sort
|
||||
CPP_SRC=sort.cpp sort_serial.cpp
|
||||
ISPC_SRC=sort.ispc
|
||||
ISPC_TARGETS=sse2,sse4-x2,avx
|
||||
ISPC_IA_TARGETS=sse2,sse4-x2,avx
|
||||
ISPC_ARM_TARGETS=neon
|
||||
#ISPC_FLAGS=-DDEBUG
|
||||
|
||||
include ../common.mk
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
EXAMPLE=stencil
|
||||
CPP_SRC=stencil.cpp stencil_serial.cpp
|
||||
ISPC_SRC=stencil.ispc
|
||||
ISPC_TARGETS=sse2,sse4-x2,avx-x2
|
||||
ISPC_IA_TARGETS=sse2,sse4-x2,avx-x2
|
||||
ISPC_ARM_TARGETS=neon
|
||||
|
||||
include ../common.mk
|
||||
|
||||
@@ -33,33 +33,47 @@
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __arm__
|
||||
#include <sys/time.h>
|
||||
// There's no easy way to get a hardware clock counter on ARM, so instead
|
||||
// we'll pretend it's a 1GHz processor and then compute pretend cycles
|
||||
// based on elapsed time from gettimeofday().
|
||||
// Stand-in for a hardware cycle counter: returns the time elapsed since
// the first call, expressed in nanoseconds (microseconds * 1000), which
// corresponds to cycles at the pretend 1GHz clock rate. The very first
// call only records the reference time and returns 0.
// NOTE(review): `bool` is used without a visible <stdbool.h> include;
// presumably this header is only ever compiled as C++ -- confirm.
// NOTE(review): the first-call state lives in unguarded function-local
// statics; looks like this assumes single-threaded use -- confirm.
__inline__ uint64_t rdtsc() {
|
||||
static bool first = true;
|
||||
static struct timeval tv_start;
|
||||
// First call: capture the reference timestamp and report zero elapsed.
if (first) {
|
||||
gettimeofday(&tv_start, NULL);
|
||||
first = false;
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv, NULL);
|
||||
// Turn tv into the elapsed time relative to tv_start, field by field.
tv.tv_sec -= tv_start.tv_sec;
|
||||
// NOTE(review): no borrow from tv_sec is done here, so tv_usec can end up
// negative; presumably the unsigned 64-bit sum below still wraps around to
// the right total -- confirm.
tv.tv_usec -= tv_start.tv_usec;
|
||||
// Elapsed microseconds scaled by 1000 (nanoseconds == cycles at 1GHz).
return (1000000ull * tv.tv_sec + tv.tv_usec) * 1000ull;
|
||||
}
|
||||
|
||||
#else // __arm__
|
||||
|
||||
#ifdef WIN32
|
||||
#include <windows.h>
|
||||
#define rdtsc __rdtsc
|
||||
#else
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif /* __cplusplus */
|
||||
__inline__ uint64_t rdtsc() {
|
||||
uint32_t low, high;
|
||||
#else // WIN32
|
||||
__inline__ uint64_t rdtsc() {
|
||||
uint32_t low, high;
|
||||
#ifdef __x86_64
|
||||
__asm__ __volatile__ (
|
||||
"xorl %%eax,%%eax \n cpuid"
|
||||
::: "%rax", "%rbx", "%rcx", "%rdx" );
|
||||
__asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid"
|
||||
::: "%rax", "%rbx", "%rcx", "%rdx" );
|
||||
#else
|
||||
__asm__ __volatile__ (
|
||||
"xorl %%eax,%%eax \n cpuid"
|
||||
::: "%eax", "%ebx", "%ecx", "%edx" );
|
||||
__asm__ __volatile__ ("xorl %%eax,%%eax \n cpuid"
|
||||
::: "%eax", "%ebx", "%ecx", "%edx" );
|
||||
#endif
|
||||
__asm__ __volatile__ (
|
||||
"rdtsc" : "=a" (low), "=d" (high));
|
||||
return (uint64_t)high << 32 | low;
|
||||
}
|
||||
#ifdef __cplusplus
|
||||
__asm__ __volatile__ ("rdtsc" : "=a" (low), "=d" (high));
|
||||
return (uint64_t)high << 32 | low;
|
||||
}
|
||||
#endif /* __cplusplus */
|
||||
#endif
|
||||
#endif // !WIN32
|
||||
#endif // !__arm__
|
||||
|
||||
static uint64_t start, end;
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
EXAMPLE=volume
|
||||
CPP_SRC=volume.cpp volume_serial.cpp
|
||||
ISPC_SRC=volume.ispc
|
||||
ISPC_TARGETS=sse2,sse4-x2,avx
|
||||
ISPC_IA_TARGETS=sse2,sse4-x2,avx
|
||||
ISPC_ARM_TARGETS=neon
|
||||
|
||||
include ../common.mk
|
||||
|
||||
Reference in New Issue
Block a user