diff --git a/Makefile b/Makefile index fc064dbd..98729bfc 100644 --- a/Makefile +++ b/Makefile @@ -122,8 +122,10 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -TARGETS=neon avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 sse2 sse2-x2 sse4 sse4-x2 \ - sse4-8 sse4-16 generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 +TARGETS=neon-32 neon-16 neon-8 \ + avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ + sse2 sse2-x2 sse4 sse4-x2 sse4-8 sse4-16 \ + generic-1 generic-4 generic-8 generic-16 generic-32 generic-64 # These files need to be compiled in two versions - 32 and 64 bits. BUILTINS_SRC_TARGET=$(addprefix builtins/target-, $(addsuffix .ll, $(TARGETS))) # These are files to be compiled in single version. diff --git a/builtins.cpp b/builtins.cpp index 08472623..e671a491 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -657,7 +657,9 @@ AddBitcodeToModule(const unsigned char *bitcode, int length, // the values for an ARM target. This maybe won't cause problems // in the generated code, since bulitins.c doesn't do anything too // complex w.r.t. struct layouts, etc. - if (g->target->getISA() != Target::NEON) + if (g->target->getISA() != Target::NEON32 && + g->target->getISA() != Target::NEON16 && + g->target->getISA() != Target::NEON8) #endif // !__arm__ { Assert(bcTriple.getArch() == llvm::Triple::UnknownArch || @@ -820,12 +822,30 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // Next, add the target's custom implementations of the various needed // builtin functions (e.g. __masked_store_32(), etc). switch (g->target->getISA()) { - case Target::NEON: { + case Target::NEON8: { if (runtime32) { - EXPORT_MODULE(builtins_bitcode_neon_32bit); + EXPORT_MODULE(builtins_bitcode_neon_8_32bit); } else { - EXPORT_MODULE(builtins_bitcode_neon_64bit); + EXPORT_MODULE(builtins_bitcode_neon_8_64bit); + } + break; + } + case Target::NEON16: { + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_neon_16_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_neon_16_64bit); + } + break; + } + case Target::NEON32: { + if (runtime32) { + EXPORT_MODULE(builtins_bitcode_neon_32_32bit); + } + else { + EXPORT_MODULE(builtins_bitcode_neon_32_64bit); } break; } diff --git a/builtins/target-neon-16.ll b/builtins/target-neon-16.ll new file mode 100644 index 00000000..fd15eb0b --- /dev/null +++ b/builtins/target-neon-16.ll @@ -0,0 +1,458 @@ +;; +;; target-neon-16.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. 
+;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`8') +define(`MASK',`i16') + +include(`util.m4') +include(`target-neon-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +define <8 x float> @__half_to_float_varying(<8 x i16> %v) nounwind readnone { + unary4to8conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v) + ret <8 x float> %r +} + +define <8 x i16> @__float_to_half_varying(<8 x float> %v) nounwind readnone { + unary4to8conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v) + ret <8 x i16> %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define <8 x float> @__round_varying_float(<8 x float>) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast <8 x float> %0 to <8 x i32> + %bitop.i.i = and <8 x i32> %float_to_int_bitcast.i.i.i.i, + + %bitop.i = xor <8 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %int_to_float_bitcast.i.i40.i, + + %binop21.i = fadd <8 x float> %binop.i, + + %float_to_int_bitcast.i.i.i = bitcast <8 x float> %binop21.i to <8 x i32> + %bitop31.i = xor <8 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop31.i to <8 x float> + ret <8 x float> %int_to_float_bitcast.i.i.i +} + +define <8 x float> @__floor_varying_float(<8 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind + %bincmp.i = fcmp ogt <8 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32> + %bitop.i = and <8 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <8 x float> %binop.i +} + +define <8 x float> @__ceil_varying_float(<8 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <8 x float> @__round_varying_float(<8 x float> %0) nounwind + %bincmp.i = fcmp olt <8 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <8 x i1> %bincmp.i to <8 x i32> + %bitop.i = and <8 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <8 x i32> %bitop.i to <8 x float> + %binop.i = fadd <8 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <8 x float> %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare @__round_varying_double() nounwind readnone +declare @__floor_varying_double() nounwind readnone +declare 
@__ceil_varying_double() nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__max_varying_float(, + ) nounwind readnone { + binary4to8(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1) + ret %r +} + +define @__min_varying_float(, + ) nounwind readnone { + binary4to8(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1) + ret %r +} + +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define @__min_varying_int32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1) + ret %r +} + +define @__max_varying_int32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1) + ret %r +} + +define @__min_varying_uint32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1) + ret %r +} + +define @__max_varying_uint32(, ) nounwind readnone { + binary4to8(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1) + ret %r +} + +;; sqrt/rsqrt/rcp + +declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rcp_varying_float( %d) nounwind readnone { + unary4to8(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d) + binary4to8(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0) + %x1 = fmul %x0, %x0_nr + binary4to8(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %d) nounwind readnone { + unary4to8(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d) + %x0_2 = fmul %x0, %x0 + binary4to8(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2) + %x1 = fmul %x0, %x0_nr + %x1_2 = fmul %x1, %x1 + binary4to8(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +define float @__rsqrt_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + %vr = call <8 x float> @__rsqrt_varying_float(<8 x float> %vs) + %r = extractelement <8 x float> %vr, i32 0 + ret float %r +} + +define float @__rcp_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <8 x i32> + %vr = call <8 x float> @__rcp_varying_float(<8 x float> %vs) + %r = extractelement <8 x float> %vr, i32 0 + ret float %r +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define @__sqrt_varying_float() nounwind readnone { + unary4to8(result, float, @llvm.sqrt.v4f32, %0) +;; this returns nan for v=0, which is undesirable.. 
+;; %rsqrt = call @__rsqrt_varying_float( %0) +;; %result = fmul <4 x float> %rsqrt, %0 + ret <8 x float> %result +} + +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) + +define @__sqrt_varying_double() nounwind readnone { + unary4to8(r, double, @llvm.sqrt.v4f64, %0) + ret %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reductions + +define i64 @__movmsk() nounwind readnone { + %and_mask = and %0, + + %v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %and_mask) + %v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4) + %va = extractelement <2 x i64> %v2, i32 0 + %vb = extractelement <2 x i64> %v2, i32 1 + %v = or i64 %va, %vb + ret i64 %v +} + +define i1 @__any() nounwind readnone alwaysinline { + v8tov4(MASK, %0, %v0123, %v4567) + %vor = or <4 x MASK> %v0123, %v4567 + %v0 = extractelement <4 x MASK> %vor, i32 0 + %v1 = extractelement <4 x MASK> %vor, i32 1 + %v2 = extractelement <4 x MASK> %vor, i32 2 + %v3 = extractelement <4 x MASK> %vor, i32 3 + %v01 = or MASK %v0, %v1 + %v23 = or MASK %v2, %v3 + %v = or MASK %v01, %v23 + %cmp = icmp ne MASK %v, 0 + ret i1 %cmp +} + +define i1 @__all() nounwind readnone alwaysinline { + v8tov4(MASK, %0, %v0123, %v4567) + %vand = and <4 x MASK> %v0123, %v4567 + %v0 = extractelement <4 x MASK> %vand, i32 0 + %v1 = extractelement <4 x MASK> %vand, i32 1 + %v2 = extractelement <4 x MASK> %vand, i32 2 + %v3 = extractelement <4 x MASK> %vand, i32 3 + %v01 = and MASK %v0, %v1 + %v23 = and MASK %v2, %v3 + %v = and MASK %v01, %v23 + %cmp = icmp ne MASK %v, 0 + ret i1 %cmp +} + +define i1 @__none() nounwind readnone alwaysinline { + %any = call i1 @__any( %0) + %none = icmp eq i1 %any, 0 + ret i1 %none +} + +;; $1: scalar type +;; $2: vector/vector reduce function (2 x -> ) +;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>) +;; $4: scalar reduce function + +define(`neon_reduce', ` + v8tov4($1, %0, %v0123, %v4567) + %v0123_8 = shufflevector <4 x $1> %v0123, <4 x $1> undef, + <8 x i32> + %v4567_8 = shufflevector <4 x $1> %v4567, <4 x $1> undef, + <8 x i32> + %vfirst = call <8 x $1> $2(<8 x $1> %v0123_8, <8 x $1> %v4567_8) + %vfirst_4 = shufflevector <8 x $1> %vfirst, <8 x $1> undef, + <4 x i32> + v4tov2($1, %vfirst_4, %v0, %v1) + %vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1) + %vh0 = extractelement <2 x $1> %vh, i32 0 + %vh1 = extractelement <2 x $1> %vh, i32 1 + %r = call $1 $4($1 %vh0, $1 %vh1) + ret $1 %r +') + +declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @add_f32(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define internal @__add_varying_float(, ) { + %r = fadd %0, %1 + ret %r +} + +define float @__reduce_add_float() nounwind readnone { + neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @min_f32(float, float) { + %cmp = fcmp olt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_min_float() nounwind readnone { + neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @max_f32(float, float) { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_max_float() nounwind 
readnone { + neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) +} + +declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone + +define i16 @__reduce_add_int8() nounwind readnone { + %a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %0) + %a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16) + %a0 = extractelement <2 x i32> %a32, i32 0 + %a1 = extractelement <2 x i32> %a32, i32 1 + %r = add i32 %a0, %a1 + %r16 = trunc i32 %r to i16 + ret i16 %r16 +} + +declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16() + +define i64 @__reduce_add_int16() nounwind readnone { + %a1 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16( %0) + %a2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a1) + %aa = extractelement <2 x i64> %a2, i32 0 + %ab = extractelement <2 x i64> %a2, i32 1 + %r = add i64 %aa, %ab + ret i64 %r +} + +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone + +define i64 @__reduce_add_int32() nounwind readnone { + v8tov4(i32, %0, %va, %vb) + %pa = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va) + %pb = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb) + %psum = add <2 x i64> %pa, %pb + %a0 = extractelement <2 x i64> %psum, i32 0 + %a1 = extractelement <2 x i64> %psum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_si32(i32, i32) { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_int32() nounwind readnone { + neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_si32(i32, i32) { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_int32() nounwind readnone { + neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_ui32(i32, i32) { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_uint32() nounwind readnone { + neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpmins.v2i32, @min_ui32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_ui32(i32, i32) { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_uint32() nounwind readnone { + neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxs.v2i32, @max_ui32) +} + +define double @__reduce_add_double() nounwind readnone { + v8tov2(double, %0, %v0, %v1, %v2, %v3) + %v01 = fadd <2 x double> %v0, %v1 + %v23 = fadd <2 x double> %v2, %v3 + %sum = fadd <2 x double> %v01, %v23 + %e0 = extractelement <2 x double> %sum, i32 0 + %e1 = extractelement <2 x double> %sum, i32 1 + %m = fadd double %e0, %e1 + ret double %m +} + +define double @__reduce_min_double() nounwind readnone { + reduce8(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double() nounwind readnone { + reduce8(double, @__max_varying_double, @__max_uniform_double) +} + +define i64 
@__reduce_add_int64() nounwind readnone { + v8tov2(i64, %0, %v0, %v1, %v2, %v3) + %v01 = add <2 x i64> %v0, %v1 + %v23 = add <2 x i64> %v2, %v3 + %sum = add <2 x i64> %v01, %v23 + %e0 = extractelement <2 x i64> %sum, i32 0 + %e1 = extractelement <2 x i64> %sum, i32 1 + %m = add i64 %e0, %e1 + ret i64 %m +} + +define i64 @__reduce_min_int64() nounwind readnone { + reduce8(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64() nounwind readnone { + reduce8(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64() nounwind readnone { + reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64() nounwind readnone { + reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64) +} diff --git a/builtins/target-neon.ll b/builtins/target-neon-32.ll similarity index 62% rename from builtins/target-neon.ll rename to builtins/target-neon-32.ll index fbeac352..1f8003d7 100644 --- a/builtins/target-neon.ll +++ b/builtins/target-neon-32.ll @@ -1,5 +1,5 @@ ;; -;; target-neon.ll +;; target-neon-32.ll ;; ;; Copyright(c) 2012-2013 Matt Pharr ;; Copyright(c) 2013 Google, Inc. @@ -34,52 +34,20 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32" - define(`WIDTH',`4') - define(`MASK',`i32') include(`util.m4') - -stdlib_core() -scans() -reduce_equal(WIDTH) -rdrand_decls() -define_shuffles() -aossoa() -ctlztz() +include(`target-neon-common.ll') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; half conversion routines -declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone -declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone - -define float @__half_to_float_uniform(i16 %v) nounwind readnone { - %v1 = bitcast i16 %v to <1 x i16> - %vec = shufflevector <1 x i16> %v1, <1 x i16> undef, - <4 x i32> - %h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec) - %r = extractelement <4 x float> %h, i32 0 - ret float %r -} - define <4 x float> @__half_to_float_varying(<4 x i16> %v) nounwind readnone { %r = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %v) ret <4 x float> %r } -define i16 @__float_to_half_uniform(float %v) nounwind readnone { - %v1 = bitcast float %v to <1 x float> - %vec = shufflevector <1 x float> %v1, <1 x float> undef, - <4 x i32> - %h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec) - %r = extractelement <4 x i16> %h, i32 0 - ret i16 %r -} - - define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone { %r = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %v) ret <4 x i16> %r @@ -88,48 +56,11 @@ define <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; math -define void @__fastmath() nounwind { - ret void -} - ;; round/floor/ceil ;; FIXME: grabbed these from the sse2 target, which does not have native ;; instructions for these. Is there a better approach for NEON? 
-define float @__round_uniform_float(float) nounwind readonly alwaysinline { - %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32 - %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648 - %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i - %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06 - %binop21.i = fadd float %binop.i, -8.388608e+06 - %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32 - %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float - ret float %int_to_float_bitcast.i.i.i -} - -define float @__floor_uniform_float(float) nounwind readonly alwaysinline { - %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind - %bincmp.i = fcmp ogt float %calltmp.i, %0 - %selectexpr.i = sext i1 %bincmp.i to i32 - %bitop.i = and i32 %selectexpr.i, -1082130432 - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i - ret float %binop.i -} - -define float @__ceil_uniform_float(float) nounwind readonly alwaysinline { - %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind - %bincmp.i = fcmp olt float %calltmp.i, %0 - %selectexpr.i = sext i1 %bincmp.i to i32 - %bitop.i = and i32 %selectexpr.i, 1065353216 - %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float - %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i - ret float %binop.i -} - define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline { %float_to_int_bitcast.i.i.i.i = bitcast <4 x float> %0 to <4 x i32> %bitop.i.i = and <4 x i32> %float_to_int_bitcast.i.i.i.i, @@ -164,10 +95,6 @@ define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysin } ;; FIXME: rounding doubles and double vectors needs to be implemented -declare double @__round_uniform_double(double) nounwind readnone -declare double @__floor_uniform_double(double) nounwind readnone -declare double @__ceil_uniform_double(double) nounwind readnone - declare @__round_varying_double() nounwind readnone declare @__floor_varying_double() nounwind readnone declare @__ceil_varying_double() nounwind readnone @@ -175,78 +102,6 @@ declare @__ceil_varying_double() nounwind readn ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; min/max -define float @__max_uniform_float(float, float) nounwind readnone { - %cmp = fcmp ugt float %0, %1 - %r = select i1 %cmp, float %0, float %1 - ret float %r -} - -define float @__min_uniform_float(float, float) nounwind readnone { - %cmp = fcmp ult float %0, %1 - %r = select i1 %cmp, float %0, float %1 - ret float %r -} - -define i32 @__min_uniform_int32(i32, i32) nounwind readnone { - %cmp = icmp slt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__max_uniform_int32(i32, i32) nounwind readnone { - %cmp = icmp sgt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__min_uniform_uint32(i32, i32) nounwind readnone { - %cmp = icmp ult i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i32 @__max_uniform_uint32(i32, i32) nounwind readnone { - %cmp = icmp ugt i32 %0, %1 - %r = select i1 %cmp, i32 %0, i32 %1 - ret i32 %r -} - -define i64 @__min_uniform_int64(i64, i64) nounwind readnone { - %cmp = icmp slt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__max_uniform_int64(i64, 
i64) nounwind readnone { - %cmp = icmp sgt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__min_uniform_uint64(i64, i64) nounwind readnone { - %cmp = icmp ult i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define i64 @__max_uniform_uint64(i64, i64) nounwind readnone { - %cmp = icmp ugt i64 %0, %1 - %r = select i1 %cmp, i64 %0, i64 %1 - ret i64 %r -} - -define double @__min_uniform_double(double, double) nounwind readnone { - %cmp = fcmp olt double %0, %1 - %r = select i1 %cmp, double %0, double %1 - ret double %r -} - -define double @__max_uniform_double(double, double) nounwind readnone { - %cmp = fcmp ogt double %0, %1 - %r = select i1 %cmp, double %0, double %1 - ret double %r -} - declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone @@ -287,44 +142,6 @@ define @__max_varying_uint32(, ) nounwin ret <4 x i32> %r } -define @__min_varying_int64(, ) nounwind readnone { - %m = icmp slt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_int64(, ) nounwind readnone { - %m = icmp sgt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__min_varying_uint64(, ) nounwind readnone { - %m = icmp ult %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_uint64(, ) nounwind readnone { - %m = icmp ugt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__min_varying_double(, - ) nounwind readnone { - %m = fcmp olt %0, %1 - %r = select %m, %0, %1 - ret %r -} - -define @__max_varying_double(, - ) nounwind readnone { - %m = fcmp ogt %0, %1 - %r = select %m, %0, %1 - ret %r -} - ;; sqrt/rsqrt/rcp declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone @@ -371,13 +188,6 @@ define float @__rcp_uniform_float(float) nounwind readnone { ret float %r } -declare float @llvm.sqrt.f32(float) - -define float @__sqrt_uniform_float(float) nounwind readnone { - %r = call float @llvm.sqrt.f32(float %0) - ret float %r -} - declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) define @__sqrt_varying_float() nounwind readnone { @@ -388,13 +198,6 @@ define @__sqrt_varying_float() nounwind readnone ret <4 x float> %result } -declare double @llvm.sqrt.f64(double) - -define double @__sqrt_uniform_double(double) nounwind readnone { - %r = call double @llvm.sqrt.f64(double %0) - ret double %r -} - declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) define @__sqrt_varying_double() nounwind readnone { @@ -402,21 +205,6 @@ define @__sqrt_varying_double() nounwind readno ret <4 x double> %r } -;; bit ops - -declare i32 @llvm.ctpop.i32(i32) nounwind readnone -declare i64 @llvm.ctpop.i64(i64) nounwind readnone - -define i32 @__popcnt_int32(i32) nounwind readnone { - %v = call i32 @llvm.ctpop.i32(i32 %0) - ret i32 %v -} - -define i64 @__popcnt_int64(i64) nounwind readnone { - %v = call i64 @llvm.ctpop.i64(i64 %0) - ret i64 %v -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions @@ -638,92 +426,3 @@ define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone { define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone { reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64) } - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; unaligned loads/loads+broadcasts - -masked_load(i8, 1) -masked_load(i16, 2) -masked_load(i32, 4) -masked_load(float, 4) -masked_load(i64, 8) -masked_load(double, 8) - -gen_masked_store(i8) 
-gen_masked_store(i16) -gen_masked_store(i32) -gen_masked_store(i64) -masked_store_float_double() - -define void @__masked_store_blend_i8(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i8> %new, <4 x i8> %old - store %result, * %ptr - ret void -} - -define void @__masked_store_blend_i16(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i16> %new, <4 x i16> %old - store %result, * %ptr - ret void -} - -define void @__masked_store_blend_i32(* nocapture %ptr, %new, - %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i32> %new, <4 x i32> %old - store %result, * %ptr - ret void -} - -define void @__masked_store_blend_i64(* nocapture %ptr, - %new, %mask) nounwind alwaysinline { - %old = load * %ptr - %mask1 = trunc <4 x MASK> %mask to <4 x i1> - %result = select <4 x i1> %mask1, <4 x i64> %new, <4 x i64> %old - store %result, * %ptr - ret void -} - -;; yuck. We need declarations of these, even though we shouldnt ever -;; actually generate calls to them for the NEON target... - -declare @__svml_sin() -declare @__svml_cos() -declare void @__svml_sincos(, *, *) -declare @__svml_tan() -declare @__svml_atan() -declare @__svml_atan2(, ) -declare @__svml_exp() -declare @__svml_log() -declare @__svml_pow(, ) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; gather - -gen_gather_factored(i8) -gen_gather_factored(i16) -gen_gather_factored(i32) -gen_gather_factored(float) -gen_gather_factored(i64) -gen_gather_factored(double) - -gen_scatter(i8) -gen_scatter(i16) -gen_scatter(i32) -gen_scatter(float) -gen_scatter(i64) -gen_scatter(double) - -packed_load_and_store(4) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; prefetch - -define_prefetches() diff --git a/builtins/target-neon-8.ll b/builtins/target-neon-8.ll new file mode 100644 index 00000000..eb65f224 --- /dev/null +++ b/builtins/target-neon-8.ll @@ -0,0 +1,508 @@ +;; +;; target-neon-8.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +define(`WIDTH',`16') +define(`MASK',`i8') + +include(`util.m4') +include(`target-neon-common.ll') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +define <16 x float> @__half_to_float_varying(<16 x i16> %v) nounwind readnone { + unary4to16conv(r, i16, float, @llvm.arm.neon.vcvthf2fp, %v) + ret <16 x float> %r +} + +define <16 x i16> @__float_to_half_varying(<16 x float> %v) nounwind readnone { + unary4to16conv(r, float, i16, @llvm.arm.neon.vcvtfp2hf, %v) + ret <16 x i16> %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define <16 x float> @__round_varying_float(<16 x float>) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast <16 x float> %0 to <16 x i32> + %bitop.i.i = and <16 x i32> %float_to_int_bitcast.i.i.i.i, + + %bitop.i = xor <16 x i32> %float_to_int_bitcast.i.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i40.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %int_to_float_bitcast.i.i40.i, + + %binop21.i = fadd <16 x float> %binop.i, + + %float_to_int_bitcast.i.i.i = bitcast <16 x float> %binop21.i to <16 x i32> + %bitop31.i = xor <16 x i32> %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop31.i to <16 x float> + ret <16 x float> %int_to_float_bitcast.i.i.i +} + +define <16 x float> @__floor_varying_float(<16 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind + %bincmp.i = fcmp ogt <16 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32> + %bitop.i = and <16 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <16 x float> %binop.i +} + +define <16 x float> @__ceil_varying_float(<16 x float>) nounwind readonly alwaysinline { + %calltmp.i = tail call <16 x float> @__round_varying_float(<16 x float> %0) nounwind + %bincmp.i = fcmp olt <16 x float> %calltmp.i, %0 + %val_to_boolvec32.i = sext <16 x i1> %bincmp.i to <16 x i32> + %bitop.i = and <16 x i32> %val_to_boolvec32.i, + + %int_to_float_bitcast.i.i.i = bitcast <16 x i32> %bitop.i to <16 x float> + %binop.i = fadd <16 x float> %calltmp.i, %int_to_float_bitcast.i.i.i + ret <16 x float> %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare @__round_varying_double() nounwind readnone +declare @__floor_varying_double() nounwind readnone +declare @__ceil_varying_double() nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind 
readnone +declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__max_varying_float(, + ) nounwind readnone { + binary4to16(r, float, @llvm.arm.neon.vmaxs.v4f32, %0, %1) + ret %r +} + +define @__min_varying_float(, + ) nounwind readnone { + binary4to16(r, float, @llvm.arm.neon.vmins.v4f32, %0, %1) + ret %r +} + +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define @__min_varying_int32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmins.v4i32, %0, %1) + ret %r +} + +define @__max_varying_int32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmaxs.v4i32, %0, %1) + ret %r +} + +define @__min_varying_uint32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vminu.v4i32, %0, %1) + ret %r +} + +define @__max_varying_uint32(, ) nounwind readnone { + binary4to16(r, i32, @llvm.arm.neon.vmaxu.v4i32, %0, %1) + ret %r +} + +;; sqrt/rsqrt/rcp + +declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rcp_varying_float( %d) nounwind readnone { + unary4to16(x0, float, @llvm.arm.neon.vrecpe.v4f32, %d) + binary4to16(x0_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x0) + %x1 = fmul %x0, %x0_nr + binary4to16(x1_nr, float, @llvm.arm.neon.vrecps.v4f32, %d, %x1) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define @__rsqrt_varying_float( %d) nounwind readnone { + unary4to16(x0, float, @llvm.arm.neon.vrsqrte.v4f32, %d) + %x0_2 = fmul %x0, %x0 + binary4to16(x0_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x0_2) + %x1 = fmul %x0, %x0_nr + %x1_2 = fmul %x1, %x1 + binary4to16(x1_nr, float, @llvm.arm.neon.vrsqrts.v4f32, %d, %x1_2) + %x2 = fmul %x1, %x1_nr + ret %x2 +} + +define float @__rsqrt_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <16 x i32> + %vr = call <16 x float> @__rsqrt_varying_float(<16 x float> %vs) + %r = extractelement <16 x float> %vr, i32 0 + ret float %r +} + +define float @__rcp_uniform_float(float) nounwind readnone { + %v1 = bitcast float %0 to <1 x float> + %vs = shufflevector <1 x float> %v1, <1 x float> undef, + <16 x i32> + %vr = call <16 x float> @__rcp_varying_float(<16 x float> %vs) + %r = extractelement <16 x float> %vr, i32 0 + ret float %r +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) + +define @__sqrt_varying_float() nounwind readnone { + unary4to16(result, float, @llvm.sqrt.v4f32, %0) +;; this returns nan for v=0, which is undesirable.. 
+;; %rsqrt = call @__rsqrt_varying_float( %0) +;; %result = fmul <4 x float> %rsqrt, %0 + ret <16 x float> %result +} + +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) + +define @__sqrt_varying_double() nounwind readnone { + unary4to16(r, double, @llvm.sqrt.v4f64, %0) + ret %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; reductions + +define i64 @__movmsk() nounwind readnone { + %and_mask = and %0, + + %v8 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %and_mask) + %v4 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %v8) + %v2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %v4) + %va = extractelement <2 x i64> %v2, i32 0 + %vb = extractelement <2 x i64> %v2, i32 1 + %vbshift = shl i64 %vb, 8 + %v = or i64 %va, %vbshift + ret i64 %v +} + +define i1 @__any() nounwind readnone alwaysinline { + v16tov8(MASK, %0, %v8a, %v8b) + %vor8 = or <8 x MASK> %v8a, %v8b + %v16 = sext <8 x i8> %vor8 to <8 x i16> + v8tov4(i16, %v16, %v16a, %v16b) + %vor16 = or <4 x i16> %v16a, %v16b + %v32 = sext <4 x i16> %vor16 to <4 x i32> + v4tov2(i32, %v32, %v32a, %v32b) + %vor32 = or <2 x i32> %v32a, %v32b + %v0 = extractelement <2 x i32> %vor32, i32 0 + %v1 = extractelement <2 x i32> %vor32, i32 1 + %v = or i32 %v0, %v1 + %cmp = icmp ne i32 %v, 0 + ret i1 %cmp +} + +define i1 @__all() nounwind readnone alwaysinline { + v16tov8(MASK, %0, %v8a, %v8b) + %vand8 = and <8 x MASK> %v8a, %v8b + %v16 = sext <8 x i8> %vand8 to <8 x i16> + v8tov4(i16, %v16, %v16a, %v16b) + %vand16 = and <4 x i16> %v16a, %v16b + %v32 = sext <4 x i16> %vand16 to <4 x i32> + v4tov2(i32, %v32, %v32a, %v32b) + %vand32 = and <2 x i32> %v32a, %v32b + %v0 = extractelement <2 x i32> %vand32, i32 0 + %v1 = extractelement <2 x i32> %vand32, i32 1 + %v = and i32 %v0, %v1 + %cmp = icmp ne i32 %v, 0 + ret i1 %cmp +} + +define i1 @__none() nounwind readnone alwaysinline { + %any = call i1 @__any( %0) + %none = icmp eq i1 %any, 0 + ret i1 %none +} + +;; $1: scalar type +;; $2: vector/vector reduce function (2 x -> ) +;; $3: pairwise vector reduce function (2 x <2 x vec> -> <2 x vec>) +;; $4: scalar reduce function + +define(`neon_reduce', ` + v16tov8($1, %0, %va, %vb) + %va_16 = shufflevector <8 x $1> %va, <8 x $1> undef, + <16 x i32> + %vb_16 = shufflevector <8 x $1> %vb, <8 x $1> undef, + <16 x i32> + %v8 = call <16 x $1> $2(<16 x $1> %va_16, <16 x $1> %vb_16) + + %v8a = shufflevector <16 x $1> %v8, <16 x $1> undef, + <16 x i32> + %v8b = shufflevector <16 x $1> %v8, <16 x $1> undef, + <16 x i32> + + %v4 = call <16 x $1> $2(<16 x $1> %v8a, <16 x $1> %v8b) + + %vfirst_4 = shufflevector <16 x $1> %v4, <16 x $1> undef, + <4 x i32> + v4tov2($1, %vfirst_4, %v0, %v1) + %vh = call <2 x $1> $3(<2 x $1> %v0, <2 x $1> %v1) + %vh0 = extractelement <2 x $1> %vh, i32 0 + %vh1 = extractelement <2 x $1> %vh, i32 1 + %r = call $1 $4($1 %vh0, $1 %vh1) + ret $1 %r +') + +declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @add_f32(float, float) { + %r = fadd float %0, %1 + ret float %r +} + +define internal @__add_varying_float(, ) { + %r = fadd %0, %1 + ret %r +} + +define float @__reduce_add_float() nounwind readnone { + neon_reduce(float, @__add_varying_float, @llvm.arm.neon.vpadd.v2f32, @add_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @min_f32(float, float) { + %cmp = fcmp olt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret 
float %r +} + +define float @__reduce_min_float() nounwind readnone { + neon_reduce(float, @__min_varying_float, @llvm.arm.neon.vpmins.v2f32, @min_f32) +} + +declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define internal float @max_f32(float, float) { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__reduce_max_float() nounwind readnone { + neon_reduce(float, @__max_varying_float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32) +} + +declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone + +define i64 @__reduce_add_int8() nounwind readnone { + %a16 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %0) + %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %a16) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32) + %a0 = extractelement <2 x i64> %a64, i32 0 + %a1 = extractelement <2 x i64> %a64, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +define i64 @__reduce_add_int16() nounwind readnone { + v16tov8(i16, %0, %va, %vb) + %a32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %va) + %b32 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %vb) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %a32) + %b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %b32) + %sum = add <2 x i64> %a64, %b64 + %a0 = extractelement <2 x i64> %sum, i32 0 + %a1 = extractelement <2 x i64> %sum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +define i64 @__reduce_add_int32() nounwind readnone { + v16tov4(i32, %0, %va, %vb, %vc, %vd) + %a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %va) + %b64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vb) + %c64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vc) + %d64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %vd) + %ab = add <2 x i64> %a64, %b64 + %cd = add <2 x i64> %c64, %d64 + %sum = add <2 x i64> %ab, %cd + %a0 = extractelement <2 x i64> %sum, i32 0 + %a1 = extractelement <2 x i64> %sum, i32 1 + %r = add i64 %a0, %a1 + ret i64 %r +} + +declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_si32(i32, i32) { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_int32() nounwind readnone { + neon_reduce(i32, @__min_varying_int32, @llvm.arm.neon.vpmins.v2i32, @min_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @max_si32(i32, i32) { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_int32() nounwind readnone { + neon_reduce(i32, @__max_varying_int32, @llvm.arm.neon.vpmaxs.v2i32, @max_si32) +} + +declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 @min_ui32(i32, i32) { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_min_uint32() nounwind readnone { + neon_reduce(i32, @__min_varying_uint32, @llvm.arm.neon.vpmins.v2i32, @min_ui32) +} + +declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +define internal i32 
@max_ui32(i32, i32) { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__reduce_max_uint32() nounwind readnone { + neon_reduce(i32, @__max_varying_uint32, @llvm.arm.neon.vpmaxs.v2i32, @max_ui32) +} + +define internal double @__add_uniform_double(double, double) { + %r = fadd double %0, %1 + ret double %r +} + +define internal @__add_varying_double(, ) { + %r = fadd %0, %1 + ret %r +} + +define double @__reduce_add_double() nounwind readnone { + reduce16(double, @__add_varying_double, @__add_uniform_double) +} + +define double @__reduce_min_double() nounwind readnone { + reduce16(double, @__min_varying_double, @__min_uniform_double) +} + +define double @__reduce_max_double() nounwind readnone { + reduce16(double, @__max_varying_double, @__max_uniform_double) +} + +define internal i64 @__add_uniform_int64(i64, i64) { + %r = add i64 %0, %1 + ret i64 %r +} + +define internal @__add_varying_int64(, ) { + %r = add %0, %1 + ret %r +} + +define i64 @__reduce_add_int64() nounwind readnone { + reduce16(i64, @__add_varying_int64, @__add_uniform_int64) +} + +define i64 @__reduce_min_int64() nounwind readnone { + reduce16(i64, @__min_varying_int64, @__min_uniform_int64) +} + +define i64 @__reduce_max_int64() nounwind readnone { + reduce16(i64, @__max_varying_int64, @__max_uniform_int64) +} + +define i64 @__reduce_min_uint64() nounwind readnone { + reduce16(i64, @__min_varying_uint64, @__min_uniform_uint64) +} + +define i64 @__reduce_max_uint64() nounwind readnone { + reduce16(i64, @__max_varying_uint64, @__max_uniform_uint64) +} diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll new file mode 100644 index 00000000..696b0748 --- /dev/null +++ b/builtins/target-neon-common.ll @@ -0,0 +1,351 @@ +;; +;; target-neon-common.ll +;; +;; Copyright(c) 2013 Google, Inc. +;; +;; All rights reserved. +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are +;; met: +;; +;; * Redistributions of source code must retain the above copyright +;; notice, this list of conditions and the following disclaimer. +;; +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; +;; * Neither the name of Matt Pharr nor the names of its +;; contributors may be used to endorse or promote products derived from +;; this software without specific prior written permission. +;; +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +target datalayout = "e-p:32:32:32-S32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f16:16:16-f32:32:32-f64:32:64-f128:128:128-v64:32:64-v128:32:128-a0:0:64-n32" + +stdlib_core() +scans() +reduce_equal(WIDTH) +rdrand_decls() +define_shuffles() +aossoa() +ctlztz() + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; half conversion routines + +declare <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16>) nounwind readnone + +define float @__half_to_float_uniform(i16 %v) nounwind readnone { + %v1 = bitcast i16 %v to <1 x i16> + %vec = shufflevector <1 x i16> %v1, <1 x i16> undef, + <4 x i32> + %h = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> %vec) + %r = extractelement <4 x float> %h, i32 0 + ret float %r +} + +define i16 @__float_to_half_uniform(float %v) nounwind readnone { + %v1 = bitcast float %v to <1 x float> + %vec = shufflevector <1 x float> %v1, <1 x float> undef, + <4 x i32> + %h = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> %vec) + %r = extractelement <4 x i16> %h, i32 0 + ret i16 %r +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; math + +define void @__fastmath() nounwind { + ret void +} + +;; round/floor/ceil + +;; FIXME: grabbed these from the sse2 target, which does not have native +;; instructions for these. Is there a better approach for NEON? + +define float @__round_uniform_float(float) nounwind readonly alwaysinline { + %float_to_int_bitcast.i.i.i.i = bitcast float %0 to i32 + %bitop.i.i = and i32 %float_to_int_bitcast.i.i.i.i, -2147483648 + %bitop.i = xor i32 %bitop.i.i, %float_to_int_bitcast.i.i.i.i + %int_to_float_bitcast.i.i40.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %int_to_float_bitcast.i.i40.i, 8.388608e+06 + %binop21.i = fadd float %binop.i, -8.388608e+06 + %float_to_int_bitcast.i.i.i = bitcast float %binop21.i to i32 + %bitop31.i = xor i32 %float_to_int_bitcast.i.i.i, %bitop.i.i + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop31.i to float + ret float %int_to_float_bitcast.i.i.i +} + +define float @__floor_uniform_float(float) nounwind readonly alwaysinline { + %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind + %bincmp.i = fcmp ogt float %calltmp.i, %0 + %selectexpr.i = sext i1 %bincmp.i to i32 + %bitop.i = and i32 %selectexpr.i, -1082130432 + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i + ret float %binop.i +} + +define float @__ceil_uniform_float(float) nounwind readonly alwaysinline { + %calltmp.i = tail call float @__round_uniform_float(float %0) nounwind + %bincmp.i = fcmp olt float %calltmp.i, %0 + %selectexpr.i = sext i1 %bincmp.i to i32 + %bitop.i = and i32 %selectexpr.i, 1065353216 + %int_to_float_bitcast.i.i.i = bitcast i32 %bitop.i to float + %binop.i = fadd float %calltmp.i, %int_to_float_bitcast.i.i.i + ret float %binop.i +} + +;; FIXME: rounding doubles and double vectors needs to be implemented +declare double @__round_uniform_double(double) nounwind readnone +declare double @__floor_uniform_double(double) nounwind readnone +declare double @__ceil_uniform_double(double) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; min/max + +define float @__max_uniform_float(float, float) nounwind readnone { + %cmp = fcmp ugt float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define float @__min_uniform_float(float, 
float) nounwind readnone { + %cmp = fcmp ult float %0, %1 + %r = select i1 %cmp, float %0, float %1 + ret float %r +} + +define i32 @__min_uniform_int32(i32, i32) nounwind readnone { + %cmp = icmp slt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__max_uniform_int32(i32, i32) nounwind readnone { + %cmp = icmp sgt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__min_uniform_uint32(i32, i32) nounwind readnone { + %cmp = icmp ult i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i32 @__max_uniform_uint32(i32, i32) nounwind readnone { + %cmp = icmp ugt i32 %0, %1 + %r = select i1 %cmp, i32 %0, i32 %1 + ret i32 %r +} + +define i64 @__min_uniform_int64(i64, i64) nounwind readnone { + %cmp = icmp slt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__max_uniform_int64(i64, i64) nounwind readnone { + %cmp = icmp sgt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__min_uniform_uint64(i64, i64) nounwind readnone { + %cmp = icmp ult i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define i64 @__max_uniform_uint64(i64, i64) nounwind readnone { + %cmp = icmp ugt i64 %0, %1 + %r = select i1 %cmp, i64 %0, i64 %1 + ret i64 %r +} + +define double @__min_uniform_double(double, double) nounwind readnone { + %cmp = fcmp olt double %0, %1 + %r = select i1 %cmp, double %0, double %1 + ret double %r +} + +define double @__max_uniform_double(double, double) nounwind readnone { + %cmp = fcmp ogt double %0, %1 + %r = select i1 %cmp, double %0, double %1 + ret double %r +} + +define @__min_varying_int64(, ) nounwind readnone { + %m = icmp slt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_int64(, ) nounwind readnone { + %m = icmp sgt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__min_varying_uint64(, ) nounwind readnone { + %m = icmp ult %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_uint64(, ) nounwind readnone { + %m = icmp ugt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__min_varying_double(, + ) nounwind readnone { + %m = fcmp olt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +define @__max_varying_double(, + ) nounwind readnone { + %m = fcmp ogt %0, %1 + %r = select %m, %0, %1 + ret %r +} + +;; sqrt/rsqrt/rcp + +declare float @llvm.sqrt.f32(float) + +define float @__sqrt_uniform_float(float) nounwind readnone { + %r = call float @llvm.sqrt.f32(float %0) + ret float %r +} + +declare double @llvm.sqrt.f64(double) + +define double @__sqrt_uniform_double(double) nounwind readnone { + %r = call double @llvm.sqrt.f64(double %0) + ret double %r +} + +;; bit ops + +declare i32 @llvm.ctpop.i32(i32) nounwind readnone +declare i64 @llvm.ctpop.i64(i64) nounwind readnone + +define i32 @__popcnt_int32(i32) nounwind readnone { + %v = call i32 @llvm.ctpop.i32(i32 %0) + ret i32 %v +} + +define i64 @__popcnt_int64(i64) nounwind readnone { + %v = call i64 @llvm.ctpop.i64(i64 %0) + ret i64 %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unaligned loads/loads+broadcasts + +masked_load(i8, 1) +masked_load(i16, 2) +masked_load(i32, 4) +masked_load(float, 4) +masked_load(i64, 8) +masked_load(double, 8) + +gen_masked_store(i8) +gen_masked_store(i16) +gen_masked_store(i32) +gen_masked_store(i64) +masked_store_float_double() + +define void @__masked_store_blend_i8(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc 
%mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i16(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i32(* nocapture %ptr, %new, + %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +define void @__masked_store_blend_i64(* nocapture %ptr, + %new, %mask) nounwind alwaysinline { + %old = load * %ptr + %mask1 = trunc %mask to + %result = select %mask1, %new, %old + store %result, * %ptr + ret void +} + +;; yuck. We need declarations of these, even though we shouldnt ever +;; actually generate calls to them for the NEON target... + +declare @__svml_sin() +declare @__svml_cos() +declare void @__svml_sincos(, *, *) +declare @__svml_tan() +declare @__svml_atan() +declare @__svml_atan2(, ) +declare @__svml_exp() +declare @__svml_log() +declare @__svml_pow(, ) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) + +gen_scatter(i8) +gen_scatter(i16) +gen_scatter(i32) +gen_scatter(float) +gen_scatter(i64) +gen_scatter(double) + +packed_load_and_store(4) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; prefetch + +define_prefetches() diff --git a/builtins/util.m4 b/builtins/util.m4 index ee45ebc7..1f85e2cc 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -49,6 +49,53 @@ define(`MASK_HIGH_BIT_ON', ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; vector assembly and deconstruction utilities +;; split 8-wide vector into 2 4-wide vectors +;; +;; $1: vector element type +;; $2: 8-wide vector +;; $3: first 4-wide vector +;; $4: second 4-wide vector + +define(`v8tov4', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, + <4 x i32> +') + +define(`v16tov8', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, + <8 x i32> +') + +;; 4-wide into 2 2-wide +;; args as above +;; + +define(`v4tov2', ` + $3 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> + $4 = shufflevector <4 x $1> $2, <4 x $1> undef, <2 x i32> +') + +define(`v8tov2', ` + $3 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $4 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $5 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> + $6 = shufflevector <8 x $1> $2, <8 x $1> undef, <2 x i32> +') + +define(`v16tov4', ` + $3 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $4 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $5 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> + $6 = shufflevector <16 x $1> $2, <16 x $1> undef, <4 x i32> +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Helper macro for calling various SSE instructions for scalar values ;; but where the instruction takes a vector parameter. 
@@ -156,10 +203,7 @@ define(`reduce16', `
 ;; the final reduction
 
 define(`reduce8by4', `
-  %v1 = shufflevector <8 x $1> %0, <8 x $1> undef,
-        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %v2 = shufflevector <8 x $1> %0, <8 x $1> undef,
-        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  v8tov4($1, %0, %v1, %v2)
   %m1 = call <4 x $1> $2(<4 x $1> %v1, <4 x $1> %v2)
   %v3 = shufflevector <4 x $1> %m1, <4 x $1> undef,
         <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@@ -266,30 +310,66 @@ define(`binary2to4', `
 ;; $4: 8-wide operand value
 
 define(`unary4to8', `
-  %$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0)
-  %$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1)
-  %$1 = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1,
+  %__$1_0 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0)
+  %__$1_1 = shufflevector <8 x $2> $4, <8 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1)
+  %$1 = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+'
+)
+
+;; $1: name of variable into which the final result should go
+;; $2: scalar type of the input vector elements
+;; $3: scalar type of the result vector elements
+;; $4: 4-wide unary vector function to apply
+;; $5: 8-wide operand value
+
+define(`unary4to8conv', `
+  %$1_0 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0)
+  %$1_1 = shufflevector <8 x $2> $5, <8 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1)
+  %$1 = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 '
 )
 
 define(`unary4to16', `
-  %$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %v$1_0 = call <4 x $2> $3(<4 x $2> %$1_0)
-  %$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %v$1_1 = call <4 x $2> $3(<4 x $2> %$1_1)
-  %$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-  %v$1_2 = call <4 x $2> $3(<4 x $2> %$1_2)
-  %$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-  %v$1_3 = call <4 x $2> $3(<4 x $2> %$1_3)
+  %__$1_0 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %__v$1_0 = call <4 x $2> $3(<4 x $2> %__$1_0)
+  %__$1_1 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %__v$1_1 = call <4 x $2> $3(<4 x $2> %__$1_1)
+  %__$1_2 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %__v$1_2 = call <4 x $2> $3(<4 x $2> %__$1_2)
+  %__$1_3 = shufflevector <16 x $2> $4, <16 x $2> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  %__v$1_3 = call <4 x $2> $3(<4 x $2> %__$1_3)
 
-  %$1a = shufflevector <4 x $2> %v$1_0, <4 x $2> %v$1_1,
+  %__$1a = shufflevector <4 x $2> %__v$1_0, <4 x $2> %__v$1_1,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %$1b = shufflevector <4 x $2> %v$1_2, <4 x $2> %v$1_3,
+  %__$1b = shufflevector <4 x $2> %__v$1_2, <4 x $2> %__v$1_3,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %$1 = shufflevector <8 x $2> %$1a, <8 x $2> %$1b,
+  %$1 = shufflevector <8 x $2> %__$1a, <8 x $2> %__$1b,
+        <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+'
+)
+
+define(`unary4to16conv', `
+  %$1_0 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v$1_0 = call <4 x $3> $4(<4 x $2> %$1_0)
+  %$1_1 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %v$1_1 = call <4 x $3> $4(<4 x $2> %$1_1)
+  %$1_2 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+  %v$1_2 = call <4 x $3> $4(<4 x $2> %$1_2)
+  %$1_3 = shufflevector <16 x $2> $5, <16 x $2> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  %v$1_3 = call <4 x $3> $4(<4 x $2> %$1_3)
+
+  %$1a = shufflevector <4 x $3> %v$1_0, <4 x $3> %v$1_1,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %$1b = shufflevector <4 x $3> %v$1_2, <4 x $3> %v$1_3,
+        <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %$1 = shufflevector <8 x $3> %$1a, <8 x $3> %$1b,
         <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 '
 )
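The unary4to8conv/unary4to16conv macros added above follow the same split/apply/reassemble pattern as unary4to8, but allow the result element type to differ from the input element type. A rough sketch of the expansion of unary4to8conv(r, i16, float, @cvt4, %in), where @cvt4 is a hypothetical <4 x i16> to <4 x float> function and %in is a placeholder 8-wide operand (illustrative only, not part of the patch):

  %r_0 = shufflevector <8 x i16> %in, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %vr_0 = call <4 x float> @cvt4(<4 x i16> %r_0)
  %r_1 = shufflevector <8 x i16> %in, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %vr_1 = call <4 x float> @cvt4(<4 x i16> %r_1)
  %r = shufflevector <4 x float> %vr_0, <4 x float> %vr_1,
       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

Each 4-wide piece can then map directly onto a single NEON instruction even though the logical program vector is 8 (or 16) elements wide.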
diff --git a/ispc.cpp b/ispc.cpp
index a9f5ff5c..de8fba4d 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -106,7 +106,7 @@ static void __cpuidex(int info[4], int level, int count) {
 static const char *
 lGetSystemISA() {
 #ifdef __arm__
-    return "neon";
+    return "neon-32";
 #else
     int info[4];
     __cpuid(info, 1);
@@ -187,7 +187,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
             isa = "avx2";
         else if (!strcmp(cpu, "cortex-a9") ||
                  !strcmp(cpu, "cortex-a15"))
-            isa = "neon";
+            isa = "neon-32";
         else if (!strcmp(cpu, "core-avx-i"))
             isa = "avx1.1";
         else if (!strcmp(cpu, "sandybridge") ||
@@ -212,7 +212,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     }
 
 #if !defined(__arm__)
-    if (cpu == NULL && !strcmp(isa, "neon"))
+    if (cpu == NULL && !strncmp(isa, "neon", 4))
         // If we're compiling NEON on an x86 host and the CPU wasn't
         // supplied, don't go and set the CPU based on the host...
         cpu = "cortex-a9";
@@ -246,7 +246,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
     this->m_cpu = cpu;
 
     if (arch == NULL) {
-        if (!strcmp(isa, "neon"))
+        if (!strncmp(isa, "neon", 4))
            arch = "arm";
        else
           arch = "x86-64";
@@ -461,8 +461,26 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
         this->m_hasGather = true;
 #endif
     }
-    else if (!strcasecmp(isa, "neon")) {
-        this->m_isa = Target::NEON;
+    else if (!strcasecmp(isa, "neon-8")) {
+        this->m_isa = Target::NEON8;
+        this->m_nativeVectorWidth = 16;
+        this->m_vectorWidth = 16;
+        this->m_attributes = "+neon,+fp16";
+        this->m_hasHalf = true; // ??
+        this->m_maskingIsFree = false;
+        this->m_maskBitCount = 8;
+    }
+    else if (!strcasecmp(isa, "neon-16")) {
+        this->m_isa = Target::NEON16;
+        this->m_nativeVectorWidth = 8;
+        this->m_vectorWidth = 8;
+        this->m_attributes = "+neon,+fp16";
+        this->m_hasHalf = true; // ??
+        this->m_maskingIsFree = false;
+        this->m_maskBitCount = 16;
+    }
+    else if (!strcasecmp(isa, "neon-32") || !strcasecmp(isa, "neon")) {
+        this->m_isa = Target::NEON32;
         this->m_nativeVectorWidth = 4;
         this->m_vectorWidth = 4;
         this->m_attributes = "+neon,+fp16";
@@ -484,7 +502,8 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
                                llvm::Reloc::Default;
     std::string featuresString = m_attributes;
     llvm::TargetOptions options;
-    if (m_isa == Target::NEON)
+    if (m_isa == Target::NEON8 || m_isa == Target::NEON16 ||
+        m_isa == Target::NEON32)
         options.FloatABIType = llvm::FloatABI::Hard;
 #if !defined(LLVM_3_1)
     if (g->opt.disableFMA == false)
@@ -618,8 +637,12 @@ Target::GetTripleString() const {
 const char *
 Target::ISAToString(ISA isa) {
     switch (isa) {
-    case Target::NEON:
-        return "neon";
+    case Target::NEON8:
+        return "neon-8";
+    case Target::NEON16:
+        return "neon-16";
+    case Target::NEON32:
+        return "neon-32";
     case Target::SSE2:
         return "sse2";
     case Target::SSE4:
diff --git a/ispc.h b/ispc.h
index 7d10b908..bf6d2642 100644
--- a/ispc.h
+++ b/ispc.h
@@ -175,7 +175,8 @@ public:
        flexible/performant of them will apear last in the enumerant.  Note
        also that __best_available_isa() needs to be updated if ISAs are
        added or the enumerant values are reordered. */
-    enum ISA { NEON, SSE2, SSE4, AVX, AVX11, AVX2, GENERIC, NUM_ISAS };
+    enum ISA { NEON32, NEON16, NEON8, SSE2, SSE4, AVX, AVX11, AVX2, GENERIC,
+               NUM_ISAS };
 
     /** Initializes the given Target pointer for a target of the given
         name, if the name is a known target.  Returns true if the
diff --git a/ispc.vcxproj b/ispc.vcxproj
index 96682fe3..e9bf9d97 100755
--- a/ispc.vcxproj
+++ b/ispc.vcxproj
@@ -45,8 +45,12 @@
-
-
+
+
+
+
+
+
@@ -187,37 +191,78 @@ Building gen-bitcode-sse2-x2-64bit.cpp
-
-
-      Document
-      m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-neon.ll | python bitcode2cpp.py builtins\target-neon.ll > gen-bitcode-neon.cpp
-      gen-bitcode-neon.cpp
-      builtins\util.m4
-      m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% builtins\target-neon.ll | python bitcode2cpp.py builtins\target-neon.ll > gen-bitcode-neon.cpp
-      gen-bitcode-neon.cpp
-      builtins\util.m4
-      Building gen-bitcode-neon.cpp
-      Building gen-bitcode-neon.cpp
-
-
-
-
-      Document
-      m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp
-      $(Configuration)/gen-bitcode-avx1-32bit.cpp
-      builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll
-      Building gen-bitcode-avx1-32bit.cpp
-
-
-
-
-      Document
-      m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp
-      $(Configuration)/gen-bitcode-avx1-64bit.cpp
-      builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll
-      Building gen-bitcode-avx1-64bit.cpp
-
-
+
+
+      Document
+      m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-8.ll | python bitcode2cpp.py builtins\target-neon-8.ll 32bit > $(Configuration)/gen-bitcode-neon-8-32bit.cpp
+      $(Configuration)/gen-bitcode-neon-8-32bit.cpp
+      builtins\util.m4;builtins\target-neon-common.ll
+      Building gen-bitcode-neon-8-32bit.cpp
+
+
+
+
+      Document
+      m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-8.ll | python bitcode2cpp.py builtins\target-neon-8.ll 64bit > $(Configuration)/gen-bitcode-neon-8-64bit.cpp
+      $(Configuration)/gen-bitcode-neon-8-64bit.cpp
+      builtins\util.m4;builtins\target-neon-common.ll
+      Building gen-bitcode-neon-8-64bit.cpp
+
+
+
+
+      Document
+      m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-16.ll | python bitcode2cpp.py builtins\target-neon-16.ll 32bit > $(Configuration)/gen-bitcode-neon-16-32bit.cpp
+      $(Configuration)/gen-bitcode-neon-16-32bit.cpp
+      builtins\util.m4;builtins\target-neon-common.ll
+      Building gen-bitcode-neon-16-32bit.cpp
+
+
+
+
+      Document
+      m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-16.ll | python bitcode2cpp.py builtins\target-neon-16.ll 64bit > $(Configuration)/gen-bitcode-neon-16-64bit.cpp
+      $(Configuration)/gen-bitcode-neon-16-64bit.cpp
+      builtins\util.m4;builtins\target-neon-common.ll
+      Building gen-bitcode-neon-16-64bit.cpp
+
+
+
+
+      Document
+      m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-neon-32.ll | python bitcode2cpp.py builtins\target-neon-32.ll 32bit > $(Configuration)/gen-bitcode-neon-32-32bit.cpp
+      $(Configuration)/gen-bitcode-neon-32-32bit.cpp
+      builtins\util.m4;builtins\target-neon-common.ll
+      Building gen-bitcode-neon-32-32bit.cpp
+
+
+
+
+      Document
+      m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-neon-32.ll | python bitcode2cpp.py builtins\target-neon-32.ll 64bit > $(Configuration)/gen-bitcode-neon-32-64bit.cpp
+      $(Configuration)/gen-bitcode-neon-32-64bit.cpp
+      builtins\util.m4;builtins\target-neon-common.ll
+      Building gen-bitcode-neon-32-64bit.cpp
+
+
+
+
+      Document
+      m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=32 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 32bit > $(Configuration)/gen-bitcode-avx1-32bit.cpp
+      $(Configuration)/gen-bitcode-avx1-32bit.cpp
+      builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll
+      Building gen-bitcode-avx1-32bit.cpp
+
+
+
+
+      Document
+      m4 -Ibuiltins/ -DLLVM_VERSION=%LLVM_VERSION% -DBUILD_OS=WINDOWS -DRUNTIME=64 builtins/target-avx1.ll | python bitcode2cpp.py builtins\target-avx1.ll 64bit > $(Configuration)/gen-bitcode-avx1-64bit.cpp
+      $(Configuration)/gen-bitcode-avx1-64bit.cpp
+      builtins\util.m4;builtins\target-avx-common.ll;builtins\target-avx.ll
+      Building gen-bitcode-avx1-64bit.cpp
+
+
       Document
diff --git a/module.cpp b/module.cpp
index 85bf242c..755a5dc4 100644
--- a/module.cpp
+++ b/module.cpp
@@ -1877,6 +1877,7 @@ Module::execPreprocessor(const char *infilename, llvm::raw_string_ostream *ostre
     char *p = targetMacro;
     while (*p) {
         *p = toupper(*p);
+        if (*p == '-') *p = '_';
        ++p;
     }
     opts.addMacroDef(targetMacro);
diff --git a/run_tests.py b/run_tests.py
index ea819ea4..c9dd8b76 100755
--- a/run_tests.py
+++ b/run_tests.py
@@ -37,7 +37,7 @@ parser.add_option("-g", "--generics-include", dest="include_file", help="Filenam
 parser.add_option("-f", "--ispc-flags", dest="ispc_flags", help="Additional flags for ispc (-g, -O1, ...)", default="")
 parser.add_option('-t', '--target', dest='target',
-                  help='Set compilation target (neon, sse2, sse2-x2, sse4, sse4-x2, sse4-8, sse4-16, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)',
+                  help='Set compilation target (neon8, neon16, neon32, sse2, sse2-x2, sse4, sse4-x2, sse4-8, sse4-16, avx, avx-x2, generic-4, generic-8, generic-16, generic-32)',
                   default="sse4")
 parser.add_option('-a', '--arch', dest='arch',
                   help='Set architecture (arm, x86, x86-64)',