From 9c79d4d182ca14072583128e6b59d48a80b93102 Mon Sep 17 00:00:00 2001
From: egaburov
Date: Wed, 11 Sep 2013 12:58:02 +0200
Subject: [PATCH 01/14] added avxh with vectorWidth=4 support, use
 --target=avxh to enable it

---
 Makefile                 |   2 +-
 builtins.cpp             |   8 +
 builtins/target-avx-h.ll | 554 +++++++++++++++++++++++++++++++++++++++
 builtins/target-avxh.ll  |  81 ++++++
 ispc.cpp                 |   9 +
 5 files changed, 653 insertions(+), 1 deletion(-)
 create mode 100644 builtins/target-avx-h.ll
 create mode 100644 builtins/target-avxh.ll

diff --git a/Makefile b/Makefile
index 09ec302d..b5bb3472 100644
--- a/Makefile
+++ b/Makefile
@@ -141,7 +141,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
 	type.cpp util.cpp
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
-TARGETS=avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
+TARGETS=avxh avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
 	sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \
 	generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
 ifneq ($(ARM_ENABLED), 0)
diff --git a/builtins.cpp b/builtins.cpp
index 886eec15..63c90337 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -920,6 +920,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
     }
     case Target::AVX: {
         switch (g->target->getVectorWidth()) {
+        case 4:
+            if (runtime32) {
+                EXPORT_MODULE(builtins_bitcode_avxh_32bit);
+            }
+            else {
+                EXPORT_MODULE(builtins_bitcode_avxh_64bit);
+            }
+            break;
         case 8:
             if (runtime32) {
                 EXPORT_MODULE(builtins_bitcode_avx1_32bit);
diff --git a/builtins/target-avx-h.ll b/builtins/target-avx-h.ll
new file mode 100644
index 00000000..d56a63b9
--- /dev/null
+++ b/builtins/target-avx-h.ll
@@ -0,0 +1,554 @@
+;; Copyright (c) 2010-2012, Intel Corporation
+;; All rights reserved.
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are
+;; met:
+;;
+;;     * Redistributions of source code must retain the above copyright
+;;       notice, this list of conditions and the following disclaimer.
+;;
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;
+;;     * Neither the name of Intel Corporation nor the names of its
+;;       contributors may be used to endorse or promote products derived from
+;;       this software without specific prior written permission.
+;;
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;; PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
+;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
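+
+;; A note on the overall strategy, as suggested by the intrinsics used
+;; below: this 4-wide target mixes instruction sets.  Plain 128-bit SSE
+;; operations handle the <4 x float> / <4 x i32> cases, while the 256-bit
+;; AVX (.256) intrinsics handle <4 x double> / <4 x i64>.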
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Basic 4-wide definitions
+
+define(`WIDTH',`4')
+define(`MASK',`i32')
+include(`util.m4')
+
+stdlib_core()
+packed_load_and_store()
+scans()
+int64minmax()
+
+include(`target-avx-common.ll')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rcp
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  ; float iv = __rcp_v(v);
+  ; return iv * (2. - v * iv);
+
+  %call = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
+  ; do one N-R iteration
+  %v_iv = fmul <4 x float> %0, %call
+  %two_minus = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_iv
+  %iv_mul = fmul <4 x float> %call, %two_minus
+  ret <4 x float> %iv_mul
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding floats
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
+
+define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  ; roundps, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
+  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8)
+  ret <4 x float> %call
+}
+
+define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
+  ret <4 x float> %call
+}
+
+define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  ; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  %call = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
+  ret <4 x float> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+;; avx intrinsic
+declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
+
+define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round mode nearest 0b00 | don't signal precision exceptions 0b1000 = 8
+  %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 8)
+  ret <4 x double> %call
+}
+
+define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
+  %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 9)
+  ret <4 x double> %call
+}
+
+define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
+  ; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
+  %call = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 10)
+  ret <4 x double> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
+  ; float is = __rsqrt_v(v);
+  %is = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
+  ; Newton-Raphson iteration to improve precision
+  ; return 0.5 * is * (3. - (v * is) * is);
+  %v_is = fmul <4 x float> %v, %is
+  %v_is_is = fmul <4 x float> %v_is, %is
+  %three_sub = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_is_is
+  %is_mul = fmul <4 x float> %is, %three_sub
+  %half_scale = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %is_mul
+  ret <4 x float> %half_scale
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; sqrt
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+
+define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
+  %call = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
+  ret <4 x float> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+;; avx intrinsic
+declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
+
+define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
+  %call = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %0)
+  ret <4 x double> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; svml
+
+; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
+; or, use the macro to call the 4-wide ones twice with our 8-wide
+; vectors...
+
+;;declare <4 x double> @__svml_sin4(<4 x double>)
+;;declare <4 x double> @__svml_cos4(<4 x double>)
+;;declare void @__svml_sincos4(<4 x double>, <4 x double> *, <4 x double> *)
+;;declare <4 x double> @__svml_tan4(<4 x double>)
+;;declare <4 x double> @__svml_atan4(<4 x double>)
+;;declare <4 x double> @__svml_atan24(<4 x double>, <4 x double>)
+;;declare <4 x double> @__svml_exp4(<4 x double>)
+;;declare <4 x double> @__svml_log4(<4 x double>)
+;;declare <4 x double> @__svml_pow4(<4 x double>, <4 x double>)
+declare <4 x float> @__svml_sin(<4 x float>)
+declare <4 x float> @__svml_cos(<4 x float>)
+declare void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *)
+declare <4 x float> @__svml_tan(<4 x float>)
+declare <4 x float> @__svml_atan(<4 x float>)
+declare <4 x float> @__svml_atan2(<4 x float>, <4 x float>)
+declare <4 x float> @__svml_exp(<4 x float>)
+declare <4 x float> @__svml_log(<4 x float>)
+declare <4 x float> @__svml_pow(<4 x float>, <4 x float>)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; float min/max
+
+;; sse intrinsics
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
+  %call = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
+  ret <4 x float> %call
+}
+
+define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
+  %call = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
+  ret <4 x float> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; horizontal ops
+
+;; sse intrinsic
+declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
+
+define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %v64 = zext i32 %v to i64
+  ret i64 %v64
+}
+
+define i1 @__any(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp ne i32 %v, 0
+  ret i1 %cmp
+}
+
+define i1 @__all(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp eq i32 %v, 15
+  ret i1 %cmp
+}
+
+define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
+  %floatmask = bitcast <4 x i32> %0 to <4 x float>
+  %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone
+  %cmp = icmp eq i32 %v, 0
+  ret i1 %cmp
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal float ops
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
+
+define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
+  %v1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)
+  %v2 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %v1, <4 x float> %v1)
+  %scalar = extractelement <4 x float> %v2, i32 0
+  ret float %scalar
+}
+
+define float @__reduce_min_float(<4 x float>) nounwind readnone {
+  reduce4(float, @__min_varying_float, @__min_uniform_float)
+}
+
+define float @__reduce_max_float(<4 x float>) nounwind readnone {
+  reduce4(float, @__max_varying_float, @__max_uniform_float)
+}
+
+reduce_equal(4)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int8 ops
+
+declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
+
+define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline
+{
+  ; widen to 16 x i8; indices 4..7 select lanes of the zero vector, so the
+  ; upper twelve elements are zero padding
+  %wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
+      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+                  i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
+                                              <16 x i8> zeroinitializer)
+  %r0 = extractelement <2 x i64> %rv, i32 0
+  %r1 = extractelement <2 x i64> %rv, i32 1
+  %r = add i64 %r0, %r1
+  %r16 = trunc i64 %r to i16
+  ret i16 %r16
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int16 ops
+
+define internal <4 x i16> @__add_varying_i16(<4 x i16>,
+                                             <4 x i16>) nounwind readnone alwaysinline {
+  %r = add <4 x i16> %0, %1
+  ret <4 x i16> %r
+}
+
+define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
+  %r = add i16 %0, %1
+  ret i16 %r
+}
+
+define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
+  reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int32 ops
+
+define <4 x i32> @__add_varying_int32(<4 x i32>,
+                                      <4 x i32>) nounwind readnone alwaysinline {
+  %s = add <4 x i32> %0, %1
+  ret <4 x i32> %s
+}
+
+define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
+  %s = add i32 %0, %1
+  ret i32 %s
+}
+
+define i32 @__reduce_add_int32(<4 x i32>) nounwind readnone alwaysinline {
+  reduce4(i32, @__add_varying_int32, @__add_uniform_int32)
+}
+
+define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone alwaysinline {
+  reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
+}
+
+define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone alwaysinline {
+  reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
+}
+
+define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone alwaysinline {
+  reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
+}
+
+define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone alwaysinline {
+  reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal double ops
+
+declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define double @__reduce_add_double(<4 x double>) nounwind readonly alwaysinline {
+  ; hadd.pd.256 with a zero vector gives [a0+a1, 0, a2+a3, 0]; the second
+  ; hadd then leaves a0+a1 in lane 0 and a2+a3 in lane 2
+  %v0 = shufflevector <4 x double> %0, <4 x double> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %v1 = shufflevector <4 x double> <double 0., double 0., double 0., double 0.>,
+          <4 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+;;  %v1 = <4 x double> zeroinitializer
+  %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
+  %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
+  %final0 = extractelement <4 x double> %sum1, i32 0
+  %final1 = extractelement <4 x double> %sum1, i32 2
+  %sum = fadd double %final0, %final1
+
+  ret double %sum
+}
+
+define double @__reduce_min_double(<4 x double>) nounwind readnone alwaysinline {
+  reduce4(double, @__min_varying_double, @__min_uniform_double)
+}
+
+define double @__reduce_max_double(<4 x double>) nounwind readnone alwaysinline {
+  reduce4(double, @__max_varying_double, @__max_uniform_double)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int64 ops
+
+define <4 x i64> @__add_varying_int64(<4 x i64>,
+                                      <4 x i64>) nounwind readnone alwaysinline {
+  %s = add <4 x i64> %0, %1
+  ret <4 x i64> %s
+}
+
+define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+  %s = add i64 %0, %1
+  ret i64 %s
+}
+
+define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone alwaysinline {
+  reduce4(i64, @__add_varying_int64, @__add_uniform_int64)
+}
+
+define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone alwaysinline {
+  reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone alwaysinline {
+  reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone alwaysinline {
+  reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone alwaysinline {
+  reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unaligned loads/loads+broadcasts
+
+; no masked load instruction for i8 and i16 types??
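+;; (AVX's vmaskmov instructions only exist for 32- and 64-bit elements,
+;; so the i8/i16 cases below fall back on the generic masked_load()
+;; macro from util.m4 rather than a hardware masked load.)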
+masked_load(i8, 1)
+masked_load(i16, 2)
+
+;; avx intrinsics
+declare <4 x float> @llvm.x86.avx.maskload.ps(i8 *, <4 x float> %mask)
+declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
+
+define <4 x i32> @__masked_load_i32(i8 *, <4 x i32> %mask) nounwind alwaysinline {
+  %floatmask = bitcast <4 x i32> %mask to <4 x float>
+  %floatval = call <4 x float> @llvm.x86.avx.maskload.ps(i8 * %0, <4 x float> %floatmask)
+  %retval = bitcast <4 x float> %floatval to <4 x i32>
+  ret <4 x i32> %retval
+}
+
+define <4 x i64> @__masked_load_i64(i8 *, <4 x i32> %mask) nounwind alwaysinline {
+  ; double up masks, bitcast to doubles, so each i64 lane sees a full
+  ; 64-bit mask
+  %mask0 = shufflevector <4 x i32> %mask, <4 x i32> undef,
+     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
+
+  %val0d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %mask0d)
+
+  %vald = shufflevector <4 x double> %val0d, <4 x double> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %val = bitcast <4 x double> %vald to <4 x i64>
+  ret <4 x i64> %val
+}
+
+masked_load_float_double()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; masked store
+
+gen_masked_store(i8)
+gen_masked_store(i16)
+
+; note that mask is the 2nd parameter, not the 3rd one!!
+;; avx intrinsics
+declare void @llvm.x86.avx.maskstore.ps (i8 *, <4 x float>, <4 x float>)
+declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
+
+define void @__masked_store_i32(<4 x i32>* nocapture, <4 x i32>,
+                                <4 x i32>) nounwind alwaysinline {
+  %ptr = bitcast <4 x i32> * %0 to i8 *
+  %val = bitcast <4 x i32> %1 to <4 x float>
+  %mask = bitcast <4 x i32> %2 to <4 x float>
+  call void @llvm.x86.avx.maskstore.ps(i8 * %ptr, <4 x float> %mask, <4 x float> %val)
+  ret void
+}
+
+define void @__masked_store_i64(<4 x i64>* nocapture, <4 x i64>,
+                                <4 x i32> %mask) nounwind alwaysinline {
+  %ptr = bitcast <4 x i64> * %0 to i8 *
+  %val = bitcast <4 x i64> %1 to <4 x double>
+
+  ; double up the mask lanes, as in __masked_load_i64 above
+  %mask0 = shufflevector <4 x i32> %mask, <4 x i32> undef,
+     <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+
+  %mask0d = bitcast <8 x i32> %mask0 to <4 x double>
+
+  %val0 = shufflevector <4 x double> %val, <4 x double> undef,
+          <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+
+  call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask0d, <4 x double> %val0)
+  ret void
+}
+
+masked_store_blend_8_16_by_4()
+
+;; sse intrinsic
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
+                                             <4 x float>) nounwind readnone
+
+define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
+                                      <4 x i32> %mask) nounwind alwaysinline {
+  %mask_as_float = bitcast <4 x i32> %mask to <4 x float>
+  %oldValue = load <4 x i32>* %0, align 4
+  %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float>
+  %newAsFloat = bitcast <4 x i32> %1 to <4 x float>
+  %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat,
+                                                     <4 x float> %newAsFloat,
+                                                     <4 x float> %mask_as_float)
+  %blendAsInt = bitcast <4 x float> %blend to <4 x i32>
+  store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4
+  ret void
+}
+
+;; avx intrinsic
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>,
+                                                <8 x float>) nounwind readnone
+
+define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new,
+                                      <4 x i32> %i32mask) nounwind alwaysinline {
+  %oldValue = load <4 x i64>* %ptr, align 8
+  %mask = bitcast <4 x i32> %i32mask to <4 x float>
+
+  ; Do the 4 x 64-bit blend as a single <8 x float> blend, where the
+  ; <8 x float> values are actually bitcast <4 x i64> values
+  ;
+  ; set up the first four 64-bit values
+  %old01 = bitcast <4 x i64> %oldValue to <4 x i64>
+  %old01f = bitcast <4 x i64> %old01 to <8 x float>
+  %new01 = bitcast <4 x i64> %new to <4 x i64>
+  %new01f = bitcast <4 x i64> %new01 to <8 x float>
+  ; compute mask--note that the indices are all doubled-up
+  %mask01 = shufflevector <4 x float> %mask, <4 x float> undef,
+            <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  ; and blend them
+  %result01f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f,
+                                                            <8 x float> %new01f,
+                                                            <8 x float> %mask01)
+  %result01 = bitcast <8 x float> %result01f to <4 x i64>
+
+  %final = bitcast <4 x i64> %result01 to <4 x i64>
+  store <4 x i64> %final, <4 x i64> * %ptr, align 8
+  ret void
+}
+
+masked_store_float_double()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; scatter
+
+gen_scatter(i8)
+gen_scatter(i16)
+gen_scatter(i32)
+gen_scatter(float)
+gen_scatter(i64)
+gen_scatter(double)
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
+declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline {
+  %call = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %0, <4 x double> %1)
+  ret <4 x double> %call
+}
+
+define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline {
+  %call = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %0, <4 x double> %1)
+  ret <4 x double> %call
+}
diff --git a/builtins/target-avxh.ll b/builtins/target-avxh.ll
new file mode 100644
index 00000000..98c9111d
--- /dev/null
+++ b/builtins/target-avxh.ll
@@ -0,0 +1,81 @@
+;; Copyright (c) 2010-2011, Intel Corporation
+;; All rights reserved.
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are
+;; met:
+;;
+;;     * Redistributions of source code must retain the above copyright
+;;       notice, this list of conditions and the following disclaimer.
+;;
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;
+;;     * Neither the name of Intel Corporation nor the names of its
+;;       contributors may be used to endorse or promote products derived from
+;;       this software without specific prior written permission.
+;;
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;; PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER
+;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
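+
+;; Top-level file for the "avxh" target.  It pulls in the shared 4-wide
+;; definitions from target-avx-h.ll above and adds the target-specific
+;; pieces: SSE4.1 integer min/max, the half-float conversion
+;; declarations, and factored gathers.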
+
+include(`target-avx-h.ll')
+
+rdrand_decls()
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int min/max
+
+define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %call = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %call
+}
+
+define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %call = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; unsigned int min/max
+
+define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %call = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %call
+}
+
+define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
+  %call = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
+  ret <4 x i32> %call
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; half conversion routines
+
+ifelse(NO_HALF_DECLARES, `1', `', `
+declare float @__half_to_float_uniform(i16 %v) nounwind readnone
+declare <4 x float> @__half_to_float_varying(<4 x i16> %v) nounwind readnone
+declare i16 @__float_to_half_uniform(float %v) nounwind readnone
+declare <4 x i16> @__float_to_half_varying(<4 x float> %v) nounwind readnone
+')
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; gather
+
+gen_gather_factored(i8)
+gen_gather_factored(i16)
+gen_gather_factored(i32)
+gen_gather_factored(float)
+gen_gather_factored(i64)
+gen_gather_factored(double)
diff --git a/ispc.cpp b/ispc.cpp
index 6d4b063d..02c23568 100644
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -446,6 +446,15 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
         this->m_maskingIsFree = false;
         this->m_maskBitCount = 32;
     }
+    else if (!strcasecmp(isa, "avxh")) {
+        fprintf(stderr, " ISA is avxh \n");
+        this->m_isa = Target::AVX;
+        this->m_nativeVectorWidth = 4;
+        this->m_vectorWidth = 4;
+        this->m_attributes = "+avx,+popcnt,+cmov";
+        this->m_maskingIsFree = false;
+        this->m_maskBitCount = 32;
+    }
     else if (!strcasecmp(isa, "avx-x2") ||
              !strcasecmp(isa, "avx1-x2") ||
              !strcasecmp(isa, "avx1-i32x16")) {

From 320c41ffcf223f4793c39c2f445ed0aed19d6270 Mon Sep 17 00:00:00 2001
From: egaburov
Date: Wed, 11 Sep 2013 15:16:50 +0200
Subject: [PATCH 02/14] added svml support. experimental. for some reason all
 symbols are visible..
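
With --math-lib=svml, the float math functions in stdlib.ispc are routed
to the __svml_* entry points generated from svml.m4, which in turn call
Intel's SVML (e.g. __svml_sinf4).  A minimal usage sketch (hypothetical
example, not part of this patch):

    // sines.ispc -- build: ispc --target=avxh --math-lib=svml sines.ispc -o sines.o
    export void sines(uniform float vin[], uniform float vout[], uniform int n) {
        foreach (i = 0 ... n) {
            vout[i] = sin(vin[i]);  // with svml selected this calls __svml_sinf()
        }
    }

The resulting object still has to be linked against the SVML library for
the __svml_* symbols to resolve.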
--- .gitignore | 4 ++ Makefile | 6 +- builtins.cpp | 13 ++++ builtins/target-avx-h.ll | 27 ++------ builtins/target-avx-x2.ll | 16 +---- builtins/target-avx.ll | 18 ++---- builtins/target-generic-1.ll | 45 +++++++++---- builtins/target-generic-common.ll | 16 ++--- builtins/target-neon-common.ll | 13 ++-- builtins/target-sse2-x2.ll | 36 +++++------ builtins/target-sse2.ll | 61 ++---------------- builtins/target-sse4-16.ll | 13 +--- builtins/target-sse4-8.ll | 12 +--- builtins/target-sse4-x2.ll | 36 +++++------ builtins/target-sse4.ll | 61 ++---------------- builtins/util.m4 | 6 ++ stdlib.ispc | 102 ++++++++++++++++++++++++------ 17 files changed, 216 insertions(+), 269 deletions(-) diff --git a/.gitignore b/.gitignore index 0469cf7d..3bec2ace 100644 --- a/.gitignore +++ b/.gitignore @@ -11,5 +11,9 @@ tests*/*run examples/*/*.png examples/*/*.ppm examples/*/objs/* +*.swp +.* +!.gitignore + diff --git a/Makefile b/Makefile index b5bb3472..43f41e09 100644 --- a/Makefile +++ b/Makefile @@ -246,15 +246,15 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc @echo Compiling $< @$(CXX) $(CXXFLAGS) -o $@ -c $< -objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 $(wildcard builtins/*common.ll) +objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< @m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX $< | python bitcode2cpp.py $< > $@ -objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll) +objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< \(32 bit version\) @m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=32 $< | python bitcode2cpp.py $< 32bit > $@ -objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll) +objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll) @echo Creating C++ source from builtins definition file $< \(64 bit version\) @m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=64 $< | python bitcode2cpp.py $< 64bit > $@ diff --git a/builtins.cpp b/builtins.cpp index 63c90337..139b8f04 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -582,7 +582,9 @@ lSetInternalFunctions(llvm::Module *module) { "__stdlib_tan", "__stdlib_tanf", "__svml_sin", + "__svml_asin", "__svml_cos", + "__svml_acos", "__svml_sincos", "__svml_tan", "__svml_atan", @@ -590,6 +592,17 @@ lSetInternalFunctions(llvm::Module *module) { "__svml_exp", "__svml_log", "__svml_pow", + "__svml_sinf", + "__svml_asinf", + "__svml_cosf", + "__svml_acosf", + "__svml_sincosf", + "__svml_tanf", + "__svml_atanf", + "__svml_atan2f", + "__svml_expf", + "__svml_logf", + "__svml_powf", "__undef_uniform", "__undef_varying", "__vec4_add_float", diff --git a/builtins/target-avx-h.ll b/builtins/target-avx-h.ll index d56a63b9..a06e5ab3 100644 --- a/builtins/target-avx-h.ll +++ b/builtins/target-avx-h.ll @@ -154,28 +154,11 @@ define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline { ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; svml -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones twice with our 8-wide -; vectors... 
- -;;declare <4 x double> @__svml_sin4(<4 x double>) -;;declare <4 x double> @__svml_cos4(<4 x double>) -;;declare void @__svml_sincos4(<4 x double>, <4 x double> *, <4 x double> *) -;;declare <4 x double> @__svml_tan4(<4 x double>) -;;declare <4 x double> @__svml_atan4(<4 x double>) -;;declare <4 x double> @__svml_atan24(<4 x double>, <4 x double>) -;;declare <4 x double> @__svml_exp4(<4 x double>) -;;declare <4 x double> @__svml_log4(<4 x double>) -;;declare <4 x double> @__svml_pow4(<4 x double>, <4 x double>) -declare <4 x float> @__svml_sin(<4 x float>) -declare <4 x float> @__svml_cos(<4 x float>) -declare void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) -declare <4 x float> @__svml_tan(<4 x float>) -declare <4 x float> @__svml_atan(<4 x float>) -declare <4 x float> @__svml_atan2(<4 x float>, <4 x float>) -declare <4 x float> @__svml_exp(<4 x float>) -declare <4 x float> @__svml_log(<4 x float>) -declare <4 x float> @__svml_pow(<4 x float>, <4 x float>) +include(`svml.m4') +svmlf_declare(4) +svmlf_define(4) +svmld_declare(4) +svmld_define(4) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index d9e0322b..d646720e 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -137,19 +137,9 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; svml -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones 4x with our 16-wide -; vectors... - -declare <16 x float> @__svml_sin(<16 x float>) -declare <16 x float> @__svml_cos(<16 x float>) -declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) -declare <16 x float> @__svml_tan(<16 x float>) -declare <16 x float> @__svml_atan(<16 x float>) -declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) -declare <16 x float> @__svml_exp(<16 x float>) -declare <16 x float> @__svml_log(<16 x float>) -declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) +include(`svml.m4') +svmlf_stubs(16) +svmld_stubs(16) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index 90e2f3ac..1d33e3f9 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -137,19 +137,11 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; svml -; FIXME: need either to wire these up to the 8-wide SVML entrypoints, -; or, use the macro to call the 4-wide ones twice with our 8-wide -; vectors... 
-
-declare <8 x float> @__svml_sin(<8 x float>)
-declare <8 x float> @__svml_cos(<8 x float>)
-declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *)
-declare <8 x float> @__svml_tan(<8 x float>)
-declare <8 x float> @__svml_atan(<8 x float>)
-declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>)
-declare <8 x float> @__svml_exp(<8 x float>)
-declare <8 x float> @__svml_log(<8 x float>)
-declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
+include(`svml.m4')
+svmlf_declare(8)
+svmlf_define(8)
+svmld_declare(4)
+svmld_stubs(8)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float min/max
diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll
index 31ebcdd5..910565dd 100644
--- a/builtins/target-generic-1.ll
+++ b/builtins/target-generic-1.ll
@@ -310,6 +310,7 @@ declare double @round (double) nounwind readnone
 ;declare float @llvm.sqrt.f32(float %Val)
 declare double @llvm.sqrt.f64(double %Val)
 declare float @llvm.sin.f32(float %Val)
+declare float @llvm.asin.f32(float %Val)
 declare float @llvm.cos.f32(float %Val)
 declare float @llvm.sqrt.f32(float %Val)
 declare float @llvm.exp.f32(float %Val)
@@ -651,7 +652,18 @@ define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; svml stuff
 
-define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
+declare <1 x double> @__svml_sind(<1 x double>) nounwind readnone alwaysinline
+declare <1 x double> @__svml_asind(<1 x double>) nounwind readnone alwaysinline
+declare <1 x double> @__svml_cosd(<1 x double>) nounwind readnone alwaysinline
+declare void @__svml_sincosd(<1 x double>, <1 x double> *, <1 x double> *) nounwind readnone alwaysinline
+declare <1 x double> @__svml_tand(<1 x double>) nounwind readnone alwaysinline
+declare <1 x double> @__svml_atand(<1 x double>) nounwind readnone alwaysinline
+declare <1 x double> @__svml_atan2d(<1 x double>, <1 x double>) nounwind readnone alwaysinline
+declare <1 x double> @__svml_expd(<1 x double>) nounwind readnone alwaysinline
+declare <1 x double> @__svml_logd(<1 x double>) nounwind readnone alwaysinline
+declare <1 x double> @__svml_powd(<1 x double>, <1 x double>) nounwind readnone alwaysinline
+
+define <1 x float> @__svml_sinf(<1 x float>) nounwind readnone alwaysinline {
 ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0)
 ;ret <1 x float> %ret
 ;%r = extractelement <1 x float> %0, i32 0
@@ -662,7 +674,18 @@ define <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
 
 }
 
-define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
+define <1 x float> @__svml_asinf(<1 x float>) nounwind readnone alwaysinline {
+  ;%ret = call <1 x float> @__svml_asinf4(<1 x float> %0)
+  ;ret <1 x float> %ret
+  ;%r = extractelement <1 x float> %0, i32 0
+  ;%s = call float @llvm.asin.f32(float %r)
+  ;%rv = insertelement <1 x float> undef, float %r, i32 0
+  ;ret <1 x float> %rv
+  unary1to1(float,@llvm.asin.f32)
+
+}
+
+define <1 x float> @__svml_cosf(<1 x float>) nounwind readnone alwaysinline {
 ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0)
 ;ret <1 x float> %ret
 ;%r = extractelement <1 x float> %0, i32 0
@@ -673,18 +696,18 @@ define <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
 
 }
 
-define void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline {
+define void @__svml_sincosf(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline {
 ; %s
= call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0) ; store <1 x float> %s, <1 x float> * %1 ; ret void - %sin = call <1 x float> @__svml_sin (<1 x float> %0) - %cos = call <1 x float> @__svml_cos (<1 x float> %0) + %sin = call <1 x float> @__svml_sinf(<1 x float> %0) + %cos = call <1 x float> @__svml_cosf(<1 x float> %0) store <1 x float> %sin, <1 x float> * %1 store <1 x float> %cos, <1 x float> * %2 ret void } -define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_tanf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0) ;ret <1 x float> %ret ;%r = extractelement <1 x float> %0, i32 0 @@ -696,7 +719,7 @@ define <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline { ret <1 x float > %0 } -define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_atanf(<1 x float>) nounwind readnone alwaysinline { ; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0) ; ret <1 x float> %ret ;%r = extractelement <1 x float> %0, i32 0 @@ -709,7 +732,7 @@ define <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline { } -define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_atan2f(<1 x float>, <1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1) ;ret <1 x float> %ret ;%y = extractelement <1 x float> %0, i32 0 @@ -722,19 +745,19 @@ define <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone al ret <1 x float > %0 } -define <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_expf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0) ;ret <1 x float> %ret unary1to1(float, @llvm.exp.f32) } -define <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_logf(<1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0) ;ret <1 x float> %ret unary1to1(float, @llvm.log.f32) } -define <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline { +define <1 x float> @__svml_powf(<1 x float>, <1 x float>) nounwind readnone alwaysinline { ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1) ;ret <1 x float> %ret %r = extractelement <1 x float> %0, i32 0 diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 2896c6b1..bc7db9ec 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -202,21 +202,15 @@ declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone declare i32 @__count_leading_zeros_i32(i32) nounwind readnone declare i64 @__count_leading_zeros_i64(i64) nounwind readnone -;; svml - ; FIXME: need either to wire these up to the 8-wide SVML entrypoints, ; or, use the macro to call the 4-wide ones twice with our 8-wide ; vectors... 
-declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
-declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
-declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
-declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
-declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
-declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
-declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
-declare <WIDTH x float> @__svml_log(<WIDTH x float>)
-declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
+;; svml
+
+include(`svml.m4')
+svmlf_stubs(WIDTH)
+svmld_stubs(WIDTH)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; reductions
diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll
index 696b0748..92fc5ce3 100644
--- a/builtins/target-neon-common.ll
+++ b/builtins/target-neon-common.ll
@@ -316,15 +316,10 @@ define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture %ptr,
 ;; yuck. We need declarations of these, even though we shouldnt ever
 ;; actually generate calls to them for the NEON target...
-
-declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
-declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
-declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
-declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
-declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
-declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
-declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
-declare <WIDTH x float> @__svml_log(<WIDTH x float>)
-declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
+
+include(`svml.m4')
+svmlf_stubs(WIDTH)
+svmld_stubs(WIDTH)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather
diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll
index da22a66c..5688ebba 100644
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -105,28 +105,28 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; svml stuff
 
-declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
-declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
-declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
+include(`svml.m4')
+svmlf_declare(4)
+svmld_declare(2)
+svmld_stubs(8)
 
-define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline {
   unary4to8(ret, float, @__svml_sinf4, %0)
   ret <8 x float> %ret
 }
 
+define <8 x float> @__svml_asinf(<8 x float>) nounwind readnone alwaysinline {
+  unary4to8(ret, float, @__svml_asinf4, %0)
+  ret <8 x float> %ret
+}
+
-define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_cosf(<8 x float>) nounwind readnone alwaysinline {
   unary4to8(ret, float, @__svml_cosf4, %0)
   ret <8 x float> %ret
 }
 
-define void @__svml_sincos(<8 x float>, <8 x float> *,
+define void @__svml_sincosf(<8 x float>, <8 x float> *,
                            <8 x float> *) nounwind readnone alwaysinline {
   ; call svml_sincosf4 two times with the two 4-wide sub-vectors
   %a = shufflevector <8 x float> %0, <8 x float> undef,
@@ -155,33 +155,33 @@ define void @__svml_sincos(<8 x float>, <8 x float> *,
   ret void
 }
 
-define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_tanf(<8 x float>) nounwind readnone alwaysinline {
   unary4to8(ret, float, @__svml_tanf4, %0)
   ret <8 x float> %ret
 }
 
-define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
+define <8 x float> @__svml_atanf(<8 x float>) nounwind readnone
alwaysinline { unary4to8(ret, float, @__svml_atanf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_atan2(<8 x float>, +define <8 x float> @__svml_atan2f(<8 x float>, <8 x float>) nounwind readnone alwaysinline { binary4to8(ret, float, @__svml_atan2f4, %0, %1) ret <8 x float> %ret } -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_expf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_expf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_logf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_logf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_pow(<8 x float>, +define <8 x float> @__svml_powf(<8 x float>, <8 x float>) nounwind readnone alwaysinline { binary4to8(ret, float, @__svml_powf4, %0, %1) ret <8 x float> %ret diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index a6b206b6..236cda33 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -496,62 +496,11 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} - -define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_expf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_logf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1) - ret <4 x float> %ret -} +include(`svml.m4') 
+svmlf_declare(4) +svmld_declare(2) +svmlf_define(4) +svmld_stubs(4) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index d7f3833d..3fbbe534 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -209,16 +209,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r ;; svml ; FIXME - -declare <8 x float> @__svml_sin(<8 x float>) -declare <8 x float> @__svml_cos(<8 x float>) -declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *) -declare <8 x float> @__svml_tan(<8 x float>) -declare <8 x float> @__svml_atan(<8 x float>) -declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>) -declare <8 x float> @__svml_exp(<8 x float>) -declare <8 x float> @__svml_log(<8 x float>) -declare <8 x float> @__svml_pow(<8 x float>, <8 x float>) +include(`svml.m4') +svmlf_stubs(8) +svmld_stubs(8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index fd4b74d7..e65077b7 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -222,15 +222,9 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin ; FIXME -declare <16 x float> @__svml_sin(<16 x float>) -declare <16 x float> @__svml_cos(<16 x float>) -declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *) -declare <16 x float> @__svml_tan(<16 x float>) -declare <16 x float> @__svml_atan(<16 x float>) -declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>) -declare <16 x float> @__svml_exp(<16 x float>) -declare <16 x float> @__svml_log(<16 x float>) -declare <16 x float> @__svml_pow(<16 x float>, <16 x float>) +include(`svml.m4') +svmlf_stubs(16) +svmld_stubs(16) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index a7faddb3..2a69b60a 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -105,28 +105,28 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone +include(`svml.m4') +svmlf_declare(4) +svmld_declare(2) +svmld_stubs(8) -define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_sinf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_asinf(<8 x float>) nounwind readnone alwaysinline { + unary4to8(ret, float, @__svml_asinf4, %0) + ret <8 x float> %ret +} + +define <8 x float> 
@__svml_cosf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_cosf4, %0) ret <8 x float> %ret } -define void @__svml_sincos(<8 x float>, <8 x float> *, +define void @__svml_sincosf(<8 x float>, <8 x float> *, <8 x float> *) nounwind readnone alwaysinline { ; call svml_sincosf4 two times with the two 4-wide sub-vectors %a = shufflevector <8 x float> %0, <8 x float> undef, @@ -155,33 +155,33 @@ define void @__svml_sincos(<8 x float>, <8 x float> *, ret void } -define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_tanf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_tanf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_atanf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_atanf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_atan2(<8 x float>, +define <8 x float> @__svml_atan2f(<8 x float>, <8 x float>) nounwind readnone alwaysinline { binary4to8(ret, float, @__svml_atan2f4, %0, %1) ret <8 x float> %ret } -define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_expf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_expf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline { +define <8 x float> @__svml_logf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_logf4, %0) ret <8 x float> %ret } -define <8 x float> @__svml_pow(<8 x float>, +define <8 x float> @__svml_powf(<8 x float>, <8 x float>) nounwind readnone alwaysinline { binary4to8(ret, float, @__svml_powf4, %0, %1) ret <8 x float> %ret diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index e05b865f..686b4f84 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -209,62 +209,11 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone -declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone -declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone -declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone - - -define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_sinf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_cosf4(<4 x float> %0) - ret <4 x float> %ret -} - -define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind readnone alwaysinline { - %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0) - store <4 x float> %s, <4 x float> * %1 - ret void -} - -define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline { - %ret = call <4 x float> @__svml_tanf4(<4 x float> %0) - ret <4 x float> %ret -} - -define <4 x float> @__svml_atan(<4 x float>) 
nounwind readnone alwaysinline {
- %ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
- ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
- %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
- ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
- %ret = call <4 x float> @__svml_expf4(<4 x float> %0)
- ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
- %ret = call <4 x float> @__svml_logf4(<4 x float> %0)
- ret <4 x float> %ret
-}
-
-define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
- %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
- ret <4 x float> %ret
-}
+include(`svml.m4')
+svmlf_declare(4)
+svmlf_define(4)
+svmld_declare(2)
+svmld_stubs(4)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
diff --git a/builtins/util.m4 b/builtins/util.m4
index 95e3844d..6c90c821 100644
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -3160,6 +3160,7 @@ define float @__stdlib_powf(float, float) nounwind readnone alwaysinline {
 }
 
 declare double @sin(double) nounwind readnone
+declare double @asin(double) nounwind readnone
 declare double @cos(double) nounwind readnone
 declare void @sincos(double, double *, double *) nounwind readnone
 declare double @tan(double) nounwind readnone
@@ -3174,6 +3175,11 @@ define double @__stdlib_sin(double) nounwind readnone alwaysinline {
   ret double %r
 }
 
+define double @__stdlib_asin(double) nounwind readnone alwaysinline {
+  %r = call double @asin(double %0)
+  ret double %r
+}
+
 define double @__stdlib_cos(double) nounwind readnone alwaysinline {
   %r = call double @cos(double %0)
   ret double %r
diff --git a/stdlib.ispc b/stdlib.ispc
index e4f8844f..db9d7f36 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -2180,7 +2180,7 @@ static inline uniform float frexp(uniform float x, uniform int * uniform pw2) {
 __declspec(safe)
 static inline float sin(float x_full) {
     if (__math_lib == __math_lib_svml) {
-        return __svml_sin(x_full);
+        return __svml_sinf(x_full);
     }
     else if (__math_lib == __math_lib_system) {
         float ret;
@@ -2313,8 +2313,10 @@ static inline float asin(float x) {
     bool isnan = (x > 1);
     float v;
 
-    if (__math_lib == __math_lib_svml ||
-        __math_lib == __math_lib_system) {
+    if (__math_lib == __math_lib_svml) {
+        return __svml_asinf(x);
+    }
+    else if (__math_lib == __math_lib_system) {
         float ret;
         foreach_active (i) {
             uniform float r = __stdlib_asinf(extract(x, i));
@@ -2417,7 +2419,7 @@ static inline uniform float asin(uniform float x) {
 __declspec(safe)
 static inline float cos(float x_full) {
     if (__math_lib == __math_lib_svml) {
-        return __svml_cos(x_full);
+        return __svml_cosf(x_full);
     }
     else if (__math_lib == __math_lib_system) {
         float ret;
@@ -2545,18 +2547,28 @@ static inline float acos(float v) {
     return 1.57079637050628662109375 - asin(v);
 }
 
+__declspec(safe)
+static inline double acos(const double v) {
+    return 1.57079637050628662109375 - asin(v);
+}
+
 __declspec(safe)
 static inline uniform float acos(uniform float v) {
     return 1.57079637050628662109375 - asin(v);
 }
 
+__declspec(safe)
+static inline uniform double acos(const uniform double v) {
+    return 1.57079637050628662109375 - asin(v);
+}
+
 __declspec(safe)
 static inline void sincos(float x_full, varying float * uniform sin_result,
                           varying float * uniform cos_result) {
     if (__math_lib == __math_lib_svml) {
-
__svml_sincos(x_full, sin_result, cos_result); + __svml_sincosf(x_full, sin_result, cos_result); } else if (__math_lib == __math_lib_system) { foreach_active (i) { @@ -2688,7 +2700,7 @@ static inline void sincos(uniform float x_full, uniform float * uniform sin_resu __declspec(safe) static inline float tan(float x_full) { if (__math_lib == __math_lib_svml) { - return __svml_tan(x_full); + return __svml_tanf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -2839,7 +2851,7 @@ static inline uniform float tan(uniform float x_full) { __declspec(safe) static inline float atan(float x_full) { if (__math_lib == __math_lib_svml) { - return __svml_atan(x_full); + return __svml_atanf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -2934,7 +2946,7 @@ static inline uniform float atan(uniform float x_full) { __declspec(safe) static inline float atan2(float y, float x) { if (__math_lib == __math_lib_svml) { - return __svml_atan2(y, x); + return __svml_atan2f(y, x); } else if (__math_lib == __math_lib_system) { float ret; @@ -2997,7 +3009,7 @@ static inline float exp(float x_full) { return __exp_varying_float(x_full); } else if (__math_lib == __math_lib_svml) { - return __svml_exp(x_full); + return __svml_expf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -3204,7 +3216,7 @@ static inline float log(float x_full) { return __log_varying_float(x_full); } else if (__math_lib == __math_lib_svml) { - return __svml_log(x_full); + return __svml_logf(x_full); } else if (__math_lib == __math_lib_system) { float ret; @@ -3379,7 +3391,7 @@ static inline float pow(float a, float b) { return __pow_varying_float(a, b); } else if (__math_lib == __math_lib_svml) { - return __svml_pow(a, b); + return __svml_powf(a, b); } else if (__math_lib == __math_lib_system) { float ret; @@ -3469,7 +3481,11 @@ static inline uniform double frexp(uniform double x, uniform int * uniform pw2) __declspec(safe) static inline double sin(double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_sind(x); + } + else if (__math_lib == __math_lib_ispc_fast) return sin((float)x); else { double ret; @@ -3490,8 +3506,30 @@ static inline uniform double sin(uniform double x) { } __declspec(safe) -static inline double cos(double x) { - if (__math_lib == __math_lib_ispc_fast) +static inline double asin(const double x) { + if (__math_lib == __math_lib_svml) + { + return __svml_asind(x); + } + else if (__math_lib == __math_lib_ispc_fast) + return asin((float)x); + else { + double ret; + foreach_active (i) { + uniform double r = __stdlib_asin(extract(x, i)); + ret = insert(ret, i, r); + } + return ret; + } +} + +__declspec(safe) +static inline double cos(const double x) { + if (__math_lib == __math_lib_svml) + { + return __svml_cosd(x); + } + else if (__math_lib == __math_lib_ispc_fast) return cos((float)x); else { double ret; @@ -3514,7 +3552,11 @@ static inline uniform double cos(uniform double x) { __declspec(safe) static inline void sincos(double x, varying double * uniform sin_result, varying double * uniform cos_result) { - if (__math_lib == __math_lib_ispc_fast) { + if (__math_lib == __math_lib_svml) + { + __svml_sincosd(x, sin_result, cos_result); + } + else if (__math_lib == __math_lib_ispc_fast) { float sr, cr; sincos((float)x, &sr, &cr); *sin_result = sr; @@ -3545,7 +3587,11 @@ static inline void sincos(uniform double x, uniform double * uniform sin_result, __declspec(safe) static inline double tan(double x) { - if (__math_lib == 
__math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_tand(x); + } + else if (__math_lib == __math_lib_ispc_fast) return tan((float)x); else { double ret; @@ -3589,7 +3635,11 @@ static inline uniform double atan(uniform double x) { __declspec(safe) static inline double atan2(double y, double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_atan2d(y,x); + } + else if (__math_lib == __math_lib_ispc_fast) return atan2((float)y, (float)x); else { double ret; @@ -3611,7 +3661,11 @@ static inline uniform double atan2(uniform double y, uniform double x) { __declspec(safe) static inline double exp(double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_expd(x); + } + else if (__math_lib == __math_lib_ispc_fast) return exp((float)x); else { double ret; @@ -3633,7 +3687,11 @@ static inline uniform double exp(uniform double x) { __declspec(safe) static inline double log(double x) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_logd(x); + } + else if (__math_lib == __math_lib_ispc_fast) return log((float)x); else { double ret; @@ -3655,7 +3713,11 @@ static inline uniform double log(uniform double x) { __declspec(safe) static inline double pow(double a, double b) { - if (__math_lib == __math_lib_ispc_fast) + if (__math_lib == __math_lib_svml) + { + return __svml_powd(a,b); + } + else if (__math_lib == __math_lib_ispc_fast) return pow((float)a, (float)b); else { double ret; From 7a326995735293a25fb44d5f7243521a57df719a Mon Sep 17 00:00:00 2001 From: egaburov Date: Wed, 11 Sep 2013 15:18:03 +0200 Subject: [PATCH 03/14] added svml.m4 --- builtins/svml.m4 | 176 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 builtins/svml.m4 diff --git a/builtins/svml.m4 b/builtins/svml.m4 new file mode 100644 index 00000000..cc3cd979 --- /dev/null +++ b/builtins/svml.m4 @@ -0,0 +1,176 @@ +;; svml + +;; stub +define(`svmlf_stubs',` + declare <$1 x float> @__svml_sinf(<$1 x float>) nounwind readnone alwaysinline + declare <$1 x float> @__svml_asinf(<$1 x float>) nounwind readnone alwaysinline + declare <$1 x float> @__svml_cosf(<$1 x float>) nounwind readnone alwaysinline + declare void @__svml_sincosf(<$1 x float>, <$1 x float> *, <$1 x float> *) nounwind readnone alwaysinline + declare <$1 x float> @__svml_tanf(<$1 x float>) nounwind readnone alwaysinline + declare <$1 x float> @__svml_atanf(<$1 x float>) nounwind readnone alwaysinline + declare <$1 x float> @__svml_atan2f(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline + declare <$1 x float> @__svml_expf(<$1 x float>) nounwind readnone alwaysinline + declare <$1 x float> @__svml_logf(<$1 x float>) nounwind readnone alwaysinline + declare <$1 x float> @__svml_powf(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline +') + +define(`svmld_stubs',` + declare <$1 x double> @__svml_sind(<$1 x double>) nounwind readnone alwaysinline + declare <$1 x double> @__svml_asind(<$1 x double>) nounwind readnone alwaysinline + declare <$1 x double> @__svml_cosd(<$1 x double>) nounwind readnone alwaysinline + declare void @__svml_sincosd(<$1 x double>, <$1 x double> *, <$1 x double> *) nounwind readnone alwaysinline + declare <$1 x double> @__svml_tand(<$1 x double>) nounwind readnone alwaysinline + declare <$1 x double> @__svml_atand(<$1 x double>) nounwind readnone alwaysinline + declare <$1 x double> @__svml_atan2d(<$1 x double>, 
<$1 x double>) nounwind readnone alwaysinline + declare <$1 x double> @__svml_expd(<$1 x double>) nounwind readnone alwaysinline + declare <$1 x double> @__svml_logd(<$1 x double>) nounwind readnone alwaysinline + declare <$1 x double> @__svml_powd(<$1 x double>, <$1 x double>) nounwind readnone alwaysinline +') + +;; single precision +define(`svmlf_declare',` + declare <$1 x float> @__svml_sinf$1(<$1 x float>) nounwind readnone + declare <$1 x float> @__svml_asinf$1(<$1 x float>) nounwind readnone + declare <$1 x float> @__svml_cosf$1(<$1 x float>) nounwind readnone + declare <$1 x float> @__svml_sincosf$1(<$1 x float> *, <$1 x float>) nounwind readnone + declare <$1 x float> @__svml_tanf$1(<$1 x float>) nounwind readnone + declare <$1 x float> @__svml_atanf$1(<$1 x float>) nounwind readnone + declare <$1 x float> @__svml_atan2f$1(<$1 x float>, <$1 x float>) nounwind readnone + declare <$1 x float> @__svml_expf$1(<$1 x float>) nounwind readnone + declare <$1 x float> @__svml_logf$1(<$1 x float>) nounwind readnone + declare <$1 x float> @__svml_powf$1(<$1 x float>, <$1 x float>) nounwind readnone +'); + + + +define(`svmlf_define',` + define <$1 x float> @__svml_sinf(<$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_sinf$1(<$1 x float> %0) + ret <$1 x float> %ret + } + define <$1 x float> @__svml_asinf(<$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_asinf$1(<$1 x float> %0) + ret <$1 x float> %ret + } + + define <$1 x float> @__svml_cosf(<$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_cosf$1(<$1 x float> %0) + ret <$1 x float> %ret + } + + define void @__svml_sincosf(<$1 x float>, <$1 x float> *, <$1 x float> *) nounwind readnone alwaysinline { + %s = call <$1 x float> @__svml_sincosf$1(<$1 x float> * %2, <$1 x float> %0) + store <$1 x float> %s, <$1 x float> * %1 + ret void + } + + define <$1 x float> @__svml_tanf(<$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_tanf$1(<$1 x float> %0) + ret <$1 x float> %ret + } + + define <$1 x float> @__svml_atanf(<$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_atanf$1(<$1 x float> %0) + ret <$1 x float> %ret + } + + define <$1 x float> @__svml_atan2f(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_atan2f$1(<$1 x float> %0, <$1 x float> %1) + ret <$1 x float> %ret + } + + define <$1 x float> @__svml_expf(<$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_expf$1(<$1 x float> %0) + ret <$1 x float> %ret + } + + define <$1 x float> @__svml_logf(<$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_logf$1(<$1 x float> %0) + ret <$1 x float> %ret + } + + define <$1 x float> @__svml_powf(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline { + %ret = call <$1 x float> @__svml_powf$1(<$1 x float> %0, <$1 x float> %1) + ret <$1 x float> %ret + } +') + +;; double precision +define(`svmld_declare',` + declare <$1 x double> @__svml_sin$1(<$1 x double>) nounwind readnone + declare <$1 x double> @__svml_asin$1(<$1 x double>) nounwind readnone + declare <$1 x double> @__svml_cos$1(<$1 x double>) nounwind readnone + declare <$1 x double> @__svml_sincos$1(<$1 x double> *, <$1 x double>) nounwind readnone + declare <$1 x double> @__svml_tan$1(<$1 x double>) nounwind readnone + declare <$1 x double> @__svml_atan$1(<$1 x double>) nounwind readnone + declare <$1 x double> @__svml_atan2$1(<$1 x 
double>, <$1 x double>) nounwind readnone + declare <$1 x double> @__svml_exp$1(<$1 x double>) nounwind readnone + declare <$1 x double> @__svml_log$1(<$1 x double>) nounwind readnone + declare <$1 x double> @__svml_pow$1(<$1 x double>, <$1 x double>) nounwind readnone +') + +define(`svmld_define',` + define <$1 x double> @__svml_sind(<$1 x double>) nounwind readnone alwaysinline { + %ret = call <$1 x double> @__svml_sin$1(<$1 x double> %0) + ret <$1 x double> %ret + } + define <$1 x double> @__svml_asind(<$1 x double>) nounwind readnone alwaysinline { + %ret = call <$1 x double> @__svml_asin$1(<$1 x double> %0) + ret <$1 x double> %ret + } + + + define <$1 x double> @__svml_cosd(<$1 x double>) nounwind readnone alwaysinline { + %ret = call <$1 x double> @__svml_cos$1(<$1 x double> %0) + ret <$1 x double> %ret + } + + define void @__svml_sincosd(<$1 x double>, <$1 x double> *, <$1 x double> *) nounwind readnone alwaysinline { + %s = call <$1 x double> @__svml_sincos$1(<$1 x double> * %2, <$1 x double> %0) + store <$1 x double> %s, <$1 x double> * %1 + ret void + } + + define <$1 x double> @__svml_tand(<$1 x double>) nounwind readnone alwaysinline { + %ret = call <$1 x double> @__svml_tan$1(<$1 x double> %0) + ret <$1 x double> %ret + } + + define <$1 x double> @__svml_atand(<$1 x double>) nounwind readnone alwaysinline { + %ret = call <$1 x double> @__svml_atan$1(<$1 x double> %0) + ret <$1 x double> %ret + } + + define <$1 x double> @__svml_atan2d(<$1 x double>, <$1 x double>) nounwind readnone alwaysinline { + %ret = call <$1 x double> @__svml_atan2$1(<$1 x double> %0, <$1 x double> %1) + ret <$1 x double> %ret + } + + define <$1 x double> @__svml_expd(<$1 x double>) nounwind readnone alwaysinline { + %ret = call <$1 x double> @__svml_exp$1(<$1 x double> %0) + ret <$1 x double> %ret + } + + define <$1 x double> @__svml_logd(<$1 x double>) nounwind readnone alwaysinline { + %ret = call <$1 x double> @__svml_log$1(<$1 x double> %0) + ret <$1 x double> %ret + } + + define <$1 x double> @__svml_powd(<$1 x double>, <$1 x double>) nounwind readnone alwaysinline { + %ret = call <$1 x double> @__svml_pow$1(<$1 x double> %0, <$1 x double> %1) + ret <$1 x double> %ret + } +') + +;; need to implement smvld for 2xvectorWidth ...:w + +define(`svmld2_define',` + define <$1 x double> @__svml_sinxx(<$1 x double>) nounwind readnone alwaysinline { + %v0 = shufflevector <$1 x double> %0, <$1 x double> undef, <4 x i32> + %v1 = shufflevector <$1 x double> %0, <$1 x double> undef, <4 x i32> + %ret0 = call <$2 x double> @__svml_sin$2(<$2 x double> %v0) + %ret1 = call <$2 x double> @__svml_sin$2(<$2 x double> %v1) + %ret = shufflevector <$2 x double> %ret0, <$2 x double> %ret1, <$1 x i32> + ret <$1 x double> %ret + } +') From 9cf8e8cbf3945df122bf0652326be1404634c0cb Mon Sep 17 00:00:00 2001 From: egaburov Date: Wed, 11 Sep 2013 15:23:45 +0200 Subject: [PATCH 04/14] builtins fix for double precision svml and __stdlib_asin --- builtins.cpp | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index 139b8f04..816d4d78 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -576,22 +576,23 @@ lSetInternalFunctions(llvm::Module *module) { "__stdlib_pow", "__stdlib_powf", "__stdlib_sin", + "__stdlib_asin", "__stdlib_sincos", "__stdlib_sincosf", "__stdlib_sinf", "__stdlib_tan", "__stdlib_tanf", - "__svml_sin", - "__svml_asin", - "__svml_cos", - "__svml_acos", - "__svml_sincos", - "__svml_tan", - "__svml_atan", - "__svml_atan2", - "__svml_exp", - "__svml_log", - 
"__svml_pow", + "__svml_sind", + "__svml_asind", + "__svml_cosd", + "__svml_acosd", + "__svml_sincosd", + "__svml_tand", + "__svml_atand", + "__svml_atan2d", + "__svml_expd", + "__svml_logd", + "__svml_powd", "__svml_sinf", "__svml_asinf", "__svml_cosf", From 19379db3b60a60f2f1862a54709115bcf11c7545 Mon Sep 17 00:00:00 2001 From: egaburov Date: Wed, 11 Sep 2013 16:48:56 +0200 Subject: [PATCH 05/14] svml cleanup --- builtins/svml.m4 | 209 +++++++++--------------------- builtins/target-avx-h.ll | 11 +- builtins/target-avx-x2.ll | 9 +- builtins/target-avx.ll | 11 +- builtins/target-generic-common.ll | 4 +- builtins/target-sse2-x2.ll | 8 +- builtins/target-sse2.ll | 12 +- builtins/target-sse4-16.ll | 4 +- builtins/target-sse4-8.ll | 4 +- builtins/target-sse4-x2.ll | 9 +- builtins/target-sse4.ll | 11 +- 11 files changed, 116 insertions(+), 176 deletions(-) diff --git a/builtins/svml.m4 b/builtins/svml.m4 index cc3cd979..9608dea6 100644 --- a/builtins/svml.m4 +++ b/builtins/svml.m4 @@ -1,176 +1,93 @@ ;; svml -;; stub -define(`svmlf_stubs',` - declare <$1 x float> @__svml_sinf(<$1 x float>) nounwind readnone alwaysinline - declare <$1 x float> @__svml_asinf(<$1 x float>) nounwind readnone alwaysinline - declare <$1 x float> @__svml_cosf(<$1 x float>) nounwind readnone alwaysinline - declare void @__svml_sincosf(<$1 x float>, <$1 x float> *, <$1 x float> *) nounwind readnone alwaysinline - declare <$1 x float> @__svml_tanf(<$1 x float>) nounwind readnone alwaysinline - declare <$1 x float> @__svml_atanf(<$1 x float>) nounwind readnone alwaysinline - declare <$1 x float> @__svml_atan2f(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline - declare <$1 x float> @__svml_expf(<$1 x float>) nounwind readnone alwaysinline - declare <$1 x float> @__svml_logf(<$1 x float>) nounwind readnone alwaysinline - declare <$1 x float> @__svml_powf(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline +;; stubs +define(`svml_stubs',` + declare <$2 x $1> @__svml_sin$3(<$2 x $1>) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_asin$3(<$2 x $1>) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_cos$3(<$2 x $1>) nounwind readnone alwaysinline + declare void @__svml_sincos$3(<$2 x $1>, <$2 x $1> *, <$2 x $1> *) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_tan$3(<$2 x $1>) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_atan$3(<$2 x $1>) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_atan2$3(<$2 x $1>, <$2 x $1>) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_exp$3(<$2 x $1>) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_log$3(<$2 x $1>) nounwind readnone alwaysinline + declare <$2 x $1> @__svml_pow$3(<$2 x $1>, <$2 x $1>) nounwind readnone alwaysinline ') -define(`svmld_stubs',` - declare <$1 x double> @__svml_sind(<$1 x double>) nounwind readnone alwaysinline - declare <$1 x double> @__svml_asind(<$1 x double>) nounwind readnone alwaysinline - declare <$1 x double> @__svml_cosd(<$1 x double>) nounwind readnone alwaysinline - declare void @__svml_sincosd(<$1 x double>, <$1 x double> *, <$1 x double> *) nounwind readnone alwaysinline - declare <$1 x double> @__svml_tand(<$1 x double>) nounwind readnone alwaysinline - declare <$1 x double> @__svml_atand(<$1 x double>) nounwind readnone alwaysinline - declare <$1 x double> @__svml_atan2d(<$1 x double>, <$1 x double>) nounwind readnone alwaysinline - declare <$1 x double> @__svml_expd(<$1 x double>) nounwind readnone alwaysinline - declare <$1 x double> @__svml_logd(<$1 x double>) 
nounwind readnone alwaysinline - declare <$1 x double> @__svml_powd(<$1 x double>, <$1 x double>) nounwind readnone alwaysinline -') - -;; single precision -define(`svmlf_declare',` - declare <$1 x float> @__svml_sinf$1(<$1 x float>) nounwind readnone - declare <$1 x float> @__svml_asinf$1(<$1 x float>) nounwind readnone - declare <$1 x float> @__svml_cosf$1(<$1 x float>) nounwind readnone - declare <$1 x float> @__svml_sincosf$1(<$1 x float> *, <$1 x float>) nounwind readnone - declare <$1 x float> @__svml_tanf$1(<$1 x float>) nounwind readnone - declare <$1 x float> @__svml_atanf$1(<$1 x float>) nounwind readnone - declare <$1 x float> @__svml_atan2f$1(<$1 x float>, <$1 x float>) nounwind readnone - declare <$1 x float> @__svml_expf$1(<$1 x float>) nounwind readnone - declare <$1 x float> @__svml_logf$1(<$1 x float>) nounwind readnone - declare <$1 x float> @__svml_powf$1(<$1 x float>, <$1 x float>) nounwind readnone +;; decalre __svml calls +define(`svml_declare',` + declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_sincos$2(<$3 x $1> *, <$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_exp$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone + declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone '); - - -define(`svmlf_define',` - define <$1 x float> @__svml_sinf(<$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_sinf$1(<$1 x float> %0) - ret <$1 x float> %ret +;; define native __svml calls +define(`svml_define',` + define <$3 x $1> @__svml_sin$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_sin$2(<$3 x $1> %0) + ret <$3 x $1> %ret } - define <$1 x float> @__svml_asinf(<$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_asinf$1(<$1 x float> %0) - ret <$1 x float> %ret + define <$3 x $1> @__svml_asin$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_asin$2(<$3 x $1> %0) + ret <$3 x $1> %ret } - define <$1 x float> @__svml_cosf(<$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_cosf$1(<$1 x float> %0) - ret <$1 x float> %ret + define <$3 x $1> @__svml_cos$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_cos$2(<$3 x $1> %0) + ret <$3 x $1> %ret } - define void @__svml_sincosf(<$1 x float>, <$1 x float> *, <$1 x float> *) nounwind readnone alwaysinline { - %s = call <$1 x float> @__svml_sincosf$1(<$1 x float> * %2, <$1 x float> %0) - store <$1 x float> %s, <$1 x float> * %1 + define void @__svml_sincos$4(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind readnone alwaysinline { + %s = call <$3 x $1> @__svml_sincos$2(<$3 x $1> * %2, <$3 x $1> %0) + store <$3 x $1> %s, <$3 x $1> * %1 ret void } - define <$1 x float> @__svml_tanf(<$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_tanf$1(<$1 x float> %0) - ret <$1 x float> %ret + define <$3 x $1> @__svml_tan$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_tan$2(<$3 x $1> %0) + ret <$3 x $1> %ret } - define <$1 x float> @__svml_atanf(<$1 x float>) nounwind readnone alwaysinline { - %ret 
= call <$1 x float> @__svml_atanf$1(<$1 x float> %0) - ret <$1 x float> %ret + define <$3 x $1> @__svml_atan$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_atan$2(<$3 x $1> %0) + ret <$3 x $1> %ret } - define <$1 x float> @__svml_atan2f(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_atan2f$1(<$1 x float> %0, <$1 x float> %1) - ret <$1 x float> %ret + define <$3 x $1> @__svml_atan2$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_atan2$2(<$3 x $1> %0, <$3 x $1> %1) + ret <$3 x $1> %ret } - define <$1 x float> @__svml_expf(<$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_expf$1(<$1 x float> %0) - ret <$1 x float> %ret + define <$3 x $1> @__svml_exp$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_exp$2(<$3 x $1> %0) + ret <$3 x $1> %ret } - define <$1 x float> @__svml_logf(<$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_logf$1(<$1 x float> %0) - ret <$1 x float> %ret + define <$3 x $1> @__svml_log$4(<$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_log$2(<$3 x $1> %0) + ret <$3 x $1> %ret } - define <$1 x float> @__svml_powf(<$1 x float>, <$1 x float>) nounwind readnone alwaysinline { - %ret = call <$1 x float> @__svml_powf$1(<$1 x float> %0, <$1 x float> %1) - ret <$1 x float> %ret + define <$3 x $1> @__svml_pow$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline { + %ret = call <$3 x $1> @__svml_pow$2(<$3 x $1> %0, <$3 x $1> %1) + ret <$3 x $1> %ret } ') -;; double precision -define(`svmld_declare',` - declare <$1 x double> @__svml_sin$1(<$1 x double>) nounwind readnone - declare <$1 x double> @__svml_asin$1(<$1 x double>) nounwind readnone - declare <$1 x double> @__svml_cos$1(<$1 x double>) nounwind readnone - declare <$1 x double> @__svml_sincos$1(<$1 x double> *, <$1 x double>) nounwind readnone - declare <$1 x double> @__svml_tan$1(<$1 x double>) nounwind readnone - declare <$1 x double> @__svml_atan$1(<$1 x double>) nounwind readnone - declare <$1 x double> @__svml_atan2$1(<$1 x double>, <$1 x double>) nounwind readnone - declare <$1 x double> @__svml_exp$1(<$1 x double>) nounwind readnone - declare <$1 x double> @__svml_log$1(<$1 x double>) nounwind readnone - declare <$1 x double> @__svml_pow$1(<$1 x double>, <$1 x double>) nounwind readnone + +;; define x2 __svml calls +define(`svml_define_x2',` + svml_stubs($1,$3,$4) ') -define(`svmld_define',` - define <$1 x double> @__svml_sind(<$1 x double>) nounwind readnone alwaysinline { - %ret = call <$1 x double> @__svml_sin$1(<$1 x double> %0) - ret <$1 x double> %ret - } - define <$1 x double> @__svml_asind(<$1 x double>) nounwind readnone alwaysinline { - %ret = call <$1 x double> @__svml_asin$1(<$1 x double> %0) - ret <$1 x double> %ret - } - - - define <$1 x double> @__svml_cosd(<$1 x double>) nounwind readnone alwaysinline { - %ret = call <$1 x double> @__svml_cos$1(<$1 x double> %0) - ret <$1 x double> %ret - } - - define void @__svml_sincosd(<$1 x double>, <$1 x double> *, <$1 x double> *) nounwind readnone alwaysinline { - %s = call <$1 x double> @__svml_sincos$1(<$1 x double> * %2, <$1 x double> %0) - store <$1 x double> %s, <$1 x double> * %1 - ret void - } - - define <$1 x double> @__svml_tand(<$1 x double>) nounwind readnone alwaysinline { - %ret = call <$1 x double> @__svml_tan$1(<$1 x double> %0) - ret <$1 x double> %ret - } - - define <$1 x double> @__svml_atand(<$1 x double>) 
nounwind readnone alwaysinline { - %ret = call <$1 x double> @__svml_atan$1(<$1 x double> %0) - ret <$1 x double> %ret - } - - define <$1 x double> @__svml_atan2d(<$1 x double>, <$1 x double>) nounwind readnone alwaysinline { - %ret = call <$1 x double> @__svml_atan2$1(<$1 x double> %0, <$1 x double> %1) - ret <$1 x double> %ret - } - - define <$1 x double> @__svml_expd(<$1 x double>) nounwind readnone alwaysinline { - %ret = call <$1 x double> @__svml_exp$1(<$1 x double> %0) - ret <$1 x double> %ret - } - - define <$1 x double> @__svml_logd(<$1 x double>) nounwind readnone alwaysinline { - %ret = call <$1 x double> @__svml_log$1(<$1 x double> %0) - ret <$1 x double> %ret - } - - define <$1 x double> @__svml_powd(<$1 x double>, <$1 x double>) nounwind readnone alwaysinline { - %ret = call <$1 x double> @__svml_pow$1(<$1 x double> %0, <$1 x double> %1) - ret <$1 x double> %ret - } -') - -;; need to implement smvld for 2xvectorWidth ...:w - -define(`svmld2_define',` - define <$1 x double> @__svml_sinxx(<$1 x double>) nounwind readnone alwaysinline { - %v0 = shufflevector <$1 x double> %0, <$1 x double> undef, <4 x i32> - %v1 = shufflevector <$1 x double> %0, <$1 x double> undef, <4 x i32> - %ret0 = call <$2 x double> @__svml_sin$2(<$2 x double> %v0) - %ret1 = call <$2 x double> @__svml_sin$2(<$2 x double> %v1) - %ret = shufflevector <$2 x double> %ret0, <$2 x double> %ret1, <$1 x i32> - ret <$1 x double> %ret - } +;; define x4 __svml calls +define(`svml_define_x4',` + svml_stubs($1,$3,$4) ') diff --git a/builtins/target-avx-h.ll b/builtins/target-avx-h.ll index a06e5ab3..283eaddd 100644 --- a/builtins/target-avx-h.ll +++ b/builtins/target-avx-h.ll @@ -155,10 +155,13 @@ define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline { ;; svml include(`svml.m4') -svmlf_declare(4) -svmlf_define(4) -svmld_declare(4) -svmld_define(4) +;; single precision +svml_declare(float,f4,4) +svml_define(float,f4,4,f) + +;; double precision +svml_declare(double,4,4) +svml_define(double,4,4,d) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index d646720e..f3f1590a 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -138,8 +138,13 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always ;; svml include(`svml.m4') -svmlf_stubs(16) -svmld_stubs(16) +;; single precision +svml_declare(float,f8,8) +svml_define_x2(float,f8,8,f,16) + +;; double precision +svml_declare(double,4,4) +svml_define_x2(double,4,4,d,16) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index 1d33e3f9..7e7ab330 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -138,10 +138,13 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ;; svml include(`svml.m4') -svmlf_declare(8) -svmlf_define(8) -svmld_declare(4) -svmld_stubs(8) +;; single precision +svml_declare(float,f8,8) +svml_define(float,f8,8,f) + +;; double precision +svml_declare(double,4,4) +svml_define_x2(double,4,4,d,8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index bc7db9ec..30a8b030 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -209,8 +209,8 @@ declare i64 @__count_leading_zeros_i64(i64) 
nounwind readnone ;; svml include(`svml.m4') -svmlf_stubs(WIDTH) -svmld_stubs(WIDTH) +svml_stubs(float, WIDTH, f) +svml_stubs(double, WIDTH, d) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; reductions diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 5688ebba..9fa607a4 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -106,10 +106,12 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ; svml stuff include(`svml.m4') -svmlf_declare(4) -svmld_declare(2) -svmld_stubs(8) +;; single precision +svml_declare(float,f4,4) +;; double precision +svml_declare(double,2,2) +svml_define_x4(double,2,2,d,8) define <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline { unary4to8(ret, float, @__svml_sinf4, %0) diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index 236cda33..c858ccb6 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -497,10 +497,14 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin ; svml stuff include(`svml.m4') -svmlf_declare(4) -svmld_declare(2) -svmlf_define(4) -svmld_stubs(4) +;; single precision +svml_declare(float,f4,4) +svml_define(float,f4,4,f) + +;; double precision +svml_declare(double,2,2) +svml_define_x2(double,2,2,d,4) + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll index 3fbbe534..3f8cd339 100644 --- a/builtins/target-sse4-16.ll +++ b/builtins/target-sse4-16.ll @@ -210,8 +210,8 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r ; FIXME include(`svml.m4') -svmlf_stubs(8) -svmld_stubs(8) +svml_stubs(float,8,f) +svml_stubs(double,8,d) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll index e65077b7..f43cd940 100644 --- a/builtins/target-sse4-8.ll +++ b/builtins/target-sse4-8.ll @@ -223,8 +223,8 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin ; FIXME include(`svml.m4') -svmlf_stubs(16) -svmld_stubs(16) +svml_stubs(float,16,f) +svml_stubs(double,16,d) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index 2a69b60a..c45966e3 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -106,9 +106,12 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin ; svml stuff include(`svml.m4') -svmlf_declare(4) -svmld_declare(2) -svmld_stubs(8) +;; single precision +svml_declare(float,f4,4) + +;; double precision +svml_declare(double,2,2) +svml_define_x4(double,2,2,d,8) define <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline { diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index 686b4f84..eb82ab9a 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -210,10 +210,13 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r ; svml stuff include(`svml.m4') -svmlf_declare(4) -svmlf_define(4) -svmld_declare(2) -svmld_stubs(8) +;; single precision +svml_declare(float,f4,4) +svml_define(float,f4,4,f) + +;; double precision +svml_declare(double,2,2) +svml_define_x2(double,2,2,d,4) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; 
horizontal ops / reductions From efc20c211061585150abb02b4720316f0e45dad5 Mon Sep 17 00:00:00 2001 From: egaburov Date: Wed, 11 Sep 2013 17:07:54 +0200 Subject: [PATCH 06/14] added svml support to all sse/avx modes --- builtins/svml.m4 | 44 ++++++++++++++++++--- builtins/target-avx-x2.ll | 4 +- builtins/target-avx.ll | 2 +- builtins/target-sse2-x2.ll | 79 +------------------------------------ builtins/target-sse2.ll | 2 +- builtins/target-sse4-x2.ll | 80 +------------------------------------- builtins/target-sse4.ll | 2 +- 7 files changed, 47 insertions(+), 166 deletions(-) diff --git a/builtins/svml.m4 b/builtins/svml.m4 index 9608dea6..71a6a709 100644 --- a/builtins/svml.m4 +++ b/builtins/svml.m4 @@ -83,11 +83,43 @@ define(`svml_define',` ;; define x2 __svml calls -define(`svml_define_x2',` - svml_stubs($1,$3,$4) +define(`svml_define_x',` + define <$5 x $1> @__svml_sin$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_sin$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_asin$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_asin$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_cos$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_cos$2, %0) + ret <$5 x $1> %ret + } + declare void @__svml_sincos$4(<$5 x $1>,<$5 x $1>*,<$5 x $1>*) nounwind readnone alwaysinline + define <$5 x $1> @__svml_tan$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_tan$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_atan$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_atan$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_atan2$4(<$5 x $1>,<$5 x $1>) nounwind readnone alwaysinline { + binary$3to$5(ret, $1, @__svml_atan2$2, %0, %1) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_exp$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_exp$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_log$4(<$5 x $1>) nounwind readnone alwaysinline { + unary$3to$5(ret, $1, @__svml_log$2, %0) + ret <$5 x $1> %ret + } + define <$5 x $1> @__svml_pow$4(<$5 x $1>,<$5 x $1>) nounwind readnone alwaysinline { + binary$3to$5(ret, $1, @__svml_pow$2, %0, %1) + ret <$5 x $1> %ret + } ') -;; define x4 __svml calls -define(`svml_define_x4',` - svml_stubs($1,$3,$4) -') diff --git a/builtins/target-avx-x2.ll b/builtins/target-avx-x2.ll index f3f1590a..f8fd5cd5 100644 --- a/builtins/target-avx-x2.ll +++ b/builtins/target-avx-x2.ll @@ -140,11 +140,11 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always include(`svml.m4') ;; single precision svml_declare(float,f8,8) -svml_define_x2(float,f8,8,f,16) +svml_define_x(float,f8,8,f,16) ;; double precision svml_declare(double,4,4) -svml_define_x2(double,4,4,d,16) +svml_define_x(double,4,4,d,16) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-avx.ll b/builtins/target-avx.ll index 7e7ab330..196e5ea4 100644 --- a/builtins/target-avx.ll +++ b/builtins/target-avx.ll @@ -144,7 +144,7 @@ svml_define(float,f8,8,f) ;; double precision svml_declare(double,4,4) -svml_define_x2(double,4,4,d,8) +svml_define_x(double,4,4,d,8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 9fa607a4..77bf1a9d 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -108,86 +108,11 @@ 
define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin include(`svml.m4') ;; single precision svml_declare(float,f4,4) +svml_define_x(float,f4,4,f,8) ;; double precision svml_declare(double,2,2) -svml_define_x4(double,2,2,d,8) - -define <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_sinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_asinf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_asinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_cosf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_cosf4, %0) - ret <8 x float> %ret -} - -define void @__svml_sincosf(<8 x float>, <8 x float> *, - <8 x float> *) nounwind readnone alwaysinline { - ; call svml_sincosf4 two times with the two 4-wide sub-vectors - %a = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - %b = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - - %cospa = alloca <4 x float> - %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) - - %cospb = alloca <4 x float> - %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) - - %sin = shufflevector <4 x float> %sa, <4 x float> %sb, - <8 x i32> - store <8 x float> %sin, <8 x float> * %1 - - %cosa = load <4 x float> * %cospa - %cosb = load <4 x float> * %cospb - %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, - <8 x i32> - store <8 x float> %cos, <8 x float> * %2 - - ret void -} - -define <8 x float> @__svml_tanf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_tanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atanf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_atanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan2f(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_atan2f4, %0, %1) - ret <8 x float> %ret -} - -define <8 x float> @__svml_expf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_expf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_logf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_logf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_powf(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_powf4, %0, %1) - ret <8 x float> %ret -} +svml_define_x(double,2,2,d,8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index c858ccb6..e42d4990 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -503,7 +503,7 @@ svml_define(float,f4,4,f) ;; double precision svml_declare(double,2,2) -svml_define_x2(double,2,2,d,4) +svml_define_x(double,2,2,d,4) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index c45966e3..842db53f 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -108,87 +108,11 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin include(`svml.m4') ;; single precision svml_declare(float,f4,4) +svml_define_x(float,f4,4,f,8) ;; double precision svml_declare(double,2,2) -svml_define_x4(double,2,2,d,8) - - -define <8 x float> @__svml_sinf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, 
@__svml_sinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_asinf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_asinf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_cosf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_cosf4, %0) - ret <8 x float> %ret -} - -define void @__svml_sincosf(<8 x float>, <8 x float> *, - <8 x float> *) nounwind readnone alwaysinline { - ; call svml_sincosf4 two times with the two 4-wide sub-vectors - %a = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - %b = shufflevector <8 x float> %0, <8 x float> undef, - <4 x i32> - - %cospa = alloca <4 x float> - %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a) - - %cospb = alloca <4 x float> - %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b) - - %sin = shufflevector <4 x float> %sa, <4 x float> %sb, - <8 x i32> - store <8 x float> %sin, <8 x float> * %1 - - %cosa = load <4 x float> * %cospa - %cosb = load <4 x float> * %cospb - %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb, - <8 x i32> - store <8 x float> %cos, <8 x float> * %2 - - ret void -} - -define <8 x float> @__svml_tanf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_tanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atanf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_atanf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_atan2f(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_atan2f4, %0, %1) - ret <8 x float> %ret -} - -define <8 x float> @__svml_expf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_expf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_logf(<8 x float>) nounwind readnone alwaysinline { - unary4to8(ret, float, @__svml_logf4, %0) - ret <8 x float> %ret -} - -define <8 x float> @__svml_powf(<8 x float>, - <8 x float>) nounwind readnone alwaysinline { - binary4to8(ret, float, @__svml_powf4, %0, %1) - ret <8 x float> %ret -} +svml_define_x(double,2,2,d,8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index eb82ab9a..88be6c59 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -216,7 +216,7 @@ svml_define(float,f4,4,f) ;; double precision svml_declare(double,2,2) -svml_define_x2(double,2,2,d,4) +svml_define_x(double,2,2,d,4) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; horizontal ops / reductions From 7364e06387e7cc02f1a144097754e03181602208 Mon Sep 17 00:00:00 2001 From: egaburov Date: Thu, 12 Sep 2013 12:02:42 +0200 Subject: [PATCH 07/14] added mask64 --- Makefile | 17 ++- builtins.cpp | 10 +- .../{target-avxh.ll => target-avx-i64x4.ll} | 2 +- ...arget-avx-h.ll => target-avx-i64x4base.ll} | 137 +++++++----------- builtins/util.m4 | 76 +++++++++- ispc.cpp | 5 +- llvmutil.cpp | 22 ++- parse.yy | 3 + stdlib.ispc | 3 + 9 files changed, 175 insertions(+), 100 deletions(-) rename builtins/{target-avxh.ll => target-avx-i64x4.ll} (98%) rename builtins/{target-avx-h.ll => target-avx-i64x4base.ll} (78%) diff --git a/Makefile b/Makefile index 43f41e09..92debe4f 100644 --- a/Makefile +++ b/Makefile @@ -141,7 +141,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h 
llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -TARGETS=avxh avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ +TARGETS=avx-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \ generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 ifneq ($(ARM_ENABLED), 0) @@ -160,7 +160,7 @@ BISON_SRC=parse.yy FLEX_SRC=lex.ll OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \ - stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \ + stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o stdlib_mask64_ispc.o \ $(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o)) default: ispc @@ -268,20 +268,25 @@ objs/builtins-c-64.cpp: builtins/builtins.c objs/stdlib_mask1_ispc.cpp: stdlib.ispc @echo Creating C++ source from $< for mask1 - @$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + @$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ python stdlib2cpp.py mask1 > $@ objs/stdlib_mask8_ispc.cpp: stdlib.ispc @echo Creating C++ source from $< for mask8 - @$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + @$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ python stdlib2cpp.py mask8 > $@ objs/stdlib_mask16_ispc.cpp: stdlib.ispc @echo Creating C++ source from $< for mask16 - @$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + @$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ python stdlib2cpp.py mask16 > $@ objs/stdlib_mask32_ispc.cpp: stdlib.ispc @echo Creating C++ source from $< for mask32 - @$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926536 $< -o - | \ + @$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ python stdlib2cpp.py mask32 > $@ + +objs/stdlib_mask64_ispc.cpp: stdlib.ispc + @echo Creating C++ source from $< for mask64 + @$(CLANG) -E -x c -DISPC_MASK_BITS=64 -DISPC=1 -DPI=3.14159265358979 $< -o - | \ + python stdlib2cpp.py mask64 > $@ diff --git a/builtins.cpp b/builtins.cpp index 816d4d78..f8d4136e 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -302,6 +302,7 @@ lCheckModuleIntrinsics(llvm::Module *module) { // check the llvm.x86.* intrinsics for now... if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) { llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID(); + if (id == 0) fprintf(stderr, "FATAL: intrinsic is not found: %s \n", funcName.c_str()); Assert(id != 0); llvm::Type *intrinsicType = llvm::Intrinsic::getType(*g->ctx, id); @@ -936,10 +937,10 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod switch (g->target->getVectorWidth()) { case 4: if (runtime32) { - EXPORT_MODULE(builtins_bitcode_avxh_32bit); + EXPORT_MODULE(builtins_bitcode_avx_i64x4_32bit); } else { - EXPORT_MODULE(builtins_bitcode_avxh_64bit); + EXPORT_MODULE(builtins_bitcode_avx_i64x4_64bit); } break; case 8: @@ -1105,7 +1106,7 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod // serialized version of the stdlib.ispc file to get its // definitions added. 
extern char stdlib_mask1_code[], stdlib_mask8_code[]; - extern char stdlib_mask16_code[], stdlib_mask32_code[]; + extern char stdlib_mask16_code[], stdlib_mask32_code[], stdlib_mask64_code[]; if (g->target->getISA() == Target::GENERIC && g->target->getVectorWidth() == 1) { // 1 wide uses 32 stdlib yy_scan_string(stdlib_mask32_code); @@ -1124,6 +1125,9 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod case 32: yy_scan_string(stdlib_mask32_code); break; + case 64: + yy_scan_string(stdlib_mask64_code); + break; default: FATAL("Unhandled mask bit size for stdlib.ispc"); } diff --git a/builtins/target-avxh.ll b/builtins/target-avx-i64x4.ll similarity index 98% rename from builtins/target-avxh.ll rename to builtins/target-avx-i64x4.ll index 98c9111d..d7dbb6bd 100644 --- a/builtins/target-avxh.ll +++ b/builtins/target-avx-i64x4.ll @@ -29,7 +29,7 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -include(`target-avx-h.ll') +include(`target-avx-i64x4base.ll') rdrand_decls() diff --git a/builtins/target-avx-h.ll b/builtins/target-avx-i64x4base.ll similarity index 78% rename from builtins/target-avx-h.ll rename to builtins/target-avx-i64x4base.ll index 283eaddd..05bf178d 100644 --- a/builtins/target-avx-h.ll +++ b/builtins/target-avx-i64x4base.ll @@ -33,7 +33,7 @@ ;; Basic 4-wide definitions define(`WIDTH',`4') -define(`MASK',`i32') +define(`MASK',`i64') include(`util.m4') stdlib_core() @@ -185,32 +185,32 @@ define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind reado ; horizontal ops ;; sse intrinsic -declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone +declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone -define i64 @__movmsk(<4 x i32>) nounwind readnone alwaysinline { - %floatmask = bitcast <4 x i32> %0 to <4 x float> - %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone +define i64 @__movmsk(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone %v64 = zext i32 %v to i64 ret i64 %v64 } -define i1 @__any(<4 x i32>) nounwind readnone alwaysinline { - %floatmask = bitcast <4 x i32> %0 to <4 x float> - %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone +define i1 @__any(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone %cmp = icmp ne i32 %v, 0 ret i1 %cmp } -define i1 @__all(<4 x i32>) nounwind readnone alwaysinline { - %floatmask = bitcast <4 x i32> %0 to <4 x float> - %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone +define i1 @__all(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone %cmp = icmp eq i32 %v, 15 ret i1 %cmp } -define i1 @__none(<4 x i32>) nounwind readnone alwaysinline { - %floatmask = bitcast <4 x i32> %0 to <4 x float> - %v = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %floatmask) nounwind readnone +define i1 @__none(<4 x i64>) nounwind readnone alwaysinline { + %floatmask = bitcast <4 x i64> %0 to <4 x double> + %v = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %floatmask) nounwind readnone %cmp = icmp eq i32 %v, 0 ret i1 %cmp } @@ 
-392,7 +392,8 @@ masked_load(i16, 2) declare <4 x float> @llvm.x86.avx.maskload.ps(i8 *, <4 x float> %mask) declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask) -define <4 x i32> @__masked_load_i32(i8 *, <4 x i32> %mask) nounwind alwaysinline { +define <4 x i32> @__masked_load_i32(i8 *, <4 x i64> %mask64) nounwind alwaysinline { + %mask = trunc <4 x i64> %mask64 to <4 x i32> %floatmask = bitcast <4 x i32> %mask to <4 x float> %floatval = call <4 x float> @llvm.x86.avx.maskload.ps(i8 * %0, <4 x float> %floatmask) %retval = bitcast <4 x float> %floatval to <4 x i32> @@ -400,18 +401,11 @@ define <4 x i32> @__masked_load_i32(i8 *, <4 x i32> %mask) nounwind alwaysinline } -define <4 x i64> @__masked_load_i64(i8 *, <4 x i32> %mask) nounwind alwaysinline { - ; double up masks, bitcast to doubles - %mask0 = shufflevector <4 x i32> %mask, <4 x i32> undef, - <8 x i32> - %mask0d = bitcast <8 x i32> %mask0 to <4 x double> - - %val0d = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %mask0d) - - %vald = shufflevector <4 x double> %val0d, <4 x double> undef, - <4 x i32> - %val = bitcast <4 x double> %vald to <4 x i64> - ret <4 x i64> %val +define <4 x i64> @__masked_load_i64(i8 *, <4 x i64> %mask) nounwind alwaysinline { + %doublemask = bitcast <4 x i64> %mask to <4 x double> + %doubleval = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %doublemask) + %retval = bitcast <4 x double> %doubleval to <4 x i64> + ret <4 x i64> %retval } masked_load_float_double() @@ -428,83 +422,62 @@ declare void @llvm.x86.avx.maskstore.ps (i8 *, <4 x float>, <4 x float>) declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>) define void @__masked_store_i32(<4 x i32>* nocapture, <4 x i32>, - <4 x i32>) nounwind alwaysinline { - %ptr = bitcast <4 x i32> * %0 to i8 * - %val = bitcast <4 x i32> %1 to <4 x float> - %mask = bitcast <4 x i32> %2 to <4 x float> + <4 x i64>) nounwind alwaysinline { + %mask32 = trunc <4 x i64> %2 to <4 x i32> + + %ptr = bitcast <4 x i32> * %0 to i8 * + %val = bitcast <4 x i32> %1 to <4 x float> + %mask = bitcast <4 x i32> %mask32 to <4 x float> call void @llvm.x86.avx.maskstore.ps(i8 * %ptr, <4 x float> %mask, <4 x float> %val) ret void } define void @__masked_store_i64(<4 x i64>* nocapture, <4 x i64>, - <4 x i32> %mask) nounwind alwaysinline { - %ptr = bitcast <4 x i64> * %0 to i8 * - %val = bitcast <4 x i64> %1 to <4 x double> - - %mask0 = shufflevector <4 x i32> %mask, <4 x i32> undef, - <8 x i32> - - %mask0d = bitcast <8 x i32> %mask0 to <4 x double> - - %val0 = shufflevector <4 x double> %val, <4 x double> undef, - <4 x i32> - - call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask0d, <4 x double> %val0) + <4 x i64>) nounwind alwaysinline { + %ptr = bitcast <4 x i64> * %0 to i8 * + %val = bitcast <4 x i64> %1 to <4 x double> + %mask = bitcast <4 x i64> %2 to <4 x double> + call void @llvm.x86.avx.maskstore.pd.256(i8 * %ptr, <4 x double> %mask, <4 x double> %val) ret void } -masked_store_blend_8_16_by_4() +masked_store_blend_8_16_by_4_mask64() ;; sse intrinsic -declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, +declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone - define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>, - <4 x i32> %mask) nounwind alwaysinline { + <4 x i64>) nounwind alwaysinline { + %mask = trunc <4 x i64> %2 to <4 x i32> %mask_as_float = bitcast <4 x i32> %mask to <4 x float> - %oldValue 
= load <4 x i32>* %0, align 4 - %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float> - %newAsFloat = bitcast <4 x i32> %1 to <4 x float> - %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat, - <4 x float> %newAsFloat, - <4 x float> %mask_as_float) + %oldValue = load <4 x i32>* %0, align 4 + %oldAsFloat = bitcast <4 x i32> %oldValue to <4 x float> + %newAsFloat = bitcast <4 x i32> %1 to <4 x float> + %blend = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %oldAsFloat, + <4 x float> %newAsFloat, + <4 x float> %mask_as_float) %blendAsInt = bitcast <4 x float> %blend to <4 x i32> store <4 x i32> %blendAsInt, <4 x i32>* %0, align 4 ret void } ;; avx intrinsic -declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, - <8 x float>) nounwind readnone +declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, + <4 x double>) nounwind readnone -define void @__masked_store_blend_i64(<4 x i64>* nocapture %ptr, <4 x i64> %new, - <4 x i32> %i32mask) nounwind alwaysinline { - %oldValue = load <4 x i64>* %ptr, align 8 - %mask = bitcast <4 x i32> %i32mask to <4 x float> - - ; Do 4x64-bit blends by doing two <8 x i32> blends, where the <8 x i32> values - ; are actually bitcast <4 x i64> values - ; - ; set up the first four 64-bit values - %old01 = bitcast <4 x i64> %oldValue to <4 x i64> - %old01f = bitcast <4 x i64> %old01 to <8 x float> - %new01 = bitcast <4 x i64> %new to <4 x i64> - %new01f = bitcast <4 x i64> %new01 to <8 x float> - ; compute mask--note that the indices are all doubled-up - %mask01 = shufflevector <4 x float> %mask, <4 x float> undef, - <8 x i32> - ; and blend them - %result01f = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f, - <8 x float> %new01f, - <8 x float> %mask01) - %result01 = bitcast <8 x float> %result01f to <4 x i64> - - - %final = bitcast <4 x i64> %result01 to <4 x i64> - store <4 x i64> %final, <4 x i64> * %ptr, align 8 +define void @__masked_store_blend_i64(<4 x i64>* nocapture , <4 x i64>, + <4 x i64>) nounwind alwaysinline { + %mask_as_double = bitcast <4 x i64> %2 to <4 x double> + %oldValue = load <4 x i64>* %0, align 4 + %oldAsDouble = bitcast <4 x i64> %oldValue to <4 x double> + %newAsDouble = bitcast <4 x i64> %1 to <4 x double> + %blend = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %oldAsDouble, + <4 x double> %newAsDouble, + <4 x double> %mask_as_double) + %blendAsInt = bitcast <4 x double> %blend to <4 x i64> + store <4 x i64> %blendAsInt, <4 x i64>* %0, align 4 ret void } diff --git a/builtins/util.m4 b/builtins/util.m4 index 6c90c821..68fa818b 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -947,6 +947,22 @@ define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) { %r = sext <$1 x i32> %0 to <$1 x i64> ret <$1 x i64> %r } + +define internal <$1 x i8> @convertmask_i64_i8_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i8> + ret <$1 x i8> %r +} +define internal <$1 x i16> @convertmask_i64_i16_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i16> + ret <$1 x i16> %r +} +define internal <$1 x i32> @convertmask_i64_i32_$1(<$1 x i64>) { + %r = trunc <$1 x i64> %0 to <$1 x i32> + ret <$1 x i32> %r +} +define internal <$1 x i64> @convertmask_i64_i64_$1(<$1 x i64>) { + ret <$1 x i64> %0 +} ') mask_converts(WIDTH) @@ -2689,9 +2705,13 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline { } define @__sext_varying_bool() nounwind readnone alwaysinline { - ifelse(MASK,i32, `ret %0', - `%se = sext %0 to - ret %se') +;; 
ifelse(MASK,i32, `ret %0', +;; `%se = sext %0 to +;; ret %se') + ifelse(MASK,i32, `%se = bitcast %0 to ', + MASK,i64, `%se = trunc %0 to ', + `%se = sext %0 to ') + ret %se } @@ -3508,6 +3528,56 @@ define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>, } ') +define(`masked_store_blend_8_16_by_4_mask64', ` +define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>, + <4 x i64>) nounwind alwaysinline { + %old = load <4 x i8> * %0, align 1 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old32 = bitcast <4 x i8> %old to i32 + %new32 = bitcast <4 x i8> %1 to i32 + + %mask8 = trunc <4 x i64> %2 to <4 x i8> + %mask32 = bitcast <4 x i8> %mask8 to i32 + %notmask32 = xor i32 %mask32, -1 + + %newmasked = and i32 %new32, %mask32 + %oldmasked = and i32 %old32, %notmask32 + %result = or i32 %newmasked, %oldmasked + + %resultvec = bitcast i32 %result to <4 x i8> + ',` + %m = trunc <4 x i64> %2 to <4 x i1> + %resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old + ') + store <4 x i8> %resultvec, <4 x i8> * %0, align 1 + ret void +} + +define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>, + <4 x i64>) nounwind alwaysinline { + %old = load <4 x i16> * %0, align 2 + ifelse(LLVM_VERSION,LLVM_3_0,` + %old64 = bitcast <4 x i16> %old to i64 + %new64 = bitcast <4 x i16> %1 to i64 + + %mask16 = trunc <4 x i64> %2 to <4 x i16> + %mask64 = bitcast <4 x i16> %mask16 to i64 + %notmask64 = xor i64 %mask64, -1 + + %newmasked = and i64 %new64, %mask64 + %oldmasked = and i64 %old64, %notmask64 + %result = or i64 %newmasked, %oldmasked + + %resultvec = bitcast i64 %result to <4 x i16> + ',` + %m = trunc <4 x i64> %2 to <4 x i1> + %resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old + ') + store <4 x i16> %resultvec, <4 x i16> * %0, align 2 + ret void +} +') + define(`masked_store_blend_8_16_by_8', ` define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>, <8 x i32>) nounwind alwaysinline { diff --git a/ispc.cpp b/ispc.cpp index 02c23568..046c64c4 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -446,14 +446,13 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avxh") ) { - fprintf(stderr, " ISA is avxh \n"); + else if (!strcasecmp(isa, "avx-i64x4") ) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 4; this->m_vectorWidth = 4; this->m_attributes = "+avx,+popcnt,+cmov"; this->m_maskingIsFree = false; - this->m_maskBitCount = 32; + this->m_maskBitCount = 64; } else if (!strcasecmp(isa, "avx-x2") || !strcasecmp(isa, "avx1-x2") || diff --git a/llvmutil.cpp b/llvmutil.cpp index 180c8676..64691498 100644 --- a/llvmutil.cpp +++ b/llvmutil.cpp @@ -132,6 +132,10 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { LLVMTypes::MaskType = LLVMTypes::BoolVectorType = llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.getVectorWidth()); break; + case 64: + LLVMTypes::MaskType = LLVMTypes::BoolVectorType = + llvm::VectorType::get(llvm::Type::getInt64Ty(*ctx), target.getVectorWidth()); + break; default: FATAL("Unhandled mask width for initializing MaskType"); } @@ -183,6 +187,10 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1, true /*signed*/); // 0xffffffff break; + case 64: + onMask = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*ctx), -1, + true /*signed*/); // 0xffffffff + break; default: FATAL("Unhandled mask width for onMask"); } @@ -210,6 +218,10 @@ 
InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) { offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0, true /*signed*/); break; + case 64: + offMask = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*ctx), 0, + true /*signed*/); + break; default: FATAL("Unhandled mask width for offMask"); } @@ -480,7 +492,10 @@ LLVMUInt64Vector(const uint64_t *ivec) { llvm::Constant * LLVMBoolVector(bool b) { llvm::Constant *v; - if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + if (LLVMTypes::BoolVectorType == LLVMTypes::Int64VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int64Type, b ? 0xffffffffffffffffull : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, b ? 0xffffffff : 0, false /*unsigned*/); else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) @@ -506,7 +521,10 @@ LLVMBoolVector(const bool *bvec) { std::vector vals; for (int i = 0; i < g->target->getVectorWidth(); ++i) { llvm::Constant *v; - if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) + if (LLVMTypes::BoolVectorType == LLVMTypes::Int64VectorType) + v = llvm::ConstantInt::get(LLVMTypes::Int64Type, bvec[i] ? 0xffffffffffffffffull : 0, + false /*unsigned*/); + else if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType) v = llvm::ConstantInt::get(LLVMTypes::Int32Type, bvec[i] ? 0xffffffff : 0, false /*unsigned*/); else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType) diff --git a/parse.yy b/parse.yy index 5fc01cb0..9a2b4fc3 100644 --- a/parse.yy +++ b/parse.yy @@ -2183,6 +2183,9 @@ static void lAddMaskToSymbolTable(SourcePos pos) { case 32: t = AtomicType::VaryingUInt32; break; + case 64: + t = AtomicType::VaryingUInt64; + break; default: FATAL("Unhandled mask bitsize in lAddMaskToSymbolTable"); } diff --git a/stdlib.ispc b/stdlib.ispc index db9d7f36..6d7ee051 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -50,6 +50,9 @@ #elif (ISPC_MASK_BITS == 32) #define IntMaskType int32 #define UIntMaskType unsigned int32 +#elif (ISPC_MASK_BITS == 64) + #define IntMaskType int64 + #define UIntMaskType unsigned int64 #else #error Unknown value of ISPC_MASK_BITS #endif From 059d80cc11d0cf50d337fceb1ae04d0c3c365152 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Thu, 12 Sep 2013 17:18:12 +0200 Subject: [PATCH 08/14] included suggested changes, ./tests/launch-*.ispc still fails. something is mask64 related, not sure what. help... --- .gitignore | 3 - builtins/svml.m4 | 124 ++++++++++++++++++++++++++---- builtins/target-avx-i64x4.ll | 2 +- builtins/target-avx-i64x4base.ll | 2 +- builtins/target-generic-common.ll | 4 +- builtins/target-neon-common.ll | 4 +- builtins/target-sse4-16.ll | 4 +- builtins/target-sse4-8.ll | 4 +- llvmutil.cpp | 2 +- run_tests.py | 2 +- 10 files changed, 120 insertions(+), 31 deletions(-) diff --git a/.gitignore b/.gitignore index 3bec2ace..88fb0197 100644 --- a/.gitignore +++ b/.gitignore @@ -12,8 +12,5 @@ examples/*/*.png examples/*/*.ppm examples/*/objs/* *.swp -.* -!.gitignore - diff --git a/builtins/svml.m4 b/builtins/svml.m4 index 71a6a709..0a587577 100644 --- a/builtins/svml.m4 +++ b/builtins/svml.m4 @@ -1,20 +1,61 @@ -;; svml +;; copyright stub :) +;; Copyright (c) 2013, Intel Corporation +;; All rights reserved. 
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are
+;; met:
+;;
+;;   * Redistributions of source code must retain the above copyright
+;;     notice, this list of conditions and the following disclaimer.
+;;
+;;   * Redistributions in binary form must reproduce the above copyright
+;;     notice, this list of conditions and the following disclaimer in the
+;;     documentation and/or other materials provided with the distribution.
+;;
+;;   * Neither the name of Intel Corporation nor the names of its
+;;     contributors may be used to endorse or promote products derived from
+;;     this software without specific prior written permission.
+;;
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;; IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;; TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;; PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;; OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-;; stubs
+
+;; svml macro
+
+;; svml_stubs : stubs for svml calls
+;; $1 - type ("float" or "double")
+;; $2 - svml internal function suffix ("f" for float, "d" for double)
+;; $3 - vector width
 define(`svml_stubs',`
-  declare <$2 x $1> @__svml_sin$3(<$2 x $1>) nounwind readnone alwaysinline
-  declare <$2 x $1> @__svml_asin$3(<$2 x $1>) nounwind readnone alwaysinline
-  declare <$2 x $1> @__svml_cos$3(<$2 x $1>) nounwind readnone alwaysinline
-  declare void @__svml_sincos$3(<$2 x $1>, <$2 x $1> *, <$2 x $1> *) nounwind readnone alwaysinline
-  declare <$2 x $1> @__svml_tan$3(<$2 x $1>) nounwind readnone alwaysinline
-  declare <$2 x $1> @__svml_atan$3(<$2 x $1>) nounwind readnone alwaysinline
-  declare <$2 x $1> @__svml_atan2$3(<$2 x $1>, <$2 x $1>) nounwind readnone alwaysinline
-  declare <$2 x $1> @__svml_exp$3(<$2 x $1>) nounwind readnone alwaysinline
-  declare <$2 x $1> @__svml_log$3(<$2 x $1>) nounwind readnone alwaysinline
-  declare <$2 x $1> @__svml_pow$3(<$2 x $1>, <$2 x $1>) nounwind readnone alwaysinline
+  declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone alwaysinline
+  declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone alwaysinline
+  declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone alwaysinline
+  declare void @__svml_sincos$2(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind readnone alwaysinline
+  declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone alwaysinline
+  declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone alwaysinline
+  declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline
+  declare <$3 x $1> @__svml_exp$2(<$3 x $1>) nounwind readnone alwaysinline
+  declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone alwaysinline
+  declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline
 ')
-;; decalre __svml calls
+;; svml_declare : declaration of __svml_* intrinsics
+;; $1 - type ("float" or "double")
+;; $2 - __svml_* intrinsic function suffix
+;;      float:  "f4"(sse) "f8"(avx) "f16"(avx512)
+;;      double: "2"(sse) "4"(avx) "8"(avx512)
+;; $3 - vector width
 define(`svml_declare',`
 declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone
 declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone
@@ -28,7 +69,13 @@ define(`svml_declare',`
 declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone
 ');
-;; define native __svml calls
+;; definition of __svml_* internal functions
+;; $1 - type ("float" or "double")
+;; $2 - __svml_* intrinsic function suffix
+;;      float:  "f4"(sse) "f8"(avx) "f16"(avx512)
+;;      double: "2"(sse) "4"(avx) "8"(avx512)
+;; $3 - vector width
+;; $4 - svml internal function suffix ("f" for float, "d" for double)
 define(`svml_define',`
 define <$3 x $1> @__svml_sin$4(<$3 x $1>) nounwind readnone alwaysinline {
   %ret = call <$3 x $1> @__svml_sin$2(<$3 x $1> %0)
@@ -82,7 +129,45 @@ define(`svml_define',`
 ')
-;; define x2 __svml calls
+;; svml_define_x : definition of __svml_* internal functions operating on an extended width
+;; $1 - type ("float" or "double")
+;; $2 - __svml_* intrinsic function suffix
+;;      float:  "f4"(sse) "f8"(avx) "f16"(avx512)
+;;      double: "2"(sse) "4"(avx) "8"(avx512)
+;; $3 - vector width
+;; $4 - svml internal function suffix ("f" for float, "d" for double)
+;; $5 - extended width, must be at least twice the native vector width;
+;;      contingent on the existence of the unary$3to$5 and binary$3to$5 macros
+
+;; *todo*: in sincos, use the native __svml_sincos[f][2,4,8,16] call, e.g.
+;;define void @__svml_sincosf(<8 x float>, <8 x float> *,
+;;                            <8 x float> *) nounwind readnone alwaysinline {
+;;  ; call svml_sincosf4 two times with the two 4-wide sub-vectors
+;;  %a = shufflevector <8 x float> %0, <8 x float> undef,
+;;         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+;;  %b = shufflevector <8 x float> %0, <8 x float> undef,
+;;         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+;;
+;;  %cospa = alloca <4 x float>
+;;  %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
+;;
+;;  %cospb = alloca <4 x float>
+;;  %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
+;;
+;;  %sin = shufflevector <4 x float> %sa, <4 x float> %sb,
+;;           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;;  store <8 x float> %sin, <8 x float> * %1
+;;
+;;  %cosa = load <4 x float> * %cospa
+;;  %cosb = load <4 x float> * %cospb
+;;  %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
+;;           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+;;  store <8 x float> %cos, <8 x float> * %2
+;;
+;;  ret void
+;;}
 define(`svml_define_x',`
 define <$5 x $1> @__svml_sin$4(<$5 x $1>) nounwind readnone alwaysinline {
   unary$3to$5(ret, $1, @__svml_sin$2, %0)
   ret <$5 x $1> %ret
 }
@@ -96,7 +181,14 @@ define(`svml_define_x',`
   unary$3to$5(ret, $1, @__svml_cos$2, %0)
   ret <$5 x $1> %ret
 }
- declare void @__svml_sincos$4(<$5 x $1>,<$5 x $1>*,<$5 x $1>*) nounwind readnone alwaysinline
+ define void @__svml_sincos$4(<$5 x $1>,<$5 x $1>*,<$5 x $1>*) nounwind readnone alwaysinline
+ {
+   %s = call <$5 x $1> @__svml_sin$4(<$5 x $1> %0)
+   %c = call <$5 x $1> @__svml_cos$4(<$5 x $1> %0)
+   store <$5 x $1> %s, <$5 x $1> * %1
+   store <$5 x $1> %c, <$5 x $1> * %2
+   ret void
+ }
 define <$5 x $1> @__svml_tan$4(<$5 x $1>) nounwind readnone alwaysinline {
   unary$3to$5(ret, $1, @__svml_tan$2, %0)
   ret <$5 x $1> %ret
diff --git a/builtins/target-avx-i64x4.ll b/builtins/target-avx-i64x4.ll
index d7dbb6bd..65490ea5 100644
--- a/builtins/target-avx-i64x4.ll
+++ b/builtins/target-avx-i64x4.ll
@@ -1,4 +1,4 @@
-;; Copyright (c) 2010-2011, Intel Corporation
+;; Copyright (c) 2013, Intel Corporation
 ;; All rights reserved.
 ;;
 ;; Redistribution and use in source and binary forms, with or without
diff --git a/builtins/target-avx-i64x4base.ll b/builtins/target-avx-i64x4base.ll
index 05bf178d..e1832030 100644
--- a/builtins/target-avx-i64x4base.ll
+++ b/builtins/target-avx-i64x4base.ll
@@ -1,4 +1,4 @@
-;; Copyright (c) 2010-2012, Intel Corporation
+;; Copyright (c) 2013, Intel Corporation
 ;; All rights reserved.
 ;;
 ;; Redistribution and use in source and binary forms, with or without
diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll
index 30a8b030..2a5d1b32 100644
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -209,8 +209,8 @@ declare i64 @__count_leading_zeros_i64(i64) nounwind readnone
 ;; svml
 include(`svml.m4')
-svml_stubs(float, WIDTH, f)
-svml_stubs(double, WIDTH, d)
+svml_stubs(float,f,WIDTH)
+svml_stubs(double,d,WIDTH)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; reductions
diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll
index 92fc5ce3..1c0b421f 100644
--- a/builtins/target-neon-common.ll
+++ b/builtins/target-neon-common.ll
@@ -318,8 +318,8 @@ define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture %ptr,
 include(`svml.m4')
-svmlf_stubs(WIDTH)
-svmld_stubs(WIDTH)
+svml_stubs(float,f,WIDTH)
+svml_stubs(double,d,WIDTH)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather
diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll
index 3f8cd339..72b81ff0 100644
--- a/builtins/target-sse4-16.ll
+++ b/builtins/target-sse4-16.ll
@@ -210,8 +210,8 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r
 ; FIXME
 include(`svml.m4')
-svml_stubs(float,8,f)
-svml_stubs(double,8,d)
+svml_stubs(float,f,WIDTH)
+svml_stubs(double,d,WIDTH)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll
index f43cd940..69b355e3 100644
--- a/builtins/target-sse4-8.ll
+++ b/builtins/target-sse4-8.ll
@@ -223,8 +223,8 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin
 ; FIXME
 include(`svml.m4')
-svml_stubs(float,16,f)
-svml_stubs(double,16,d)
+svml_stubs(float,f,WIDTH)
+svml_stubs(double,d,WIDTH)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
diff --git a/llvmutil.cpp b/llvmutil.cpp
index 64691498..275cf794 100644
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -189,7 +189,7 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) {
         break;
     case 64:
         onMask = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*ctx), -1,
-                                        true /*signed*/); // 0xffffffff
+                                        true /*signed*/); // 0xffffffffffffffffull
         break;
     default:
         FATAL("Unhandled mask width for onMask");
diff --git a/run_tests.py b/run_tests.py
index 9729930f..180205a0 100755
--- a/run_tests.py
+++ b/run_tests.py
@@ -75,7 +75,7 @@ if not os.path.exists(ispc_exe):
     sys.stderr.write("Fatal error: missing ispc compiler: %s\n" % ispc_exe)
     sys.exit()
-ispc_exe += " " + options.ispc_flags
+ispc_exe += " -g " + options.ispc_flags
 if __name__ == '__main__':
     sys.stdout.write("ispc compiler: %s\n" % ispc_exe)

From 40af8d6ed564cc5970786459587ecdc487a1fc44 Mon Sep 17 00:00:00 2001
From: Evghenii
Date: Thu, 12 Sep 2013 20:25:44 +0200
Subject: [PATCH 09/14] fixed segfault in tests/launch-*.ispc. nativeVectorWidth
 in avx-i64x4 was set to 4.
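The one-line change in the hunk below raises m_nativeVectorWidth back to 8; the comment it adds notes that the native width is counted in floats, so a 4-wide 64-bit-element target still reports 8. A standalone C++ sketch of that relationship, illustrative only and not ispc source, where the only assumed fact is the 256-bit AVX ymm register:

    #include <cstdio>

    int main() {
        const int registerBits = 256;               // AVX ymm register
        const int nativeWidth  = registerBits / 32; // 8 float lanes  -> m_nativeVectorWidth
        const int vectorWidth  = registerBits / 64; // 4 i64 mask lanes -> m_vectorWidth
        printf("native=%d, programmed=%d\n", nativeWidth, vectorWidth);
        return 0;
    }
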
Fixed --- ispc.cpp | 2 +- run_tests.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ispc.cpp b/ispc.cpp index 046c64c4..1a99154b 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -448,7 +448,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : } else if (!strcasecmp(isa, "avx-i64x4") ) { this->m_isa = Target::AVX; - this->m_nativeVectorWidth = 4; + this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ this->m_vectorWidth = 4; this->m_attributes = "+avx,+popcnt,+cmov"; this->m_maskingIsFree = false; diff --git a/run_tests.py b/run_tests.py index 180205a0..9729930f 100755 --- a/run_tests.py +++ b/run_tests.py @@ -75,7 +75,7 @@ if not os.path.exists(ispc_exe): sys.stderr.write("Fatal error: missing ispc compiler: %s\n" % ispc_exe) sys.exit() -ispc_exe += " -g " + options.ispc_flags +ispc_exe += " " + options.ispc_flags if __name__ == '__main__': sys.stdout.write("ispc compiler: %s\n" % ispc_exe) From 715b82826634644eec8f95f40e53d16b8a587ca3 Mon Sep 17 00:00:00 2001 From: egaburov Date: Fri, 13 Sep 2013 09:25:52 +0200 Subject: [PATCH 10/14] fixed float constants to be read as doubles --- lex.ll | 4 ++-- parse.yy | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lex.ll b/lex.ll index 8baa627a..129f0cd5 100644 --- a/lex.ll +++ b/lex.ll @@ -440,13 +440,13 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA {FLOAT_NUMBER} { RT; - yylval.floatVal = (float)atof(yytext); + yylval.floatVal = atof(yytext); return TOKEN_FLOAT_CONSTANT; } {HEX_FLOAT_NUMBER} { RT; - yylval.floatVal = (float)lParseHexFloat(yytext); + yylval.floatVal = lParseHexFloat(yytext); return TOKEN_FLOAT_CONSTANT; } diff --git a/parse.yy b/parse.yy index 9a2b4fc3..b55d49e0 100644 --- a/parse.yy +++ b/parse.yy @@ -149,7 +149,7 @@ struct ForeachDimension { %union { uint64_t intVal; - float floatVal; + double floatVal; std::string *stringVal; const char *constCharPtr; @@ -326,8 +326,8 @@ primary_expression (uint64_t)yylval.intVal, @1); } | TOKEN_FLOAT_CONSTANT { - $$ = new ConstExpr(AtomicType::UniformFloat->GetAsConstType(), - (float)yylval.floatVal, @1); + $$ = new ConstExpr(AtomicType::UniformDouble->GetAsConstType(), + yylval.floatVal, @1); } | TOKEN_TRUE { $$ = new ConstExpr(AtomicType::UniformBool->GetAsConstType(), true, @1); From a97eb7b7cb217fb8f583314612527171488b0f79 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Fri, 13 Sep 2013 09:32:59 +0200 Subject: [PATCH 11/14] added clamp in double precision --- stdlib.ispc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/stdlib.ispc b/stdlib.ispc index 6d7ee051..0d5c4efd 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1559,6 +1559,18 @@ static inline uniform float clamp(uniform float v, uniform float low, uniform fl return min(max(v, low), high); } +// double + +__declspec(safe,cost2) +static inline double clamp(double v, double low, double high) { + return min(max(v, low), high); +} + +__declspec(safe,cost2) +static inline uniform double clamp(uniform double v, uniform double low, uniform double high) { + return min(max(v, low), high); +} + // int8 __declspec(safe,cost2) From a9913c83377614dde2ac782e298f437e45dcbd84 Mon Sep 17 00:00:00 2001 From: egaburov Date: Fri, 13 Sep 2013 10:26:15 +0200 Subject: [PATCH 12/14] changed lexer/parser to be able to read float constants, if they have "f"-suffix --- lex.ll | 23 ++++++++++++++++++++--- parse.yy | 11 ++++++++--- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/lex.ll b/lex.ll index 
129f0cd5..7a3db71a 100644 --- a/lex.ll +++ b/lex.ll @@ -76,7 +76,7 @@ static int allTokens[] = { TOKEN_TASK, TOKEN_TRUE, TOKEN_TYPEDEF, TOKEN_UNIFORM, TOKEN_UNMASKED, TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE, TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT, - TOKEN_FLOAT_CONSTANT, + TOKEN_FLOAT_CONSTANT, TOKEN_DOUBLE_CONSTANT, TOKEN_INT8_CONSTANT, TOKEN_UINT8_CONSTANT, TOKEN_INT16_CONSTANT, TOKEN_UINT16_CONSTANT, TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT, @@ -152,6 +152,7 @@ void ParserInit() { tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\""; tokenToName[TOKEN_DOTDOTDOT] = "..."; tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT"; + tokenToName[TOKEN_DOUBLE_CONSTANT] = "TOKEN_DOUBLE_CONSTANT"; tokenToName[TOKEN_INT8_CONSTANT] = "TOKEN_INT8_CONSTANT"; tokenToName[TOKEN_UINT8_CONSTANT] = "TOKEN_UINT8_CONSTANT"; tokenToName[TOKEN_INT16_CONSTANT] = "TOKEN_INT16_CONSTANT"; @@ -266,6 +267,7 @@ void ParserInit() { tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\""; tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'"; tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant"; + tokenNameRemap["TOKEN_DOUBLE_CONSTANT"] = "double constant"; tokenNameRemap["TOKEN_INT8_CONSTANT"] = "int8 constant"; tokenNameRemap["TOKEN_UINT8_CONSTANT"] = "unsigned int8 constant"; tokenNameRemap["TOKEN_INT16_CONSTANT"] = "int16 constant"; @@ -341,6 +343,8 @@ inline int ispcRand() { WHITESPACE [ \t\r]+ INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]* INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\. +DOUBLE_NUMBER (([0-9]+|(([0-9]+\.[0-9]*?)|(\.[0-9]+)))([eE][-+]?[0-9]+)??) +HEX_DOUBLE_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+?) FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?) HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?) 
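Taken together, the DOUBLE_NUMBER/HEX_DOUBLE_NUMBER patterns above and the rule block in the next hunk let the suffix decide the type: DOUBLE_NUMBER deliberately lacks the trailing [fF] that FLOAT_NUMBER accepts, and since flex prefers the longest match (and, on equal length, the earlier rule), "1.5" now lexes as a double constant while "1.5f" stays a float. A hypothetical standalone C++ sketch of that classification, not the ispc lexer itself; it leans on C99 hex-float support in atof:

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>

    // Mirrors the regex split: a trailing 'f'/'F' selects the float token.
    static bool isFloatLiteral(const char *text) {
        size_t n = strlen(text);
        return n > 0 && (text[n - 1] == 'f' || text[n - 1] == 'F');
    }

    int main() {
        const char *samples[] = { "1.5", "1.5f", "0x1.8p1", "0x1.8p1F" };
        for (const char *s : samples) {
            if (isFloatLiteral(s))
                printf("%-9s -> float  %.9g\n", s, (double)(float)atof(s));
            else
                printf("%-9s -> double %.17g\n", s, atof(s));
        }
        return 0;
    }
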
@@ -438,15 +442,28 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA } + +{DOUBLE_NUMBER} { + RT; + yylval.doubleVal = atof(yytext); + return TOKEN_DOUBLE_CONSTANT; +} + +{HEX_DOUBLE_NUMBER} { + RT; + yylval.doubleVal = lParseHexFloat(yytext); + return TOKEN_DOUBLE_CONSTANT; +} + {FLOAT_NUMBER} { RT; - yylval.floatVal = atof(yytext); + yylval.floatVal = (float)atof(yytext); return TOKEN_FLOAT_CONSTANT; } {HEX_FLOAT_NUMBER} { RT; - yylval.floatVal = lParseHexFloat(yytext); + yylval.floatVal = (float)lParseHexFloat(yytext); return TOKEN_FLOAT_CONSTANT; } diff --git a/parse.yy b/parse.yy index b55d49e0..933a3455 100644 --- a/parse.yy +++ b/parse.yy @@ -149,7 +149,8 @@ struct ForeachDimension { %union { uint64_t intVal; - double floatVal; + float floatVal; + double doubleVal; std::string *stringVal; const char *constCharPtr; @@ -185,7 +186,7 @@ struct ForeachDimension { %token TOKEN_INT64_CONSTANT TOKEN_UINT64_CONSTANT %token TOKEN_INT32DOTDOTDOT_CONSTANT TOKEN_UINT32DOTDOTDOT_CONSTANT %token TOKEN_INT64DOTDOTDOT_CONSTANT TOKEN_UINT64DOTDOTDOT_CONSTANT -%token TOKEN_FLOAT_CONSTANT TOKEN_STRING_C_LITERAL +%token TOKEN_FLOAT_CONSTANT TOKEN_DOUBLE_CONSTANT TOKEN_STRING_C_LITERAL %token TOKEN_IDENTIFIER TOKEN_STRING_LITERAL TOKEN_TYPE_NAME TOKEN_NULL %token TOKEN_PTR_OP TOKEN_INC_OP TOKEN_DEC_OP TOKEN_LEFT_OP TOKEN_RIGHT_OP %token TOKEN_LE_OP TOKEN_GE_OP TOKEN_EQ_OP TOKEN_NE_OP @@ -326,9 +327,13 @@ primary_expression (uint64_t)yylval.intVal, @1); } | TOKEN_FLOAT_CONSTANT { - $$ = new ConstExpr(AtomicType::UniformDouble->GetAsConstType(), + $$ = new ConstExpr(AtomicType::UniformFloat->GetAsConstType(), yylval.floatVal, @1); } + | TOKEN_DOUBLE_CONSTANT { + $$ = new ConstExpr(AtomicType::UniformDouble->GetAsConstType(), + yylval.doubleVal, @1); + } | TOKEN_TRUE { $$ = new ConstExpr(AtomicType::UniformBool->GetAsConstType(), true, @1); } From 9861375f0c1235ea25f68211f3a82f6dcd91874c Mon Sep 17 00:00:00 2001 From: Evghenii Date: Fri, 13 Sep 2013 15:07:14 +0200 Subject: [PATCH 13/14] renamed avx-i64x4 -> avx1-i64x4 --- Makefile | 2 +- builtins.cpp | 4 ++-- builtins/{target-avx-i64x4.ll => target-avx1-i64x4.ll} | 2 +- .../{target-avx-i64x4base.ll => target-avx1-i64x4base.ll} | 0 ispc.cpp | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) rename builtins/{target-avx-i64x4.ll => target-avx1-i64x4.ll} (98%) rename builtins/{target-avx-i64x4base.ll => target-avx1-i64x4base.ll} (100%) diff --git a/Makefile b/Makefile index 92debe4f..097da238 100644 --- a/Makefile +++ b/Makefile @@ -141,7 +141,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \ type.cpp util.cpp HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \ opt.h stmt.h sym.h type.h util.h -TARGETS=avx-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ +TARGETS=avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \ sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \ generic-4 generic-8 generic-16 generic-32 generic-64 generic-1 ifneq ($(ARM_ENABLED), 0) diff --git a/builtins.cpp b/builtins.cpp index f8d4136e..43f68833 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -937,10 +937,10 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod switch (g->target->getVectorWidth()) { case 4: if (runtime32) { - EXPORT_MODULE(builtins_bitcode_avx_i64x4_32bit); + EXPORT_MODULE(builtins_bitcode_avx1_i64x4_32bit); } else { - EXPORT_MODULE(builtins_bitcode_avx_i64x4_64bit); + EXPORT_MODULE(builtins_bitcode_avx1_i64x4_64bit); } break; case 8: diff --git 
a/builtins/target-avx-i64x4.ll b/builtins/target-avx1-i64x4.ll similarity index 98% rename from builtins/target-avx-i64x4.ll rename to builtins/target-avx1-i64x4.ll index 65490ea5..d183f1ce 100644 --- a/builtins/target-avx-i64x4.ll +++ b/builtins/target-avx1-i64x4.ll @@ -29,7 +29,7 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -include(`target-avx-i64x4base.ll') +include(`target-avx1-i64x4base.ll') rdrand_decls() diff --git a/builtins/target-avx-i64x4base.ll b/builtins/target-avx1-i64x4base.ll similarity index 100% rename from builtins/target-avx-i64x4base.ll rename to builtins/target-avx1-i64x4base.ll diff --git a/ispc.cpp b/ispc.cpp index 1a99154b..26ca0b39 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -446,7 +446,7 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) : this->m_maskingIsFree = false; this->m_maskBitCount = 32; } - else if (!strcasecmp(isa, "avx-i64x4") ) { + else if (!strcasecmp(isa, "avx1-i64x4") ) { this->m_isa = Target::AVX; this->m_nativeVectorWidth = 8; /* native vector width in terms of floats */ this->m_vectorWidth = 4; From 36886971e337c555b1b339b862653111f9cf9506 Mon Sep 17 00:00:00 2001 From: Evghenii Date: Fri, 13 Sep 2013 16:02:53 +0200 Subject: [PATCH 14/14] revert lex.ll parse.yy stdlib.ispc to state when all constants are floats --- lex.ll | 19 +------------------ parse.yy | 11 +++-------- stdlib.ispc | 12 ------------ 3 files changed, 4 insertions(+), 38 deletions(-) diff --git a/lex.ll b/lex.ll index 7a3db71a..8baa627a 100644 --- a/lex.ll +++ b/lex.ll @@ -76,7 +76,7 @@ static int allTokens[] = { TOKEN_TASK, TOKEN_TRUE, TOKEN_TYPEDEF, TOKEN_UNIFORM, TOKEN_UNMASKED, TOKEN_UNSIGNED, TOKEN_VARYING, TOKEN_VOID, TOKEN_WHILE, TOKEN_STRING_C_LITERAL, TOKEN_DOTDOTDOT, - TOKEN_FLOAT_CONSTANT, TOKEN_DOUBLE_CONSTANT, + TOKEN_FLOAT_CONSTANT, TOKEN_INT8_CONSTANT, TOKEN_UINT8_CONSTANT, TOKEN_INT16_CONSTANT, TOKEN_UINT16_CONSTANT, TOKEN_INT32_CONSTANT, TOKEN_UINT32_CONSTANT, @@ -152,7 +152,6 @@ void ParserInit() { tokenToName[TOKEN_STRING_C_LITERAL] = "\"C\""; tokenToName[TOKEN_DOTDOTDOT] = "..."; tokenToName[TOKEN_FLOAT_CONSTANT] = "TOKEN_FLOAT_CONSTANT"; - tokenToName[TOKEN_DOUBLE_CONSTANT] = "TOKEN_DOUBLE_CONSTANT"; tokenToName[TOKEN_INT8_CONSTANT] = "TOKEN_INT8_CONSTANT"; tokenToName[TOKEN_UINT8_CONSTANT] = "TOKEN_UINT8_CONSTANT"; tokenToName[TOKEN_INT16_CONSTANT] = "TOKEN_INT16_CONSTANT"; @@ -267,7 +266,6 @@ void ParserInit() { tokenNameRemap["TOKEN_STRING_C_LITERAL"] = "\"C\""; tokenNameRemap["TOKEN_DOTDOTDOT"] = "\'...\'"; tokenNameRemap["TOKEN_FLOAT_CONSTANT"] = "float constant"; - tokenNameRemap["TOKEN_DOUBLE_CONSTANT"] = "double constant"; tokenNameRemap["TOKEN_INT8_CONSTANT"] = "int8 constant"; tokenNameRemap["TOKEN_UINT8_CONSTANT"] = "unsigned int8 constant"; tokenNameRemap["TOKEN_INT16_CONSTANT"] = "int16 constant"; @@ -343,8 +341,6 @@ inline int ispcRand() { WHITESPACE [ \t\r]+ INT_NUMBER (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]* INT_NUMBER_DOTDOTDOT (([0-9]+)|(0x[0-9a-fA-F]+)|(0b[01]+))[uUlL]*[kMG]?[uUlL]*\.\.\. -DOUBLE_NUMBER (([0-9]+|(([0-9]+\.[0-9]*?)|(\.[0-9]+)))([eE][-+]?[0-9]+)??) -HEX_DOUBLE_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+?) FLOAT_NUMBER (([0-9]+|(([0-9]+\.[0-9]*[fF]?)|(\.[0-9]+)))([eE][-+]?[0-9]+)?[fF]?) HEX_FLOAT_NUMBER (0x[01](\.[0-9a-fA-F]*)?p[-+]?[0-9]+[fF]?) 
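With the DOUBLE_NUMBER definitions above removed, every floating-point literal is again rounded to float before use (the rule in the next hunk goes back to (float)atof). A short plain-C++ illustration, not ispc code, of the double-precision accuracy the revert trades away:

    #include <cstdio>

    int main() {
        double viaFloat = (float)0.1; // literal squeezed through float first
        double direct   = 0.1;        // full double-precision literal
        printf("%.17g\n%.17g\n", viaFloat, direct); // values diverge after ~8 digits
        return 0;
    }
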
@@ -442,19 +438,6 @@ L?\"(\\.|[^\\"])*\" { lStringConst(&yylval, &yylloc); return TOKEN_STRING_LITERA } - -{DOUBLE_NUMBER} { - RT; - yylval.doubleVal = atof(yytext); - return TOKEN_DOUBLE_CONSTANT; -} - -{HEX_DOUBLE_NUMBER} { - RT; - yylval.doubleVal = lParseHexFloat(yytext); - return TOKEN_DOUBLE_CONSTANT; -} - {FLOAT_NUMBER} { RT; yylval.floatVal = (float)atof(yytext); diff --git a/parse.yy b/parse.yy index 933a3455..9a2b4fc3 100644 --- a/parse.yy +++ b/parse.yy @@ -149,8 +149,7 @@ struct ForeachDimension { %union { uint64_t intVal; - float floatVal; - double doubleVal; + float floatVal; std::string *stringVal; const char *constCharPtr; @@ -186,7 +185,7 @@ struct ForeachDimension { %token TOKEN_INT64_CONSTANT TOKEN_UINT64_CONSTANT %token TOKEN_INT32DOTDOTDOT_CONSTANT TOKEN_UINT32DOTDOTDOT_CONSTANT %token TOKEN_INT64DOTDOTDOT_CONSTANT TOKEN_UINT64DOTDOTDOT_CONSTANT -%token TOKEN_FLOAT_CONSTANT TOKEN_DOUBLE_CONSTANT TOKEN_STRING_C_LITERAL +%token TOKEN_FLOAT_CONSTANT TOKEN_STRING_C_LITERAL %token TOKEN_IDENTIFIER TOKEN_STRING_LITERAL TOKEN_TYPE_NAME TOKEN_NULL %token TOKEN_PTR_OP TOKEN_INC_OP TOKEN_DEC_OP TOKEN_LEFT_OP TOKEN_RIGHT_OP %token TOKEN_LE_OP TOKEN_GE_OP TOKEN_EQ_OP TOKEN_NE_OP @@ -328,11 +327,7 @@ primary_expression } | TOKEN_FLOAT_CONSTANT { $$ = new ConstExpr(AtomicType::UniformFloat->GetAsConstType(), - yylval.floatVal, @1); - } - | TOKEN_DOUBLE_CONSTANT { - $$ = new ConstExpr(AtomicType::UniformDouble->GetAsConstType(), - yylval.doubleVal, @1); + (float)yylval.floatVal, @1); } | TOKEN_TRUE { $$ = new ConstExpr(AtomicType::UniformBool->GetAsConstType(), true, @1); diff --git a/stdlib.ispc b/stdlib.ispc index 0d5c4efd..6d7ee051 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -1559,18 +1559,6 @@ static inline uniform float clamp(uniform float v, uniform float low, uniform fl return min(max(v, low), high); } -// double - -__declspec(safe,cost2) -static inline double clamp(double v, double low, double high) { - return min(max(v, low), high); -} - -__declspec(safe,cost2) -static inline uniform double clamp(uniform double v, uniform double low, uniform double high) { - return min(max(v, low), high); -} - // int8 __declspec(safe,cost2)
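The double clamp overloads removed above are nothing more than min and max composed, which is why re-adding them once double constants are supported again is mechanical. For reference, a hedged C++ analogue of that composition (hand-rolled here since this code predates std::clamp):

    #include <algorithm>
    #include <cstdio>

    // clamp(v, low, high) == min(max(v, low), high), exactly as in stdlib.ispc.
    template <typename T>
    static T clamp(T v, T low, T high) {
        return std::min(std::max(v, low), high);
    }

    int main() {
        printf("%g %g\n", clamp(2.5, 0.0, 1.0), clamp(-0.5, 0.0, 1.0)); // 1 0
        return 0;
    }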