Merge pull request #578 from egaburov/master

added --target=avx-i64x4 & svml support for all sse/avx modes
2013-09-13 09:40:24 -07:00
parent 582cfe55b6 36886971e3
commit 06aa2067d9
22 changed files with 1148 additions and 387 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -11,5 +11,6 @@ tests*/*run
 examples/*/*.png
 examples/*/*.ppm
 examples/*/objs/*
 *.swp
--- a/Makefile
+++ b/Makefile
@@ -141,7 +141,7 @@ CXX_SRC=ast.cpp builtins.cpp cbackend.cpp ctx.cpp decl.cpp expr.cpp func.cpp \
 	type.cpp util.cpp
 HEADERS=ast.h builtins.h ctx.h decl.h expr.h func.h ispc.h llvmutil.h module.h \
 	opt.h stmt.h sym.h type.h util.h
-TARGETS=avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
+TARGETS=avx1-i64x4 avx1 avx1-x2 avx11 avx11-x2 avx2 avx2-x2 \
 	sse2 sse2-x2 sse4-8 sse4-16 sse4 sse4-x2 \
 	generic-4 generic-8 generic-16 generic-32 generic-64 generic-1
 ifneq ($(ARM_ENABLED), 0)
@@ -160,7 +160,7 @@ BISON_SRC=parse.yy
 FLEX_SRC=lex.ll
 OBJS=$(addprefix objs/, $(CXX_SRC:.cpp=.o) $(BUILTINS_OBJS) \
-       stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o \
+       stdlib_mask1_ispc.o stdlib_mask8_ispc.o stdlib_mask16_ispc.o stdlib_mask32_ispc.o stdlib_mask64_ispc.o \
 	$(BISON_SRC:.yy=.o) $(FLEX_SRC:.ll=.o))
 default: ispc
@@ -246,15 +246,15 @@ objs/lex.o: objs/lex.cpp $(HEADERS) objs/parse.cc
 	@echo Compiling $<
 	@$(CXX) $(CXXFLAGS) -o $@ -c $<
-objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 $(wildcard builtins/*common.ll)
+objs/builtins-dispatch.cpp: builtins/dispatch.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
 	@echo Creating C++ source from builtins definition file $<
 	@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX $< | python bitcode2cpp.py $< > $@
-objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll)
+objs/builtins-%-32bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
 	@echo Creating C++ source from builtins definition file $< \(32 bit version\)
 	@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=32 $< | python bitcode2cpp.py $< 32bit > $@
-objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 $(wildcard builtins/*common.ll)
+objs/builtins-%-64bit.cpp: builtins/%.ll builtins/util.m4 builtins/svml.m4 $(wildcard builtins/*common.ll)
 	@echo Creating C++ source from builtins definition file $< \(64 bit version\)
 	@m4 -Ibuiltins/ -DLLVM_VERSION=$(LLVM_VERSION) -DBUILD_OS=UNIX -DRUNTIME=64 $< | python bitcode2cpp.py $< 64bit > $@
@@ -268,20 +268,25 @@ objs/builtins-c-64.cpp: builtins/builtins.c
 objs/stdlib_mask1_ispc.cpp: stdlib.ispc
 	@echo Creating C++ source from $< for mask1
-	@$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.1415926536 $< -o - | \
+	@$(CLANG) -E -x c -DISPC_MASK_BITS=1 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
 		python stdlib2cpp.py mask1 > $@
 objs/stdlib_mask8_ispc.cpp: stdlib.ispc
 	@echo Creating C++ source from $< for mask8
-	@$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.1415926536 $< -o - | \
+	@$(CLANG) -E -x c -DISPC_MASK_BITS=8 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
 		python stdlib2cpp.py mask8 > $@
 objs/stdlib_mask16_ispc.cpp: stdlib.ispc
 	@echo Creating C++ source from $< for mask16
-	@$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.1415926536 $< -o - | \
+	@$(CLANG) -E -x c -DISPC_MASK_BITS=16 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
 		python stdlib2cpp.py mask16 > $@
 objs/stdlib_mask32_ispc.cpp: stdlib.ispc
 	@echo Creating C++ source from $< for mask32
-	@$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.1415926536 $< -o - | \
+	@$(CLANG) -E -x c -DISPC_MASK_BITS=32 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
 		python stdlib2cpp.py mask32 > $@
 objs/stdlib_mask64_ispc.cpp: stdlib.ispc
 	@echo Creating C++ source from $< for mask64
 	@$(CLANG) -E -x c -DISPC_MASK_BITS=64 -DISPC=1 -DPI=3.14159265358979 $< -o - | \
 		python stdlib2cpp.py mask64 > $@
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -302,6 +302,7 @@ lCheckModuleIntrinsics(llvm::Module *module) {
        // check the llvm.x86.* intrinsics for now...
        if (!strncmp(funcName.c_str(), "llvm.x86.", 9)) {
            llvm::Intrinsic::ID id = (llvm::Intrinsic::ID)func->getIntrinsicID();
            if (id == 0) fprintf(stderr, "FATAL: intrinsic is not found: %s  \n", funcName.c_str());
            Assert(id != 0);
            llvm::Type *intrinsicType =
                llvm::Intrinsic::getType(*g->ctx, id);
@@ -576,20 +577,34 @@ lSetInternalFunctions(llvm::Module *module) {
        "__stdlib_pow",
        "__stdlib_powf",
        "__stdlib_sin",
        "__stdlib_asin",
        "__stdlib_sincos",
        "__stdlib_sincosf",
        "__stdlib_sinf",
        "__stdlib_tan",
        "__stdlib_tanf",
-        "__svml_sin",
+        "__svml_sind",
-        "__svml_cos",
+        "__svml_asind",
-        "__svml_sincos",
+        "__svml_cosd",
-        "__svml_tan",
+        "__svml_acosd",
-        "__svml_atan",
+        "__svml_sincosd",
-        "__svml_atan2",
+        "__svml_tand",
-        "__svml_exp",
+        "__svml_atand",
-        "__svml_log",
+        "__svml_atan2d",
-        "__svml_pow",
+        "__svml_expd",
        "__svml_logd",
        "__svml_powd",
        "__svml_sinf",
        "__svml_asinf",
        "__svml_cosf",
        "__svml_acosf",
        "__svml_sincosf",
        "__svml_tanf",
        "__svml_atanf",
        "__svml_atan2f",
        "__svml_expf",
        "__svml_logf",
        "__svml_powf",
        "__undef_uniform",
        "__undef_varying",
        "__vec4_add_float",
@@ -920,6 +935,14 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
    }
    case Target::AVX: {
        switch (g->target->getVectorWidth()) {
        case 4:
            if (runtime32) {
                EXPORT_MODULE(builtins_bitcode_avx1_i64x4_32bit);
            }
            else {
                EXPORT_MODULE(builtins_bitcode_avx1_i64x4_64bit);
            }
            break;
        case 8:
            if (runtime32) {
                EXPORT_MODULE(builtins_bitcode_avx1_32bit);
@@ -1083,7 +1106,7 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
        // serialized version of the stdlib.ispc file to get its
        // definitions added.
        extern char stdlib_mask1_code[], stdlib_mask8_code[];
-        extern char stdlib_mask16_code[], stdlib_mask32_code[];
+        extern char stdlib_mask16_code[], stdlib_mask32_code[], stdlib_mask64_code[];
        if (g->target->getISA() == Target::GENERIC &&
            g->target->getVectorWidth() == 1) { // 1 wide uses 32 stdlib
            yy_scan_string(stdlib_mask32_code);
@@ -1102,6 +1125,9 @@ DefineStdlib(SymbolTable *symbolTable, llvm::LLVMContext *ctx, llvm::Module *mod
            case 32:
                yy_scan_string(stdlib_mask32_code);
                break;
            case 64:
                yy_scan_string(stdlib_mask64_code);
                break;
            default:
                FATAL("Unhandled mask bit size for stdlib.ispc");
            }
--- a/builtins/svml.m4
+++ b/builtins/svml.m4
@@ -0,0 +1,217 @@
 ;; copyright stub  :)
 ;;  Copyright (c) 2013, Intel Corporation
 ;;  All rights reserved.
 ;;
 ;;  Redistribution and use in source and binary forms, with or without
 ;;  modification, are permitted provided that the following conditions are
 ;;  met:
 ;;
 ;;    * Redistributions of source code must retain the above copyright
 ;;      notice, this list of conditions and the following disclaimer.
 ;;
 ;;    * Redistributions in binary form must reproduce the above copyright
 ;;      notice, this list of conditions and the following disclaimer in the
 ;;      documentation and/or other materials provided with the distribution.
 ;;
 ;;    * Neither the name of Intel Corporation nor the names of its
 ;;      contributors may be used to endorse or promote products derived from
 ;;      this software without specific prior written permission.
 ;;
 ;;
 ;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 ;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 ;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 ;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
 ;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 ;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 ;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 ;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 ;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 ;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 ;; svml macro
 ;; svml_stubs : stubs for svml calls
 ;; $1 - type ("float" or "double")
 ;; $2 - svml internal function suffix ("f" for float, "d" for double)
 ;; $3 - vector width
 define(`svml_stubs',`
  ;; Declarations only: no definitions are emitted for these entry points.
  declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone alwaysinline
  declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone alwaysinline
  declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone alwaysinline
  ;; sincos returns void and takes two result pointers, so it cannot be
  ;; readnone: a call to a void readnone function may be removed as dead.
  declare void @__svml_sincos$2(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind alwaysinline
  declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone alwaysinline
  declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone alwaysinline
  declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline
  declare <$3 x $1> @__svml_exp$2(<$3 x $1>) nounwind readnone alwaysinline
  declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone alwaysinline
  declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline
 ')
 ;; svml_declare : declaration of __svml_* intrinsics 
 ;; $1 - type ("float" or "double")
 ;; $2 - __svml_* intrinsic function suffix 
 ;;      float:  "f4"(sse) "f8"(avx) "f16"(avx512)
 ;;      double:  "2"(sse)  "4"(avx)   "8"(avx512)
 ;; $3 - vector width
 define(`svml_declare',`
  ;; External SVML entry points operating on a $3-wide vector of $1.
  declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone
  declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone
  declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone
  ;; sincos hands one result back through its pointer argument in addition
  ;; to the returned vector, so it writes memory and must not be readnone.
  declare <$3 x $1> @__svml_sincos$2(<$3 x $1> *, <$3 x $1>) nounwind
  declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone
  declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone
  declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone
  declare <$3 x $1> @__svml_exp$2(<$3 x $1>) nounwind readnone
  declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone
  declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone
 ')
 ;; definition of __svml_* internal functions
 ;; $1 - type ("float" or "double")
 ;; $2 - __svml_* intrinsic function suffix 
 ;;      float:  "f4"(sse) "f8"(avx) "f16"(avx512)
 ;;      double:  "2"(sse)  "4"(avx)   "8"(avx512)
 ;; $3 - vector width
 ;; $4 - svml internal function suffix ("f" for float, "d" for double)
 define(`svml_define',`
  ;; Each wrapper below forwards its arguments unchanged to the entry point
  ;; carrying the width-specific suffix and returns that result.
  define <$3 x $1> @__svml_sin$4(<$3 x $1>) nounwind readnone alwaysinline {
    %ret = call <$3 x $1> @__svml_sin$2(<$3 x $1> %0)
    ret <$3 x $1> %ret
  }
  define <$3 x $1> @__svml_asin$4(<$3 x $1>) nounwind readnone alwaysinline {
    %ret = call <$3 x $1> @__svml_asin$2(<$3 x $1> %0)
    ret <$3 x $1> %ret
  }
  define <$3 x $1> @__svml_cos$4(<$3 x $1>) nounwind readnone alwaysinline {
    %ret = call <$3 x $1> @__svml_cos$2(<$3 x $1> %0)
    ret <$3 x $1> %ret
  }
  ;; sincos: the entry point writes one result through the pointer passed as
  ;; its first argument (parameter 2 here) and returns the other, which is
  ;; stored through parameter 1. The wrapper therefore writes memory and
  ;; must not be marked readnone.
  define void @__svml_sincos$4(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind alwaysinline {
    %s = call <$3 x $1> @__svml_sincos$2(<$3 x $1> * %2, <$3 x $1> %0)
    store <$3 x $1> %s, <$3 x $1> * %1
    ret void
  }
  define <$3 x $1> @__svml_tan$4(<$3 x $1>) nounwind readnone alwaysinline {
    %ret = call <$3 x $1> @__svml_tan$2(<$3 x $1> %0)
    ret <$3 x $1> %ret
  }
  define <$3 x $1> @__svml_atan$4(<$3 x $1>) nounwind readnone alwaysinline {
    %ret = call <$3 x $1> @__svml_atan$2(<$3 x $1> %0)
    ret <$3 x $1> %ret
  }
  define <$3 x $1> @__svml_atan2$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline {
    %ret = call <$3 x $1> @__svml_atan2$2(<$3 x $1> %0, <$3 x $1> %1)
    ret <$3 x $1> %ret
  }
  define <$3 x $1> @__svml_exp$4(<$3 x $1>) nounwind readnone alwaysinline {
    %ret = call <$3 x $1> @__svml_exp$2(<$3 x $1> %0)
    ret <$3 x $1> %ret
  }
  define <$3 x $1> @__svml_log$4(<$3 x $1>) nounwind readnone alwaysinline {
    %ret = call <$3 x $1> @__svml_log$2(<$3 x $1> %0)
    ret <$3 x $1> %ret
  }
  define <$3 x $1> @__svml_pow$4(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline {
    %ret = call <$3 x $1> @__svml_pow$2(<$3 x $1> %0, <$3 x $1> %1)
    ret <$3 x $1> %ret
  }
 ')
 ;; svml_define_x : definition of __svml_* internal functions operating on extended width
 ;; $1 - type ("float" or "double")
 ;; $2 - __svml_* intrinsic function suffix 
 ;;      float:  "f4"(sse) "f8"(avx) "f16"(avx512)
 ;;      double:  "2"(sse)  "4"(avx)   "8"(avx512)
 ;; $3 - vector width
 ;; $4 - svml internal function suffix ("f" for float, "d" for double)
 ;; $5 - extended width, must be at least twice the native vector width
 ;;      contingent on the existence of the unary$3to$5 and binary$3to$5 macros
 ;; *todo*: in sincos call use __svml_sincos[f][2,4,8,16] call, e.g.
 ;;define void @__svml_sincosf(<8 x float>, <8 x float> *,
 ;;                                    <8 x float> *) nounwind readnone alwaysinline {
 ;;  ; call svml_sincosf4 two times with the two 4-wide sub-vectors
 ;;  %a = shufflevector <8 x float> %0, <8 x float> undef,
 ;;         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ;;  %b = shufflevector <8 x float> %0, <8 x float> undef,
 ;;         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
 ;;
 ;;  %cospa = alloca <4 x float>
 ;;  %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
 ;;
 ;;  %cospb = alloca <4 x float>
 ;;  %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
 ;;
 ;;  %sin = shufflevector <4 x float> %sa, <4 x float> %sb,
 ;;         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
 ;;                    i32 4, i32 5, i32 6, i32 7>
 ;;  store <8 x float> %sin, <8 x float> * %1
 ;;
 ;;  %cosa = load <4 x float> * %cospa
 ;;  %cosb = load <4 x float> * %cospb
 ;;  %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
 ;;         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
 ;;                    i32 4, i32 5, i32 6, i32 7>
 ;;  store <8 x float> %cos, <8 x float> * %2
 ;;
 ;;  ret void
 ;;}
 define(`svml_define_x',`
  ;; Each wrapper takes a $5-wide vector and applies the $3-wide entry point
  ;; through the matching narrow-to-wide helper macro.
  define <$5 x $1> @__svml_sin$4(<$5 x $1>) nounwind readnone alwaysinline {
    unary$3to$5(ret, $1, @__svml_sin$2, %0)
    ret <$5 x $1> %ret
  }
  define <$5 x $1> @__svml_asin$4(<$5 x $1>) nounwind readnone alwaysinline {
    unary$3to$5(ret, $1, @__svml_asin$2, %0)
    ret <$5 x $1> %ret
  }
  define <$5 x $1> @__svml_cos$4(<$5 x $1>) nounwind readnone alwaysinline {
    unary$3to$5(ret, $1, @__svml_cos$2, %0)
    ret <$5 x $1> %ret
  }
  ;; sincos is composed from the wide sin and cos wrappers above and stores
  ;; both results through its pointer parameters, so it writes memory and
  ;; must not be marked readnone.
  define void @__svml_sincos$4(<$5 x $1>,<$5 x $1>*,<$5 x $1>*) nounwind alwaysinline {
    %s = call <$5 x $1> @__svml_sin$4(<$5 x $1> %0)
    %c = call <$5 x $1> @__svml_cos$4(<$5 x $1> %0)
    store <$5 x $1> %s, <$5 x $1> * %1
    store <$5 x $1> %c, <$5 x $1> * %2
    ret void
  }
  define <$5 x $1> @__svml_tan$4(<$5 x $1>) nounwind readnone alwaysinline {
    unary$3to$5(ret, $1, @__svml_tan$2, %0)
    ret <$5 x $1> %ret
  }
  define <$5 x $1> @__svml_atan$4(<$5 x $1>) nounwind readnone alwaysinline {
    unary$3to$5(ret, $1, @__svml_atan$2, %0)
    ret <$5 x $1> %ret
  }
  define <$5 x $1> @__svml_atan2$4(<$5 x $1>,<$5 x $1>) nounwind readnone alwaysinline {
    binary$3to$5(ret, $1, @__svml_atan2$2, %0, %1)
    ret <$5 x $1> %ret
  }
  define <$5 x $1> @__svml_exp$4(<$5 x $1>) nounwind readnone alwaysinline {
    unary$3to$5(ret, $1, @__svml_exp$2, %0)
    ret <$5 x $1> %ret
  }
  define <$5 x $1> @__svml_log$4(<$5 x $1>) nounwind readnone alwaysinline {
    unary$3to$5(ret, $1, @__svml_log$2, %0)
    ret <$5 x $1> %ret
  }
  define <$5 x $1> @__svml_pow$4(<$5 x $1>,<$5 x $1>) nounwind readnone alwaysinline {
    binary$3to$5(ret, $1, @__svml_pow$2, %0, %1)
    ret <$5 x $1> %ret
  }
 ')
--- a/builtins/target-avx-x2.ll
+++ b/builtins/target-avx-x2.ll
@@ -137,19 +137,14 @@ define <16 x float> @__sqrt_varying_float(<16 x float>) nounwind readonly always
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; svml
-; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
+include(`svml.m4')
-; or, use the macro to call the 4-wide ones 4x with our 16-wide
+;; single precision
-; vectors...
+svml_declare(float,f8,8)
 svml_define_x(float,f8,8,f,16)
-declare <16 x float> @__svml_sin(<16 x float>)
+;; double precision
-declare <16 x float> @__svml_cos(<16 x float>)
+svml_declare(double,4,4)
-declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *)
+svml_define_x(double,4,4,d,16)
 declare <16 x float> @__svml_tan(<16 x float>)
 declare <16 x float> @__svml_atan(<16 x float>)
 declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>)
 declare <16 x float> @__svml_exp(<16 x float>)
 declare <16 x float> @__svml_log(<16 x float>)
 declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float min/max
--- a/builtins/target-avx.ll
+++ b/builtins/target-avx.ll
@@ -137,19 +137,14 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; svml
-; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
+include(`svml.m4')
-; or, use the macro to call the 4-wide ones twice with our 8-wide
+;; single precision
-; vectors...
+svml_declare(float,f8,8)
 svml_define(float,f8,8,f)
-declare <8 x float> @__svml_sin(<8 x float>)
+;; double precision
-declare <8 x float> @__svml_cos(<8 x float>)
+svml_declare(double,4,4)
-declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *)
+svml_define_x(double,4,4,d,8)
 declare <8 x float> @__svml_tan(<8 x float>)
 declare <8 x float> @__svml_atan(<8 x float>)
 declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>)
 declare <8 x float> @__svml_exp(<8 x float>)
 declare <8 x float> @__svml_log(<8 x float>)
 declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float min/max
--- a/builtins/target-avx1-i64x4.ll
+++ b/builtins/target-avx1-i64x4.ll
@@ -0,0 +1,81 @@
 ;;  Copyright (c) 2013, Intel Corporation
 ;;  All rights reserved.
 ;;
 ;;  Redistribution and use in source and binary forms, with or without
 ;;  modification, are permitted provided that the following conditions are
 ;;  met:
 ;;
 ;;    * Redistributions of source code must retain the above copyright
 ;;      notice, this list of conditions and the following disclaimer.
 ;;
 ;;    * Redistributions in binary form must reproduce the above copyright
 ;;      notice, this list of conditions and the following disclaimer in the
 ;;      documentation and/or other materials provided with the distribution.
 ;;
 ;;    * Neither the name of Intel Corporation nor the names of its
 ;;      contributors may be used to endorse or promote products derived from
 ;;      this software without specific prior written permission.
 ;;
 ;;
 ;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 ;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 ;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 ;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
 ;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 ;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 ;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 ;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 ;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 ;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 include(`target-avx1-i64x4base.ll')
 rdrand_decls()
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; int min/max
 ;; Lane-wise signed 32-bit minimum of the two operands.
 define <4 x i32> @__min_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  ; the unnamed parameters are %0 and %1
  %smin = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %smin
 }
 ;; Lane-wise signed 32-bit maximum of the two operands.
 define <4 x i32> @__max_varying_int32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  ; the unnamed parameters are %0 and %1
  %smax = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %smax
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unsigned int min/max
 ;; Lane-wise unsigned 32-bit minimum of the two operands.
 define <4 x i32> @__min_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  ; the unnamed parameters are %0 and %1
  %umin = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %umin
 }
 ;; Lane-wise unsigned 32-bit maximum of the two operands.
 define <4 x i32> @__max_varying_uint32(<4 x i32>, <4 x i32>) nounwind readonly alwaysinline {
  ; the unnamed parameters are %0 and %1
  %umax = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %0, <4 x i32> %1)
  ret <4 x i32> %umax
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; half conversion routines
 ;; Declarations for the half <-> float conversion helpers (uniform and
 ;; varying forms). They are emitted unless the no-half-declares flag tested
 ;; below expands to 1, in which case nothing is emitted here.
 ifelse(NO_HALF_DECLARES, `1', `', `
 declare float @__half_to_float_uniform(i16 %v) nounwind readnone
 declare <WIDTH x float> @__half_to_float_varying(<WIDTH x i16> %v) nounwind readnone
 declare i16 @__float_to_half_uniform(float %v) nounwind readnone
 declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind readnone
 ')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather
 ;; One expansion per element type: 8/16/32/64-bit integers, float and double.
 ;; The macro itself is defined outside this file; presumably each expansion
 ;; emits the gather routines for that element type (to be confirmed there).
 gen_gather_factored(i8)
 gen_gather_factored(i16)
 gen_gather_factored(i32)
 gen_gather_factored(float)
 gen_gather_factored(i64)
 gen_gather_factored(double)
--- a/builtins/target-avx1-i64x4base.ll
+++ b/builtins/target-avx1-i64x4base.ll
@@ -0,0 +1,513 @@
 ;;  Copyright (c) 2013, Intel Corporation
 ;;  All rights reserved.
 ;;
 ;;  Redistribution and use in source and binary forms, with or without
 ;;  modification, are permitted provided that the following conditions are
 ;;  met:
 ;;
 ;;    * Redistributions of source code must retain the above copyright
 ;;      notice, this list of conditions and the following disclaimer.
 ;;
 ;;    * Redistributions in binary form must reproduce the above copyright
 ;;      notice, this list of conditions and the following disclaimer in the
 ;;      documentation and/or other materials provided with the distribution.
 ;;
 ;;    * Neither the name of Intel Corporation nor the names of its
 ;;      contributors may be used to endorse or promote products derived from
 ;;      this software without specific prior written permission.
 ;;
 ;;
 ;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 ;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 ;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 ;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
 ;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 ;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 ;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 ;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 ;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 ;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 ;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Basic 4-wide definitions
 ;; Target configuration followed by the shared helpers:
 ;;   - WIDTH is set to 4, the vector lane count used throughout this file
 ;;   - MASK is set to i64, the per-lane mask element type
 ;;   - util.m4 is included and four helper macros are expanded; presumably
 ;;     they are defined there, and their output is not visible from here
 ;;   - target-avx-common.ll is included last
 define(`WIDTH',`4')
 define(`MASK',`i64')
 include(`util.m4')
 stdlib_core()
 packed_load_and_store()
 scans()
 int64minmax()
 include(`target-avx-common.ll')
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
 ;; sse intrinsic
 declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
 ;; Reciprocal of each lane: hardware estimate plus one Newton-Raphson step.
 define <4 x float> @__rcp_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; initial estimate r of 1/v from the rcp instruction
  %approx = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %0)
  ; refinement: r * (2 - v * r)
  %v_times_r = fmul <4 x float> %0, %approx
  %correction = fsub <4 x float> <float 2., float 2., float 2., float 2.>, %v_times_r
  %refined = fmul <4 x float> %approx, %correction
  ret <4 x float> %refined
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding floats
 ;; sse intrinsic
 declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
 ;; Round each float lane to the nearest integral value.
 define <4 x float> @__round_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; immediate 8 = 0b1000: mode bits 00 (nearest), precision exceptions not signalled
  %rounded = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 8)
  ret <4 x float> %rounded
 }
 ;; Round each float lane down to an integral value.
 define <4 x float> @__floor_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; immediate 9 = 0b1001: mode bits 01 (round down), precision exceptions not signalled
  %floored = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 9)
  ret <4 x float> %floored
 }
 ;; Round each float lane up to an integral value.
 define <4 x float> @__ceil_varying_float(<4 x float>) nounwind readonly alwaysinline {
  ; immediate 10 = 0b1010: mode bits 10 (round up), precision exceptions not signalled
  %ceiled = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %0, i32 10)
  ret <4 x float> %ceiled
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rounding doubles
 ;; avx intrinsic
 declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
 ;; Round each double lane to the nearest integral value.
 define <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
  ; immediate 8 = 0b1000: mode bits 00 (nearest), precision exceptions not signalled
  %rounded = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 8)
  ret <4 x double> %rounded
 }
 ;; Round each double lane down to an integral value.
 define <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
  ; immediate 9 = 0b1001: mode bits 01 (round down), precision exceptions not signalled
  %floored = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 9)
  ret <4 x double> %floored
 }
 ;; Round each double lane up to an integral value.
 define <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
  ; immediate 10 = 0b1010: mode bits 10 (round up), precision exceptions not signalled
  %ceiled = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %0, i32 10)
  ret <4 x double> %ceiled
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rsqrt
 ;; sse intrinsic
 declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
 ;; Reciprocal square root of each lane: hardware estimate plus one
 ;; Newton-Raphson step.
 define <4 x float> @__rsqrt_varying_float(<4 x float> %v) nounwind readonly alwaysinline {
  ; initial estimate e from the rsqrt instruction
  %est = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %v)
  ; refinement: 0.5 * e * (3 - (v * e) * e)
  %v_est = fmul <4 x float> %v, %est
  %v_est_sq = fmul <4 x float> %v_est, %est
  %residual = fsub <4 x float> <float 3., float 3., float 3., float 3.>, %v_est_sq
  %scaled = fmul <4 x float> %est, %residual
  %result = fmul <4 x float> <float 0.5, float 0.5, float 0.5, float 0.5>, %scaled
  ret <4 x float> %result
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; sqrt
 ;; sse intrinsic
 declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
 ;; Square root of each float lane.
 define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysinline {
  %root = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %0)
  ret <4 x float> %root
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; double precision sqrt
 ;; avx intrinsic
 declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
 ;; Square root of each double lane.
 define <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
  %root = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %0)
  ret <4 x double> %root
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; svml
 ;; Bind the 4-wide math wrappers to the SVML entry points: the float
 ;; wrappers (suffix f) call the f4 entry points and the double wrappers
 ;; (suffix d) call the entry points with suffix 4.
 include(`svml.m4')
 ;; single precision
 svml_declare(float,f4,4)
 svml_define(float,f4,4,f)
 ;; double precision
 svml_declare(double,4,4)
 svml_define(double,4,4,d)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float min/max
 ;; sse intrinsics
 declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
 declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
 ;; Lane-wise float maximum of the two operands.
 define <4 x float> @__max_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
  %fmax = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %0, <4 x float> %1)
  ret <4 x float> %fmax
 }
 ;; Lane-wise float minimum of the two operands.
 define <4 x float> @__min_varying_float(<4 x float>, <4 x float>) nounwind readonly alwaysinline {
  %fmin = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %0, <4 x float> %1)
  ret <4 x float> %fmin
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops
 ;; avx intrinsic
 declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone
 ;; Collect the sign bit of each 64-bit mask lane into an integer bitmask,
 ;; widened to i64.
 define i64 @__movmsk(<4 x i64>) nounwind readnone alwaysinline {
  ; the intrinsic takes doubles, so reinterpret the lanes without changing bits
  %as_double = bitcast <4 x i64> %0 to <4 x double>
  %bits = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %as_double) nounwind readnone
  %bits64 = zext i32 %bits to i64
  ret i64 %bits64
 }
 ;; True when at least one mask lane has its sign bit set.
 define i1 @__any(<4 x i64>) nounwind readnone alwaysinline {
  %as_double = bitcast <4 x i64> %0 to <4 x double>
  %bits = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %as_double) nounwind readnone
  %nonzero = icmp ne i32 %bits, 0
  ret i1 %nonzero
 }
 ;; True when every one of the four mask lanes has its sign bit set.
 define i1 @__all(<4 x i64>) nounwind readnone alwaysinline {
  %as_double = bitcast <4 x i64> %0 to <4 x double>
  %bits = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %as_double) nounwind readnone
  ; 15 = 0b1111, one bit per lane
  %full = icmp eq i32 %bits, 15
  ret i1 %full
 }
 ;; True when no mask lane has its sign bit set.
 define i1 @__none(<4 x i64>) nounwind readnone alwaysinline {
  %as_double = bitcast <4 x i64> %0 to <4 x double>
  %bits = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %as_double) nounwind readnone
  %empty = icmp eq i32 %bits, 0
  ret i1 %empty
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal float ops
 ;; sse intrinsic
 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
 ;; Sum of the four float lanes.
 define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
  ; first pass adds adjacent pairs, second pass adds the pair sums
  %pairs = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0)
  %total = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %pairs, <4 x float> %pairs)
  %sum = extractelement <4 x float> %total, i32 0
  ret float %sum
 }
 ;; Horizontal minimum over the four float lanes. The body comes from a
 ;; 4-wide reduction macro that is handed the varying and uniform float
 ;; minimum helpers; the macro is defined outside this file.
 define float @__reduce_min_float(<4 x float>) nounwind readnone {
  reduce4(float, @__min_varying_float, @__min_uniform_float)
 }
 ;; Horizontal maximum over the four float lanes, built the same way from
 ;; the varying and uniform float maximum helpers.
 define float @__reduce_max_float(<4 x float>) nounwind readnone {
  reduce4(float, @__max_varying_float, @__max_uniform_float)
 }
 ;; Macro expansion for 4 lanes; presumably emits the equality reduction
 ;; helpers (to be confirmed against the macro definition).
 reduce_equal(4)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal int8 ops
 declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
 ;; Sum of the four int8 lanes, returned as a 16-bit value.
 define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline {
  ; widen to 16 bytes: indices 0-3 pick the input lanes, index 4 picks the
  ; first element of the zero vector for the remaining 12 positions
  %padded = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
      <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4,
                  i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  ; sum of absolute differences against zero yields one partial sum per
  ; 8-byte half
  %sad = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %padded,
                                              <16 x i8> zeroinitializer)
  %lo = extractelement <2 x i64> %sad, i32 0
  %hi = extractelement <2 x i64> %sad, i32 1
  %sum64 = add i64 %lo, %hi
  %sum16 = trunc i64 %sum64 to i16
  ret i16 %sum16
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal int16 ops
 ;; Lane-wise i16 addition of two 4-wide vectors; used as the vector step of
 ;; the int16 add reduction.
 define internal <4 x i16> @__add_varying_i16(<4 x i16>, <4 x i16>) nounwind readnone alwaysinline {
  %lane_sum = add <4 x i16> %0, %1
  ret <4 x i16> %lane_sum
 }
 ;; Scalar i16 addition; used as the scalar step of the int16 add reduction.
 define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
  %sum = add i16 %0, %1
  ret i16 %sum
 }
 ;; Intended as the sum of the four i16 lanes.  The body is produced by the
 ;; 4-wide reduction macro (defined outside this chunk) from the vector and
 ;; scalar i16 add helpers.
 define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
  reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal int32 ops
 ;; Lane-wise i32 addition of two 4-wide vectors.
 define <4 x i32> @__add_varying_int32(<4 x i32>, <4 x i32>) nounwind readnone alwaysinline {
  %lane_sum = add <4 x i32> %0, %1
  ret <4 x i32> %lane_sum
 }
 ;; Scalar i32 addition.
 define i32 @__add_uniform_int32(i32, i32) nounwind readnone alwaysinline {
  %sum = add i32 %0, %1
  ret i32 %sum
 }
 ;; Intended as the sum of the four i32 lanes; body produced by the 4-wide
 ;; reduction macro (defined outside this chunk) from the i32 add helpers.
 define i32 @__reduce_add_int32(<4 x i32>) nounwind readnone alwaysinline {
  reduce4(i32, @__add_varying_int32, @__add_uniform_int32)
 }
 ;; Intended as the signed minimum of the four i32 lanes; body produced by the
 ;; 4-wide reduction macro (defined outside this chunk).
 define i32 @__reduce_min_int32(<4 x i32>) nounwind readnone alwaysinline {
  reduce4(i32, @__min_varying_int32, @__min_uniform_int32)
 }
 ;; Intended as the signed maximum of the four i32 lanes; body produced by the
 ;; 4-wide reduction macro (defined outside this chunk).
 define i32 @__reduce_max_int32(<4 x i32>) nounwind readnone alwaysinline {
  reduce4(i32, @__max_varying_int32, @__max_uniform_int32)
 }
 ;; Intended as the unsigned minimum of the four i32 lanes; body produced by
 ;; the 4-wide reduction macro (defined outside this chunk).
 define i32 @__reduce_min_uint32(<4 x i32>) nounwind readnone alwaysinline {
  reduce4(i32, @__min_varying_uint32, @__min_uniform_uint32)
 }
 ;; Intended as the unsigned maximum of the four i32 lanes; body produced by
 ;; the 4-wide reduction macro (defined outside this chunk).
 define i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone alwaysinline {
  reduce4(i32, @__max_varying_uint32, @__max_uniform_uint32)
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal double ops
 declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
 ;; Sums the four double lanes of %0 and returns the scalar total.
 ;; hadd.pd.256 applied to vectors a and b yields <a0+a1, b0+b1, a2+a3, b2+b3>,
 ;; so a single horizontal add of %0 with itself leaves the two pair sums in
 ;; lanes 0 and 2; one scalar fadd then combines them.
 define double @__reduce_add_double(<4 x double>) nounwind readonly alwaysinline {
  %pairs = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %0, <4 x double> %0)
  %lo = extractelement <4 x double> %pairs, i32 0
  %hi = extractelement <4 x double> %pairs, i32 2
  %sum = fadd double %lo, %hi
  ret double %sum
 }
 ;; Intended as the minimum of the four double lanes; body produced by the
 ;; 4-wide reduction macro (defined outside this chunk).
 define double @__reduce_min_double(<4 x double>) nounwind readnone alwaysinline {
  reduce4(double, @__min_varying_double, @__min_uniform_double)
 }
 ;; Intended as the maximum of the four double lanes; body produced by the
 ;; 4-wide reduction macro (defined outside this chunk).
 define double @__reduce_max_double(<4 x double>) nounwind readnone alwaysinline {
  reduce4(double, @__max_varying_double, @__max_uniform_double)
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; horizontal int64 ops
 ;; Lane-wise i64 addition of two 4-wide vectors.
 define <4 x i64> @__add_varying_int64(<4 x i64>, <4 x i64>) nounwind readnone alwaysinline {
  %lane_sum = add <4 x i64> %0, %1
  ret <4 x i64> %lane_sum
 }
 ;; Scalar i64 addition.
 define i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
  %sum = add i64 %0, %1
  ret i64 %sum
 }
 ;; Intended as the sum of the four i64 lanes; body produced by the 4-wide
 ;; reduction macro (defined outside this chunk) from the i64 add helpers.
 define i64 @__reduce_add_int64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__add_varying_int64, @__add_uniform_int64)
 }
 ;; Intended as the signed minimum of the four i64 lanes; body produced by the
 ;; 4-wide reduction macro (defined outside this chunk).
 define i64 @__reduce_min_int64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
 }
 ;; Intended as the signed maximum of the four i64 lanes; body produced by the
 ;; 4-wide reduction macro (defined outside this chunk).
 define i64 @__reduce_max_int64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
 }
 ;; Intended as the unsigned minimum of the four i64 lanes; body produced by
 ;; the 4-wide reduction macro (defined outside this chunk).
 define i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
 }
 ;; Intended as the unsigned maximum of the four i64 lanes; body produced by
 ;; the 4-wide reduction macro (defined outside this chunk).
 define i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone alwaysinline {
  reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
 ; no masked load instruction for i8 and i16 types??
 ;; The i8 and i16 masked loads come from a shared macro (defined outside this
 ;; chunk) rather than from an intrinsic; the second argument is presumably
 ;; the element size in bytes -- TODO confirm against the macro.
 masked_load(i8,  1)
 masked_load(i16, 2)
 ;; avx intrinsics
 declare <4 x float> @llvm.x86.avx.maskload.ps(i8 *, <4 x float> %mask)
 declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8 *, <4 x double> %mask)
 ;; Masked load of four i32 lanes from the address in %0.  The 64-bit mask
 ;; lanes are truncated to 32 bits so the 128-bit maskload.ps intrinsic can be
 ;; used; its float result is reinterpreted as i32.
 define <4 x i32> @__masked_load_i32(i8 *, <4 x i64> %mask64) nounwind alwaysinline {
  %mask32 = trunc <4 x i64> %mask64 to <4 x i32>
  %mask_ps = bitcast <4 x i32> %mask32 to <4 x float>
  %loaded_ps = call <4 x float> @llvm.x86.avx.maskload.ps(i8 * %0, <4 x float> %mask_ps)
  %loaded = bitcast <4 x float> %loaded_ps to <4 x i32>
  ret <4 x i32> %loaded
 }
 ;; Masked load of four i64 lanes from the address in %0 using the 256-bit
 ;; maskload.pd intrinsic; mask and result are reinterpreted between i64 and
 ;; double lanes without changing any bits.
 define <4 x i64> @__masked_load_i64(i8 *, <4 x i64> %mask) nounwind alwaysinline {
  %mask_pd = bitcast <4 x i64> %mask to <4 x double>
  %loaded_pd = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8 * %0, <4 x double> %mask_pd)
  %loaded = bitcast <4 x double> %loaded_pd to <4 x i64>
  ret <4 x i64> %loaded
 }
 ;; Float and double masked loads, emitted by a shared macro defined outside
 ;; this chunk (presumably thin wrappers over the i32 and i64 versions --
 ;; TODO confirm).
 masked_load_float_double()
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store
 ;; Masked stores for i8 and i16 lanes, emitted by a shared macro defined
 ;; outside this chunk.
 gen_masked_store(i8)
 gen_masked_store(i16)
 ; note that mask is the 2nd parameter, not the 3rd one!!
 ;; avx intrinsics
 declare void @llvm.x86.avx.maskstore.ps    (i8 *, <4 x float>,  <4 x float>)
 declare void @llvm.x86.avx.maskstore.pd.256(i8 *, <4 x double>, <4 x double>)
 ;; Masked store of four i32 lanes (%1) to the vector at %0 under the 64-bit
 ;; mask %2.  The mask is truncated to 32-bit lanes and everything is
 ;; reinterpreted as float so the 128-bit maskstore.ps intrinsic can be used;
 ;; the intrinsic takes the mask before the value.
 define void @__masked_store_i32(<4 x i32>* nocapture, <4 x i32>,
                                 <4 x i64>) nounwind alwaysinline {
  %dst = bitcast <4 x i32> * %0 to i8 *
  %value_ps = bitcast <4 x i32> %1 to <4 x float>
  %narrow_mask = trunc <4 x i64> %2 to <4 x i32>
  %mask_ps = bitcast <4 x i32> %narrow_mask to <4 x float>
  call void @llvm.x86.avx.maskstore.ps(i8 * %dst, <4 x float> %mask_ps, <4 x float> %value_ps)
  ret void
 }
 ;; Masked store of four i64 lanes (%1) to the vector at %0 under the mask %2,
 ;; via the 256-bit maskstore.pd intrinsic.  Value and mask are reinterpreted
 ;; as double lanes; the intrinsic takes the mask before the value.
 define void @__masked_store_i64(<4 x i64>* nocapture, <4 x i64>,
                                 <4 x i64>) nounwind alwaysinline {
  %dst = bitcast <4 x i64> * %0 to i8 *
  %mask_pd = bitcast <4 x i64> %2 to <4 x double>
  %value_pd = bitcast <4 x i64> %1 to <4 x double>
  call void @llvm.x86.avx.maskstore.pd.256(i8 * %dst, <4 x double> %mask_pd, <4 x double> %value_pd)
  ret void
 }
 ;; Emits the blend-style masked stores for i8 and i16 lanes taking a
 ;; 4 x i64 mask; the macro lives in the shared m4 helper file.
 masked_store_blend_8_16_by_4_mask64()
 ;; sse intrinsic
 declare <4 x float>  @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>,
                                             <4 x float>) nounwind readnone
 ;; Blend-style masked store of four i32 lanes: loads the current contents of
 ;; %0, merges in the new value %1 under the mask %2 (truncated from i64 to
 ;; i32 lanes) with blendvps, and writes the merged vector back to %0.
 define void @__masked_store_blend_i32(<4 x i32>* nocapture, <4 x i32>,
                                       <4 x i64>) nounwind alwaysinline {
  %current     = load    <4 x i32>* %0, align 4
  %current_ps  = bitcast <4 x i32> %current to <4 x float>
  %incoming_ps = bitcast <4 x i32> %1 to <4 x float>
  %mask32      = trunc   <4 x i64> %2 to <4 x i32>
  %mask_ps     = bitcast <4 x i32> %mask32 to <4 x float>
  %merged_ps   = call    <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %current_ps,
                                                             <4 x float> %incoming_ps,
                                                             <4 x float> %mask_ps)
  %merged = bitcast <4 x float> %merged_ps to <4 x i32>
  store <4 x i32> %merged, <4 x i32>* %0, align 4
  ret void
 }
 ;; avx intrinsic
 declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>,
                                                <4 x double>) nounwind readnone
 ;; Blend-style masked store of four i64 lanes: loads the current contents of
 ;; %0, merges in the new value %1 under the mask %2 with the 256-bit
 ;; blendv.pd intrinsic, and writes the merged vector back to %0.
 define void @__masked_store_blend_i64(<4 x i64>* nocapture, <4 x i64>,
                                       <4 x i64>) nounwind alwaysinline {
  %current     = load    <4 x i64>* %0, align 4
  %current_pd  = bitcast <4 x i64> %current to <4 x double>
  %incoming_pd = bitcast <4 x i64> %1 to <4 x double>
  %mask_pd     = bitcast <4 x i64> %2 to <4 x double>
  %merged_pd   = call    <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %current_pd,
                                                                 <4 x double> %incoming_pd,
                                                                 <4 x double> %mask_pd)
  %merged = bitcast <4 x double> %merged_pd to <4 x i64>
  store <4 x i64> %merged, <4 x i64>* %0, align 4
  ret void
 }
 ;; Float and double masked stores, emitted by a shared macro defined outside
 ;; this chunk (presumably thin wrappers over the i32 and i64 versions --
 ;; TODO confirm).
 masked_store_float_double()
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; scatter
 ;; One scatter implementation per element type, each emitted by a shared
 ;; macro defined outside this chunk.
 gen_scatter(i8)
 gen_scatter(i16)
 gen_scatter(i32)
 gen_scatter(float)
 gen_scatter(i64)
 gen_scatter(double)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; double precision min/max
 declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
 declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
 ;; Lane-wise minimum of two 4 x double vectors via the AVX min.pd.256
 ;; intrinsic.
 define <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline {
  %lane_min = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %0, <4 x double> %1)
  ret <4 x double> %lane_min
 }
 ;; Lane-wise maximum of two 4 x double vectors via the AVX max.pd.256
 ;; intrinsic.
 define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone alwaysinline {
  %lane_max = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %0, <4 x double> %1)
  ret <4 x double> %lane_max
 }
--- a/builtins/target-generic-1.ll
+++ b/builtins/target-generic-1.ll
@@ -310,6 +310,7 @@ declare double @round (double) nounwind readnone
 ;declare float     @llvm.sqrt.f32(float %Val)
 declare double    @llvm.sqrt.f64(double %Val)
 declare float     @llvm.sin.f32(float %Val)
 declare float     @llvm.asin.f32(float %Val)
 declare float     @llvm.cos.f32(float %Val)
 declare float     @llvm.sqrt.f32(float %Val)
 declare float     @llvm.exp.f32(float %Val)
@@ -651,7 +652,18 @@ define  <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; svml stuff
-define  <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
+;; Double-precision math entry points (d suffix) for the 1-wide target.
+;; Only declared here, so they must operate on double lanes.  The sincos
+;; variant writes through its two pointer arguments and therefore must not
+;; be marked readnone.
+declare  <1 x double> @__svml_sind(<1 x double>) nounwind readnone alwaysinline
 declare  <1 x double> @__svml_asind(<1 x double>) nounwind readnone alwaysinline
 declare  <1 x double> @__svml_cosd(<1 x double>) nounwind readnone alwaysinline
 declare  void @__svml_sincosd(<1 x double>, <1 x double> *, <1 x double> *) nounwind alwaysinline
 declare  <1 x double> @__svml_tand(<1 x double>) nounwind readnone alwaysinline
 declare  <1 x double> @__svml_atand(<1 x double>) nounwind readnone alwaysinline
 declare  <1 x double> @__svml_atan2d(<1 x double>, <1 x double>) nounwind readnone alwaysinline
 declare  <1 x double> @__svml_expd(<1 x double>) nounwind readnone alwaysinline
 declare  <1 x double> @__svml_logd(<1 x double>) nounwind readnone alwaysinline
 declare  <1 x double> @__svml_powd(<1 x double>, <1 x double>) nounwind readnone alwaysinline
 define  <1 x float> @__svml_sinf(<1 x float>) nounwind readnone alwaysinline {
  ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0)
  ;ret <1 x float> %ret
  ;%r = extractelement <1 x float> %0, i32 0
@@ -662,7 +674,18 @@ define  <1 x float> @__svml_sin(<1 x float>) nounwind readnone alwaysinline {
 }
-define  <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
+;; Single-lane float arcsine.  Only the macro call on the last body line is
+;; live; the commented-out lines before it are alternative implementations
+;; kept for reference.  The macro is defined outside this chunk and
+;; presumably applies the named scalar function to the one lane.
+define  <1 x float> @__svml_asinf(<1 x float>) nounwind readnone alwaysinline {
  ;%ret = call <1 x float> @__svml_asinf4(<1 x float> %0)
  ;ret <1 x float> %ret
  ;%r = extractelement <1 x float> %0, i32 0
  ;%s = call float @llvm.asin.f32(float %r)
  ;%rv = insertelement <1 x float> undef, float %r, i32 0
  ;ret <1 x float> %rv
  unary1to1(float,@llvm.asin.f32)
 }
 define  <1 x float> @__svml_cosf(<1 x float>) nounwind readnone alwaysinline {
  ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0)
  ;ret <1 x float> %ret
  ;%r = extractelement <1 x float> %0, i32 0
@@ -673,18 +696,18 @@ define  <1 x float> @__svml_cos(<1 x float>) nounwind readnone alwaysinline {
 }
-define  void @__svml_sincos(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline {
+;; Single-lane sincos: evaluates sine and cosine of %0 with the 1-wide
+;; wrappers and stores them through %1 (sine) and %2 (cosine).
+;; Not marked readnone: the function writes through its pointer arguments,
+;; and readnone would let the optimizer treat a call to it as dead.
+define  void @__svml_sincosf(<1 x float>, <1 x float> *, <1 x float> *) nounwind alwaysinline {
 ;  %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0)
 ;  store <1 x float> %s, <1 x float> * %1
 ;  ret void
-   %sin = call <1 x float> @__svml_sin (<1 x float> %0)
+   %sin = call <1 x float> @__svml_sinf(<1 x float> %0)
-   %cos = call <1 x float> @__svml_cos (<1 x float> %0)
+   %cos = call <1 x float> @__svml_cosf(<1 x float> %0)
   store <1 x float> %sin, <1 x float> * %1
   store <1 x float> %cos, <1 x float> * %2
   ret void
 }
-define  <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline {
+define  <1 x float> @__svml_tanf(<1 x float>) nounwind readnone alwaysinline {
  ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0)
  ;ret <1 x float> %ret
  ;%r = extractelement <1 x float> %0, i32 0
@@ -696,7 +719,7 @@ define  <1 x float> @__svml_tan(<1 x float>) nounwind readnone alwaysinline {
  ret <1 x float > %0
 }
-define  <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline {
+define  <1 x float> @__svml_atanf(<1 x float>) nounwind readnone alwaysinline {
 ;  %ret = call <1 x float> @__svml_atanf4(<1 x float> %0)
 ;  ret <1 x float> %ret
  ;%r = extractelement <1 x float> %0, i32 0
@@ -709,7 +732,7 @@ define  <1 x float> @__svml_atan(<1 x float>) nounwind readnone alwaysinline {
 }
-define  <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
+define  <1 x float> @__svml_atan2f(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
  ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1)
  ;ret <1 x float> %ret
  ;%y = extractelement <1 x float> %0, i32 0
@@ -722,19 +745,19 @@ define  <1 x float> @__svml_atan2(<1 x float>, <1 x float>) nounwind readnone al
  ret <1 x float > %0
 }
-define  <1 x float> @__svml_exp(<1 x float>) nounwind readnone alwaysinline {
+;; Single-lane float exponential.  Only the macro call is live; it is handed
+;; the llvm.exp.f32 intrinsic and presumably applies it to the one lane
+;; (macro defined outside this chunk).
+define  <1 x float> @__svml_expf(<1 x float>) nounwind readnone alwaysinline {
  ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0)
  ;ret <1 x float> %ret
  unary1to1(float, @llvm.exp.f32)
 }
-define  <1 x float> @__svml_log(<1 x float>) nounwind readnone alwaysinline {
+;; Single-lane float logarithm.  Only the macro call is live; it is handed
+;; the llvm.log.f32 intrinsic and presumably applies it to the one lane
+;; (macro defined outside this chunk).
+define  <1 x float> @__svml_logf(<1 x float>) nounwind readnone alwaysinline {
  ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0)
  ;ret <1 x float> %ret
  unary1to1(float, @llvm.log.f32)
 }
-define  <1 x float> @__svml_pow(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
+define  <1 x float> @__svml_powf(<1 x float>, <1 x float>) nounwind readnone alwaysinline {
  ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1)
  ;ret <1 x float> %ret
  %r = extractelement <1 x float> %0, i32 0
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -202,21 +202,15 @@ declare i64 @__count_trailing_zeros_i64(i64) nounwind readnone
 declare i32 @__count_leading_zeros_i32(i32) nounwind readnone
 declare i64 @__count_leading_zeros_i64(i64) nounwind readnone
 ;; svml
 ; FIXME: need either to wire these up to the 8-wide SVML entrypoints,
 ; or, use the macro to call the 4-wide ones twice with our 8-wide
 ; vectors...
-declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
+;; svml
-declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
+
-declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
+include(`svml.m4')
-declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
+svml_stubs(float,f,WIDTH)
-declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
+svml_stubs(double,d,WIDTH)
 declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
 declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
 declare <WIDTH x float> @__svml_log(<WIDTH x float>)
 declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; reductions
--- a/builtins/target-neon-common.ll
+++ b/builtins/target-neon-common.ll
@@ -316,15 +316,10 @@ define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture %ptr,
 ;; yuck.  We need declarations of these, even though we shouldnt ever
 ;; actually generate calls to them for the NEON target...
-declare <WIDTH x float> @__svml_sin(<WIDTH x float>)
+
-declare <WIDTH x float> @__svml_cos(<WIDTH x float>)
+include(`svml.m4')
-declare void @__svml_sincos(<WIDTH x float>, <WIDTH x float> *, <WIDTH x float> *)
+svml_stubs(float,f,WIDTH)
-declare <WIDTH x float> @__svml_tan(<WIDTH x float>)
+svml_stubs(double,d,WIDTH)
 declare <WIDTH x float> @__svml_atan(<WIDTH x float>)
 declare <WIDTH x float> @__svml_atan2(<WIDTH x float>, <WIDTH x float>)
 declare <WIDTH x float> @__svml_exp(<WIDTH x float>)
 declare <WIDTH x float> @__svml_log(<WIDTH x float>)
 declare <WIDTH x float> @__svml_pow(<WIDTH x float>, <WIDTH x float>)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather
--- a/builtins/target-sse2-x2.ll
+++ b/builtins/target-sse2-x2.ll
@@ -105,87 +105,14 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; svml stuff
-declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
+include(`svml.m4')
-declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
+;; single precision
-declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
+svml_declare(float,f4,4)
-declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
+svml_define_x(float,f4,4,f,8)
 declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
 declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
 declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
 declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
 declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
-
+;; double precision
-define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
+svml_declare(double,2,2)
-  unary4to8(ret, float, @__svml_sinf4, %0)
+svml_define_x(double,2,2,d,8)
  ret <8 x float> %ret
 }
 define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_cosf4, %0)
  ret <8 x float> %ret
 }
 ;; 8-wide sincos built from two calls to the 4-wide SVML routine.  Each call
 ;; returns a sine vector and writes a cosine vector to a stack temporary; the
 ;; halves are concatenated and stored through %1 (sine) and %2 (cosine).
 ;; Not marked readnone: the function writes through its pointer arguments,
 ;; and readnone would let the optimizer treat a call to it as dead.
 define void @__svml_sincos(<8 x float>, <8 x float> *,
                            <8 x float> *) nounwind alwaysinline {
  ; split the input into its low and high 4-wide halves
  %a = shufflevector <8 x float> %0, <8 x float> undef,
         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b = shufflevector <8 x float> %0, <8 x float> undef,
         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ; one 4-wide call per half; the cosine comes back through the alloca
  %cospa = alloca <4 x float>
  %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
  %cospb = alloca <4 x float>
  %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
  ; reassemble and store the 8-wide sine
  %sin = shufflevector <4 x float> %sa, <4 x float> %sb,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                    i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %sin, <8 x float> * %1
  ; reassemble and store the 8-wide cosine
  %cosa = load <4 x float> * %cospa
  %cosb = load <4 x float> * %cospb
  %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                    i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %cos, <8 x float> * %2
  ret void
 }
 define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_tanf4, %0)
  ret <8 x float> %ret
 }
 define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_atanf4, %0)
  ret <8 x float> %ret
 }
 define <8 x float> @__svml_atan2(<8 x float>,
                                          <8 x float>) nounwind readnone alwaysinline {
  binary4to8(ret, float, @__svml_atan2f4, %0, %1)
  ret <8 x float> %ret
 }
 define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_expf4, %0)
  ret <8 x float> %ret
 }
 define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_logf4, %0)
  ret <8 x float> %ret
 }
 define <8 x float> @__svml_pow(<8 x float>,
                                        <8 x float>) nounwind readnone alwaysinline {
  binary4to8(ret, float, @__svml_powf4, %0, %1)
  ret <8 x float> %ret
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
--- a/builtins/target-sse2.ll
+++ b/builtins/target-sse2.ll
@@ -496,62 +496,15 @@ define <4 x float> @__sqrt_varying_float(<4 x float>) nounwind readonly alwaysin
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; svml stuff
-declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
+include(`svml.m4')
-declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
+;; single precision
-declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
+svml_declare(float,f4,4)
-declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
+svml_define(float,f4,4,f)
 declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
 declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
 declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
 declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
 declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
 ;; double precision
 svml_declare(double,2,2)
 svml_define_x(double,2,2,d,4)
 define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
  ret <4 x float> %ret
 }
 define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
  ret <4 x float> %ret
 }
 ;; 4-wide sincos: forwards to the SVML routine, handing it %2 as its pointer
 ;; argument (presumably the cosine destination), and stores the returned
 ;; vector through %1.
 ;; Not marked readnone: the function writes through its pointer arguments,
 ;; and readnone would let the optimizer treat a call to it as dead.
 define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind alwaysinline {
  %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
  store <4 x float> %s, <4 x float> * %1
  ret void
 }
 define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
  ret <4 x float> %ret
 }
 define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
  ret <4 x float> %ret
 }
 define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
  ret <4 x float> %ret
 }
 define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_expf4(<4 x float> %0)
  ret <4 x float> %ret
 }
 define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_logf4(<4 x float> %0)
  ret <4 x float> %ret
 }
 define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
  ret <4 x float> %ret
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; float min/max
--- a/builtins/target-sse4-16.ll
+++ b/builtins/target-sse4-16.ll
@@ -209,16 +209,9 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r
 ;; svml
 ; FIXME
-
+include(`svml.m4')
-declare <8 x float> @__svml_sin(<8 x float>)
+svml_stubs(float,f,WIDTH)
-declare <8 x float> @__svml_cos(<8 x float>)
+svml_stubs(double,d,WIDTH)
 declare void @__svml_sincos(<8 x float>, <8 x float> *, <8 x float> *)
 declare <8 x float> @__svml_tan(<8 x float>)
 declare <8 x float> @__svml_atan(<8 x float>)
 declare <8 x float> @__svml_atan2(<8 x float>, <8 x float>)
 declare <8 x float> @__svml_exp(<8 x float>)
 declare <8 x float> @__svml_log(<8 x float>)
 declare <8 x float> @__svml_pow(<8 x float>, <8 x float>)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
--- a/builtins/target-sse4-8.ll
+++ b/builtins/target-sse4-8.ll
@@ -222,15 +222,9 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin
 ; FIXME
-declare <16 x float> @__svml_sin(<16 x float>)
+include(`svml.m4')
-declare <16 x float> @__svml_cos(<16 x float>)
+svml_stubs(float,f,WIDTH)
-declare void @__svml_sincos(<16 x float>, <16 x float> *, <16 x float> *)
+svml_stubs(double,d,WIDTH)
 declare <16 x float> @__svml_tan(<16 x float>)
 declare <16 x float> @__svml_atan(<16 x float>)
 declare <16 x float> @__svml_atan2(<16 x float>, <16 x float>)
 declare <16 x float> @__svml_exp(<16 x float>)
 declare <16 x float> @__svml_log(<16 x float>)
 declare <16 x float> @__svml_pow(<16 x float>, <16 x float>)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
--- a/builtins/target-sse4-x2.ll
+++ b/builtins/target-sse4-x2.ll
@@ -105,87 +105,14 @@ define <8 x float> @__sqrt_varying_float(<8 x float>) nounwind readonly alwaysin
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; svml stuff
-declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
+include(`svml.m4')
-declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
+;; single precision
-declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
+svml_declare(float,f4,4)
-declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
+svml_define_x(float,f4,4,f,8)
 declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
 declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
 declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
 declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
 declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
-
+;; double precision
-define <8 x float> @__svml_sin(<8 x float>) nounwind readnone alwaysinline {
+svml_declare(double,2,2)
-  unary4to8(ret, float, @__svml_sinf4, %0)
+svml_define_x(double,2,2,d,8)
  ret <8 x float> %ret
 }
 define <8 x float> @__svml_cos(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_cosf4, %0)
  ret <8 x float> %ret
 }
 ;; 8-wide sincos built from two calls to the 4-wide SVML routine.  Each call
 ;; returns a sine vector and writes a cosine vector to a stack temporary; the
 ;; halves are concatenated and stored through %1 (sine) and %2 (cosine).
 ;; Not marked readnone: the function writes through its pointer arguments,
 ;; and readnone would let the optimizer treat a call to it as dead.
 define void @__svml_sincos(<8 x float>, <8 x float> *,
                            <8 x float> *) nounwind alwaysinline {
  ; split the input into its low and high 4-wide halves
  %a = shufflevector <8 x float> %0, <8 x float> undef,
         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %b = shufflevector <8 x float> %0, <8 x float> undef,
         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ; one 4-wide call per half; the cosine comes back through the alloca
  %cospa = alloca <4 x float>
  %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
  %cospb = alloca <4 x float>
  %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
  ; reassemble and store the 8-wide sine
  %sin = shufflevector <4 x float> %sa, <4 x float> %sb,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                    i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %sin, <8 x float> * %1
  ; reassemble and store the 8-wide cosine
  %cosa = load <4 x float> * %cospa
  %cosb = load <4 x float> * %cospb
  %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                    i32 4, i32 5, i32 6, i32 7>
  store <8 x float> %cos, <8 x float> * %2
  ret void
 }
 define <8 x float> @__svml_tan(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_tanf4, %0)
  ret <8 x float> %ret
 }
 define <8 x float> @__svml_atan(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_atanf4, %0)
  ret <8 x float> %ret
 }
 define <8 x float> @__svml_atan2(<8 x float>,
                                          <8 x float>) nounwind readnone alwaysinline {
  binary4to8(ret, float, @__svml_atan2f4, %0, %1)
  ret <8 x float> %ret
 }
 define <8 x float> @__svml_exp(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_expf4, %0)
  ret <8 x float> %ret
 }
 define <8 x float> @__svml_log(<8 x float>) nounwind readnone alwaysinline {
  unary4to8(ret, float, @__svml_logf4, %0)
  ret <8 x float> %ret
 }
 define <8 x float> @__svml_pow(<8 x float>,
                                        <8 x float>) nounwind readnone alwaysinline {
  binary4to8(ret, float, @__svml_powf4, %0, %1)
  ret <8 x float> %ret
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
--- a/builtins/target-sse4.ll
+++ b/builtins/target-sse4.ll
@@ -209,62 +209,14 @@ define <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind r
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; svml stuff
-declare <4 x float> @__svml_sinf4(<4 x float>) nounwind readnone
+include(`svml.m4')
-declare <4 x float> @__svml_cosf4(<4 x float>) nounwind readnone
+;; single precision
-declare <4 x float> @__svml_sincosf4(<4 x float> *, <4 x float>) nounwind readnone
+svml_declare(float,f4,4)
-declare <4 x float> @__svml_tanf4(<4 x float>) nounwind readnone
+svml_define(float,f4,4,f)
 declare <4 x float> @__svml_atanf4(<4 x float>) nounwind readnone
 declare <4 x float> @__svml_atan2f4(<4 x float>, <4 x float>) nounwind readnone
 declare <4 x float> @__svml_expf4(<4 x float>) nounwind readnone
 declare <4 x float> @__svml_logf4(<4 x float>) nounwind readnone
 declare <4 x float> @__svml_powf4(<4 x float>, <4 x float>) nounwind readnone
-
+;; double precision
-define <4 x float> @__svml_sin(<4 x float>) nounwind readnone alwaysinline {
+svml_declare(double,2,2)
-  %ret = call <4 x float> @__svml_sinf4(<4 x float> %0)
+svml_define_x(double,2,2,d,4)
  ret <4 x float> %ret
 }
 define <4 x float> @__svml_cos(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_cosf4(<4 x float> %0)
  ret <4 x float> %ret
 }
 ;; 4-wide sincos: forwards to the SVML routine, handing it %2 as its pointer
 ;; argument (presumably the cosine destination), and stores the returned
 ;; vector through %1.
 ;; Not marked readnone: the function writes through its pointer arguments,
 ;; and readnone would let the optimizer treat a call to it as dead.
 define void @__svml_sincos(<4 x float>, <4 x float> *, <4 x float> *) nounwind alwaysinline {
  %s = call <4 x float> @__svml_sincosf4(<4 x float> * %2, <4 x float> %0)
  store <4 x float> %s, <4 x float> * %1
  ret void
 }
 define <4 x float> @__svml_tan(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_tanf4(<4 x float> %0)
  ret <4 x float> %ret
 }
 define <4 x float> @__svml_atan(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_atanf4(<4 x float> %0)
  ret <4 x float> %ret
 }
 define <4 x float> @__svml_atan2(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_atan2f4(<4 x float> %0, <4 x float> %1)
  ret <4 x float> %ret
 }
 define <4 x float> @__svml_exp(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_expf4(<4 x float> %0)
  ret <4 x float> %ret
 }
 define <4 x float> @__svml_log(<4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_logf4(<4 x float> %0)
  ret <4 x float> %ret
 }
 define <4 x float> @__svml_pow(<4 x float>, <4 x float>) nounwind readnone alwaysinline {
  %ret = call <4 x float> @__svml_powf4(<4 x float> %0, <4 x float> %1)
  ret <4 x float> %ret
 }
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
--- a/builtins/util.m4
+++ b/builtins/util.m4
@@ -947,6 +947,22 @@ define internal <$1 x i64> @convertmask_i32_i64_$1(<$1 x i32>) {
  %r = sext <$1 x i32> %0 to <$1 x i64>
  ret <$1 x i64> %r
 }
 ;; Narrows a mask with i64 lanes to i8 lanes by truncation.
 define internal <$1 x i8> @convertmask_i64_i8_$1(<$1 x i64>) {
  %narrow = trunc <$1 x i64> %0 to <$1 x i8>
  ret <$1 x i8> %narrow
 }
 ;; Narrows a mask with i64 lanes to i16 lanes by truncation.
 define internal <$1 x i16> @convertmask_i64_i16_$1(<$1 x i64>) {
  %narrow = trunc <$1 x i64> %0 to <$1 x i16>
  ret <$1 x i16> %narrow
 }
 ;; Narrows a mask with i64 lanes to i32 lanes by truncation.
 define internal <$1 x i32> @convertmask_i64_i32_$1(<$1 x i64>) {
  %narrow = trunc <$1 x i64> %0 to <$1 x i32>
  ret <$1 x i32> %narrow
 }
 ;; Identity conversion: a mask that already has i64 lanes is returned as is.
 define internal <$1 x i64> @convertmask_i64_i64_$1(<$1 x i64>) {
  ret <$1 x i64> %0
 }
 ')
 mask_converts(WIDTH)
@@ -2689,9 +2705,13 @@ define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline {
 }
 ;; Converts a varying mask to a vector of i32 lanes. The conversion depends on
 ;; the mask element type: an i32 mask passes through a same-type bitcast, an
 ;; i64 mask is truncated per lane, and any other mask type is sign-extended.
 define <WIDTH x i32> @__sext_varying_bool(<WIDTH x MASK>) nounwind readnone alwaysinline {
-  ifelse(MASK,i32, `ret <WIDTH x i32> %0',
+;;  ifelse(MASK,i32, `ret <WIDTH x i32> %0',
-  `%se = sext <WIDTH x MASK> %0 to <WIDTH x i32>
+;; `%se = sext <WIDTH x MASK> %0 to <WIDTH x i32>
-  ret <WIDTH x i32> %se')
+;; ret <WIDTH x i32> %se')
  ifelse(MASK,i32, `%se = bitcast <WIDTH x i32> %0 to <WIDTH x i32>',
         MASK,i64, `%se = trunc <WIDTH x MASK> %0 to <WIDTH x i32>',
                   `%se = sext <WIDTH x MASK> %0 to <WIDTH x i32>')
  ret <WIDTH x i32> %se
 }
@@ -3160,6 +3180,7 @@ define float @__stdlib_powf(float, float) nounwind readnone alwaysinline {
 }
 declare double @sin(double) nounwind readnone
 declare double @asin(double) nounwind readnone
 declare double @cos(double) nounwind readnone
 ;; sincos returns both results by writing through its two pointer arguments,
 ;; so it must not be declared readnone.
 declare void @sincos(double, double *, double *) nounwind
 declare double @tan(double) nounwind readnone
@@ -3174,6 +3195,11 @@ define double @__stdlib_sin(double) nounwind readnone alwaysinline {
  ret double %r
 }
 ;; Scalar double wrapper that calls the external @asin and returns its result.
 define double @__stdlib_asin(double) nounwind readnone alwaysinline {
  %r = call double @asin(double %0)
  ret double %r
 }
 define double @__stdlib_cos(double) nounwind readnone alwaysinline {
  %r = call double @cos(double %0)
  ret double %r
@@ -3502,6 +3528,56 @@ define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>,
 }
 ')
 ;; Emits __masked_store_blend_i8 and __masked_store_blend_i16 for 4-wide
 ;; vectors whose mask is a vector of four i64 lanes. Each routine loads the
 ;; old contents from the destination, blends in the new values under the
 ;; mask, and stores the blended vector back.
 ;; For LLVM 3.0 the blend is done with and/or/xor on the whole vector bitcast
 ;; to one scalar integer, after truncating the mask to the element width.
 ;; For other versions the mask is truncated to i1 lanes and a vector select
 ;; is used, so only the low bit of each mask lane decides the outcome.
 define(`masked_store_blend_8_16_by_4_mask64', `
 define void @__masked_store_blend_i8(<4 x i8>* nocapture, <4 x i8>,
                                     <4 x i64>) nounwind alwaysinline {
  %old = load <4 x i8> * %0, align 1
  ifelse(LLVM_VERSION,LLVM_3_0,`
    %old32 = bitcast <4 x i8> %old to i32
    %new32 = bitcast <4 x i8> %1 to i32
    %mask8 = trunc <4 x i64> %2 to <4 x i8>
    %mask32 = bitcast <4 x i8> %mask8 to i32
    %notmask32 = xor i32 %mask32, -1
    %newmasked = and i32 %new32, %mask32
    %oldmasked = and i32 %old32, %notmask32
    %result = or i32 %newmasked, %oldmasked
    %resultvec = bitcast i32 %result to <4 x i8>
  ',`
    %m = trunc <4 x i64> %2 to <4 x i1>
    %resultvec = select <4 x i1> %m, <4 x i8> %1, <4 x i8> %old
  ')
  store <4 x i8> %resultvec, <4 x i8> * %0, align 1
  ret void
 }
 define void @__masked_store_blend_i16(<4 x i16>* nocapture, <4 x i16>,
                                      <4 x i64>) nounwind alwaysinline {
  %old = load <4 x i16> * %0, align 2
  ifelse(LLVM_VERSION,LLVM_3_0,`
    %old64 = bitcast <4 x i16> %old to i64
    %new64 = bitcast <4 x i16> %1 to i64
    %mask16 = trunc <4 x i64> %2 to <4 x i16>
    %mask64 = bitcast <4 x i16> %mask16 to i64
    %notmask64 = xor i64 %mask64, -1
    %newmasked = and i64 %new64, %mask64
    %oldmasked = and i64 %old64, %notmask64
    %result = or i64 %newmasked, %oldmasked
    %resultvec = bitcast i64 %result to <4 x i16>
  ',`
    %m = trunc <4 x i64> %2 to <4 x i1>
    %resultvec = select <4 x i1> %m, <4 x i16> %1, <4 x i16> %old
  ')
  store <4 x i16> %resultvec, <4 x i16> * %0, align 2
  ret void
 }
 ')
 define(`masked_store_blend_8_16_by_8', `
 define void @__masked_store_blend_i8(<8 x i8>* nocapture, <8 x i8>,
                                     <8 x i32>) nounwind alwaysinline {
--- a/ispc.cpp
+++ b/ispc.cpp
@@ -446,6 +446,14 @@ Target::Target(const char *arch, const char *cpu, const char *isa, bool pic) :
        this->m_maskingIsFree = false;
        this->m_maskBitCount = 32;
    }
    else if (!strcasecmp(isa, "avx1-i64x4") ) {
        this->m_isa = Target::AVX;
        this->m_nativeVectorWidth = 8;  /* native vector width in terms of floats */
        this->m_vectorWidth = 4;
        this->m_attributes = "+avx,+popcnt,+cmov";
        this->m_maskingIsFree = false;
        this->m_maskBitCount = 64;
    }
    else if (!strcasecmp(isa, "avx-x2") ||
             !strcasecmp(isa, "avx1-x2") ||
             !strcasecmp(isa, "avx1-i32x16")) {
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -132,6 +132,10 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) {
        LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
            llvm::VectorType::get(llvm::Type::getInt32Ty(*ctx), target.getVectorWidth());
        break;
    case 64:
        LLVMTypes::MaskType = LLVMTypes::BoolVectorType =
            llvm::VectorType::get(llvm::Type::getInt64Ty(*ctx), target.getVectorWidth());
        break;
    default:
        FATAL("Unhandled mask width for initializing MaskType");
    }
@@ -183,6 +187,10 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) {
        onMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), -1,
                                    true /*signed*/); // 0xffffffff
        break;
    case 64:
        onMask = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*ctx), -1,
                                    true /*signed*/); // 0xffffffffffffffffull
        break;
    default:
        FATAL("Unhandled mask width for onMask");
    }
@@ -210,6 +218,10 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) {
        offMask = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*ctx), 0,
                                         true /*signed*/);
        break;
    case 64:
        offMask = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*ctx), 0,
                                         true /*signed*/);
        break;
    default:
        FATAL("Unhandled mask width for offMask");
    }
@@ -480,7 +492,10 @@ LLVMUInt64Vector(const uint64_t *ivec) {
 llvm::Constant *
 LLVMBoolVector(bool b) {
    llvm::Constant *v;
-    if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
+    if (LLVMTypes::BoolVectorType == LLVMTypes::Int64VectorType)
        v = llvm::ConstantInt::get(LLVMTypes::Int64Type, b ? 0xffffffffffffffffull : 0,
                                   false /*unsigned*/);
    else if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
        v = llvm::ConstantInt::get(LLVMTypes::Int32Type, b ? 0xffffffff : 0,
                                   false /*unsigned*/);
    else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType)
@@ -506,7 +521,10 @@ LLVMBoolVector(const bool *bvec) {
    std::vector<llvm::Constant *> vals;
    for (int i = 0; i < g->target->getVectorWidth(); ++i) {
        llvm::Constant *v;
-        if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
+        if (LLVMTypes::BoolVectorType == LLVMTypes::Int64VectorType)
            v = llvm::ConstantInt::get(LLVMTypes::Int64Type, bvec[i] ? 0xffffffffffffffffull : 0,
                                       false /*unsigned*/);
        else if (LLVMTypes::BoolVectorType == LLVMTypes::Int32VectorType)
            v = llvm::ConstantInt::get(LLVMTypes::Int32Type, bvec[i] ? 0xffffffff : 0,
                                       false /*unsigned*/);
        else if (LLVMTypes::BoolVectorType == LLVMTypes::Int16VectorType)
--- a/parse.yy
+++ b/parse.yy
@@ -2183,6 +2183,9 @@ static void lAddMaskToSymbolTable(SourcePos pos) {
    case 32:
        t = AtomicType::VaryingUInt32;
        break;
    case 64:
        t = AtomicType::VaryingUInt64;
        break;
    default:
        FATAL("Unhandled mask bitsize in lAddMaskToSymbolTable");
    }
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -50,6 +50,9 @@
 #elif (ISPC_MASK_BITS == 32)
  #define IntMaskType int32
  #define UIntMaskType unsigned int32
 #elif (ISPC_MASK_BITS == 64)
  #define IntMaskType int64
  #define UIntMaskType unsigned int64
 #else
  #error Unknown value of ISPC_MASK_BITS
 #endif
@@ -2180,7 +2183,7 @@ static inline uniform float frexp(uniform float x, uniform int * uniform pw2) {
 __declspec(safe)
 static inline float sin(float x_full) {
    if (__math_lib == __math_lib_svml) {
-        return __svml_sin(x_full);
+        return __svml_sinf(x_full);
    }
    else if (__math_lib == __math_lib_system) {
        float ret;
@@ -2313,8 +2316,10 @@ static inline float asin(float x) {
    bool isnan = (x > 1);
    float v;
-    if (__math_lib == __math_lib_svml ||
+    if (__math_lib == __math_lib_svml) {
-        __math_lib == __math_lib_system) {
+        return __svml_asinf(x);
    } 
    else if (__math_lib == __math_lib_system) {
        float ret;
        foreach_active (i) {
            uniform float r = __stdlib_asinf(extract(x, i));
@@ -2417,7 +2422,7 @@ static inline uniform float asin(uniform float x) {
 __declspec(safe)
 static inline float cos(float x_full) {
    if (__math_lib == __math_lib_svml) {
-        return __svml_cos(x_full);
+        return __svml_cosf(x_full);
    }
    else if (__math_lib == __math_lib_system) {
        float ret;
@@ -2545,18 +2550,28 @@ static inline float acos(float v) {
    return 1.57079637050628662109375 - asin(v);
 }
 // Arccosine of a varying double, computed as pi/2 - asin(v).
 // NOTE(review): the literal below is pi/2 rounded to single precision, which
 // is about 4e-8 away from the double-precision value; confirm whether this
 // double overload should use a double-precision constant.
 __declspec(safe)
 static inline double acos(const double v) {
    return 1.57079637050628662109375 - asin(v);
 }
 // Arccosine of a uniform float, computed as pi/2 - asin(v).
 __declspec(safe)
 static inline uniform float acos(uniform float v) {
    return 1.57079637050628662109375 - asin(v);
 }
 // Arccosine of a uniform double, computed as pi/2 - asin(v).
 // NOTE(review): the literal below is pi/2 rounded to single precision, which
 // is about 4e-8 away from the double-precision value; confirm whether this
 // double overload should use a double-precision constant.
 __declspec(safe)
 static inline uniform double acos(const uniform double v) {
    return 1.57079637050628662109375 - asin(v);
 }
 __declspec(safe)
 static inline void sincos(float x_full, varying float * uniform sin_result, 
                          varying float * uniform cos_result) {
    if (__math_lib == __math_lib_svml) {
-        __svml_sincos(x_full, sin_result, cos_result);
+        __svml_sincosf(x_full, sin_result, cos_result);
    }
    else if (__math_lib == __math_lib_system) {
        foreach_active (i) {
@@ -2688,7 +2703,7 @@ static inline void sincos(uniform float x_full, uniform float * uniform sin_resu
 __declspec(safe)
 static inline float tan(float x_full) {
    if (__math_lib == __math_lib_svml) {
-        return __svml_tan(x_full);
+        return __svml_tanf(x_full);
    }
    else if (__math_lib == __math_lib_system) {
        float ret;
@@ -2839,7 +2854,7 @@ static inline uniform float tan(uniform float x_full) {
 __declspec(safe)
 static inline float atan(float x_full) {
    if (__math_lib == __math_lib_svml) {
-        return __svml_atan(x_full);
+        return __svml_atanf(x_full);
    }
    else if (__math_lib == __math_lib_system) {
        float ret;
@@ -2934,7 +2949,7 @@ static inline uniform float atan(uniform float x_full) {
 __declspec(safe)
 static inline float atan2(float y, float x) {
    if (__math_lib == __math_lib_svml) {
-        return __svml_atan2(y, x);
+        return __svml_atan2f(y, x);
    }
    else if (__math_lib == __math_lib_system) {
        float ret;
@@ -2997,7 +3012,7 @@ static inline float exp(float x_full) {
        return __exp_varying_float(x_full);
    }
    else if (__math_lib == __math_lib_svml) {
-        return __svml_exp(x_full);
+        return __svml_expf(x_full);
    }
    else if (__math_lib == __math_lib_system) {
        float ret;
@@ -3204,7 +3219,7 @@ static inline float log(float x_full) {
        return __log_varying_float(x_full);
    }
    else if (__math_lib == __math_lib_svml) {
-        return __svml_log(x_full);
+        return __svml_logf(x_full);
    }
    else if (__math_lib == __math_lib_system) {
        float ret;
@@ -3379,7 +3394,7 @@ static inline float pow(float a, float b) {
        return __pow_varying_float(a, b);
    }
    else if (__math_lib == __math_lib_svml) {
-        return __svml_pow(a, b);
+        return __svml_powf(a, b);
    }
    else if (__math_lib == __math_lib_system) {
        float ret;
@@ -3469,7 +3484,11 @@ static inline uniform double frexp(uniform double x, uniform int * uniform pw2)
 __declspec(safe)
 static inline double sin(double x) {
-    if (__math_lib == __math_lib_ispc_fast)
+    if (__math_lib == __math_lib_svml) 
    {
      return __svml_sind(x);
    }
    else if (__math_lib == __math_lib_ispc_fast)
        return sin((float)x);
    else {
        double ret;
@@ -3490,8 +3509,30 @@ static inline uniform double sin(uniform double x) {
 }
 // Arcsine of a varying double. Dispatches on __math_lib: with the SVML math
 // library it calls __svml_asind; with the fast ispc math library it casts the
 // argument to float and uses the float overload; otherwise it calls
 // __stdlib_asin once per active program instance and collects the results
 // with insert.
 __declspec(safe)
-static inline double cos(double x) {
+static inline double asin(const double x) {
-    if (__math_lib == __math_lib_ispc_fast)
+    if (__math_lib == __math_lib_svml) 
    {
      return __svml_asind(x);
    }
    else if (__math_lib == __math_lib_ispc_fast)
        return asin((float)x);
    else {
        double ret;
        foreach_active (i) {
            uniform double r = __stdlib_asin(extract(x, i));
            ret = insert(ret, i, r);
        }
        return ret;
    }
 }
 __declspec(safe)
 static inline double cos(const double x) {
    if (__math_lib == __math_lib_svml) 
    {
      return __svml_cosd(x);
    }
    else if (__math_lib == __math_lib_ispc_fast)
        return cos((float)x);
    else {
        double ret;
@@ -3514,7 +3555,11 @@ static inline uniform double cos(uniform double x) {
 __declspec(safe)
 static inline void sincos(double x, varying double * uniform sin_result,
                          varying double * uniform cos_result) {
-    if (__math_lib == __math_lib_ispc_fast) {
+    if (__math_lib == __math_lib_svml) 
    {
      __svml_sincosd(x, sin_result, cos_result);
    }
    else if (__math_lib == __math_lib_ispc_fast) {
        float sr, cr;
        sincos((float)x, &sr, &cr);
        *sin_result = sr;
@@ -3545,7 +3590,11 @@ static inline void sincos(uniform double x, uniform double * uniform sin_result,
 __declspec(safe)
 static inline double tan(double x) {
-    if (__math_lib == __math_lib_ispc_fast)
+    if (__math_lib == __math_lib_svml) 
    {
      return __svml_tand(x);
    }
    else if (__math_lib == __math_lib_ispc_fast)
        return tan((float)x);
    else {
        double ret;
@@ -3589,7 +3638,11 @@ static inline uniform double atan(uniform double x) {
 __declspec(safe)
 static inline double atan2(double y, double x) {
-    if (__math_lib == __math_lib_ispc_fast)
+    if (__math_lib == __math_lib_svml) 
    {
      return __svml_atan2d(y,x);
    }
    else if (__math_lib == __math_lib_ispc_fast)
        return atan2((float)y, (float)x);
    else {
        double ret;
@@ -3611,7 +3664,11 @@ static inline uniform double atan2(uniform double y, uniform double x) {
 __declspec(safe)
 static inline double exp(double x) {
-    if (__math_lib == __math_lib_ispc_fast)
+    if (__math_lib == __math_lib_svml) 
    {
        return __svml_expd(x);
    }
    else if (__math_lib == __math_lib_ispc_fast)
        return exp((float)x);
    else {
        double ret;
@@ -3633,7 +3690,11 @@ static inline uniform double exp(uniform double x) {
 __declspec(safe)
 static inline double log(double x) {
-    if (__math_lib == __math_lib_ispc_fast)
+    if (__math_lib == __math_lib_svml) 
    {
        return __svml_logd(x);
    }
    else if (__math_lib == __math_lib_ispc_fast)
        return log((float)x);
    else {
        double ret;
@@ -3655,7 +3716,11 @@ static inline uniform double log(uniform double x) {
 __declspec(safe)
 static inline double pow(double a, double b) {
-    if (__math_lib == __math_lib_ispc_fast)
+    if (__math_lib == __math_lib_svml) 
    {
        return __svml_powd(a,b);
    }
    else if (__math_lib == __math_lib_ispc_fast)
        return pow((float)a, (float)b);
    else {
        double ret;