diff --git a/.gitignore b/.gitignore
index 3bec2ace..88fb0197 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,8 +12,5 @@ examples/*/*.png
 examples/*/*.ppm
 examples/*/objs/*
 *.swp
-.*
-!.gitignore
-
 
 
diff --git a/builtins/svml.m4 b/builtins/svml.m4
index 71a6a709..0a587577 100644
--- a/builtins/svml.m4
+++ b/builtins/svml.m4
@@ -1,20 +1,61 @@
-;; svml
+;; copyright stub  :)
+;;  Copyright (c) 2013, Intel Corporation
+;;  All rights reserved.
+;;
+;;  Redistribution and use in source and binary forms, with or without
+;;  modification, are permitted provided that the following conditions are
+;;  met:
+;;
+;;    * Redistributions of source code must retain the above copyright
+;;      notice, this list of conditions and the following disclaimer.
+;;
+;;    * Redistributions in binary form must reproduce the above copyright
+;;      notice, this list of conditions and the following disclaimer in the
+;;      documentation and/or other materials provided with the distribution.
+;;
+;;    * Neither the name of Intel Corporation nor the names of its
+;;      contributors may be used to endorse or promote products derived from
+;;      this software without specific prior written permission.
+;;
+;;
+;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+;;   IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+;;   TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+;;   PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+;;   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;;   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;;   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  
 
-;; stubs
+
+;; svml macro
+
+;; svml_stubs : stubs for svml calls
+;; $1 - type ("float" or "double")
+;; $2 - svml internal function suffix ("f" for float, "d" for double)
+;; $3 - vector width
 define(`svml_stubs',`
-  declare <$2 x $1> @__svml_sin$3(<$2 x $1>) nounwind readnone alwaysinline
-  declare <$2 x $1> @__svml_asin$3(<$2 x $1>) nounwind readnone alwaysinline 
-  declare <$2 x $1> @__svml_cos$3(<$2 x $1>) nounwind readnone alwaysinline 
-  declare void @__svml_sincos$3(<$2 x $1>, <$2 x $1> *, <$2 x $1> *) nounwind readnone alwaysinline 
-  declare <$2 x $1> @__svml_tan$3(<$2 x $1>) nounwind readnone alwaysinline 
-  declare <$2 x $1> @__svml_atan$3(<$2 x $1>) nounwind readnone alwaysinline 
-  declare <$2 x $1> @__svml_atan2$3(<$2 x $1>, <$2 x $1>) nounwind readnone alwaysinline 
-  declare <$2 x $1> @__svml_exp$3(<$2 x $1>) nounwind readnone alwaysinline 
-  declare <$2 x $1> @__svml_log$3(<$2 x $1>) nounwind readnone alwaysinline 
-  declare <$2 x $1> @__svml_pow$3(<$2 x $1>, <$2 x $1>) nounwind readnone alwaysinline 
+  declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone alwaysinline
+  declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone alwaysinline 
+  declare <$3 x $1> @__svml_cos$2(<$3 x $1>) nounwind readnone alwaysinline 
+  declare void @__svml_sincos$2(<$3 x $1>, <$3 x $1> *, <$3 x $1> *) nounwind alwaysinline ; not readnone: results are written through the two pointers
+  declare <$3 x $1> @__svml_tan$2(<$3 x $1>) nounwind readnone alwaysinline 
+  declare <$3 x $1> @__svml_atan$2(<$3 x $1>) nounwind readnone alwaysinline 
+  declare <$3 x $1> @__svml_atan2$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline 
+  declare <$3 x $1> @__svml_exp$2(<$3 x $1>) nounwind readnone alwaysinline 
+  declare <$3 x $1> @__svml_log$2(<$3 x $1>) nounwind readnone alwaysinline 
+  declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone alwaysinline 
 ')
 
-;; decalre __svml calls
+;; svml_declare : declaration of __svml_* intrinsics 
+;; $1 - type ("float" or "double")
+;; $2 - __svml_* intrinsic function suffix 
+;;      float:  "f4"(sse) "f8"(avx) "f16"(avx512)
+;;      double:  "2"(sse)  "4"(avx)   "8"(avx512)
+;; $3 - vector width
 define(`svml_declare',`
   declare <$3 x $1> @__svml_sin$2(<$3 x $1>) nounwind readnone
   declare <$3 x $1> @__svml_asin$2(<$3 x $1>) nounwind readnone
@@ -28,7 +69,13 @@ define(`svml_declare',`
   declare <$3 x $1> @__svml_pow$2(<$3 x $1>, <$3 x $1>) nounwind readnone
 ');
 
-;; define native __svml calls
+;; definition of __svml_* internal functions
+;; $1 - type ("float" or "double")
+;; $2 - __svml_* intrinsic function suffix 
+;;      float:  "f4"(sse) "f8"(avx) "f16"(avx512)
+;;      double:  "2"(sse)  "4"(avx)   "8"(avx512)
+;; $3 - vector width
+;; $4 - svml internal function suffix ("f" for float, "d" for double)
 define(`svml_define',`
   define <$3 x $1> @__svml_sin$4(<$3 x $1>) nounwind readnone alwaysinline {
     %ret = call <$3 x $1> @__svml_sin$2(<$3 x $1> %0)
@@ -82,7 +129,45 @@ define(`svml_define',`
 ')
 
 
-;; define x2 __svml calls
+;; svml_define_x : definition of __svml_* internal functions operating on extended width
+;; $1 - type ("float" or "double")
+;; $2 - __svml_* intrinsic function suffix 
+;;      float:  "f4"(sse) "f8"(avx) "f16"(avx512)
+;;      double:  "2"(sse)  "4"(avx)   "8"(avx512)
+;; $3 - vector width
+;; $4 - svml internal function suffix ("f" for float, "d" for double)
+;; $5 - extended width, must be at least twice the native vector width
+;;      contingent on the existence of the unary$3to$5 and binary$3to$5 macros
+
+;; *todo*: in sincos call use __svml_sincos[f][2,4,8,16] call, e.g.
+;;define void @__svml_sincosf(<8 x float>, <8 x float> *,
+;;                                    <8 x float> *) nounwind readnone alwaysinline {
+;;  ; call svml_sincosf4 two times with the two 4-wide sub-vectors
+;;  %a = shufflevector <8 x float> %0, <8 x float> undef,
+;;         <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+;;  %b = shufflevector <8 x float> %0, <8 x float> undef,
+;;         <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+;;
+;;  %cospa = alloca <4 x float>
+;;  %sa = call <4 x float> @__svml_sincosf4(<4 x float> * %cospa, <4 x float> %a)
+;;
+;;  %cospb = alloca <4 x float>
+;;  %sb = call <4 x float> @__svml_sincosf4(<4 x float> * %cospb, <4 x float> %b)
+;;
+;;  %sin = shufflevector <4 x float> %sa, <4 x float> %sb,
+;;         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+;;                    i32 4, i32 5, i32 6, i32 7>
+;;  store <8 x float> %sin, <8 x float> * %1
+;;
+;;  %cosa = load <4 x float> * %cospa
+;;  %cosb = load <4 x float> * %cospb
+;;  %cos = shufflevector <4 x float> %cosa, <4 x float> %cosb,
+;;         <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+;;                    i32 4, i32 5, i32 6, i32 7>
+;;  store <8 x float> %cos, <8 x float> * %2
+;;
+;;  ret void
+;;}
 define(`svml_define_x',`
   define <$5 x $1> @__svml_sin$4(<$5 x $1>) nounwind readnone alwaysinline {
     unary$3to$5(ret, $1, @__svml_sin$2, %0)
@@ -96,7 +181,14 @@ define(`svml_define_x',`
     unary$3to$5(ret, $1, @__svml_cos$2, %0)
     ret <$5 x $1> %ret
   }
-  declare void @__svml_sincos$4(<$5 x $1>,<$5 x $1>*,<$5 x $1>*) nounwind readnone alwaysinline 
+  ;; writes sin through %1 and cos through %2 so it must not be marked readnone
+  define void @__svml_sincos$4(<$5 x $1>,<$5 x $1>*,<$5 x $1>*) nounwind alwaysinline {
+    %s = call <$5 x $1> @__svml_sin$4(<$5 x $1> %0)
+    %c = call <$5 x $1> @__svml_cos$4(<$5 x $1> %0)
+    store <$5 x $1> %s, <$5 x $1> * %1
+    store <$5 x $1> %c, <$5 x $1> * %2
+    ret void
+  }
   define <$5 x $1> @__svml_tan$4(<$5 x $1>) nounwind readnone alwaysinline {
     unary$3to$5(ret, $1, @__svml_tan$2, %0)
     ret <$5 x $1> %ret
diff --git a/builtins/target-avx-i64x4.ll b/builtins/target-avx-i64x4.ll
index d7dbb6bd..65490ea5 100644
--- a/builtins/target-avx-i64x4.ll
+++ b/builtins/target-avx-i64x4.ll
@@ -1,4 +1,4 @@
-;;  Copyright (c) 2010-2011, Intel Corporation
+;;  Copyright (c) 2013, Intel Corporation
 ;;  All rights reserved.
 ;;
 ;;  Redistribution and use in source and binary forms, with or without
diff --git a/builtins/target-avx-i64x4base.ll b/builtins/target-avx-i64x4base.ll
index 05bf178d..e1832030 100644
--- a/builtins/target-avx-i64x4base.ll
+++ b/builtins/target-avx-i64x4base.ll
@@ -1,4 +1,4 @@
-;;  Copyright (c) 2010-2012, Intel Corporation
+;;  Copyright (c) 2013, Intel Corporation
 ;;  All rights reserved.
 ;;
 ;;  Redistribution and use in source and binary forms, with or without
diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll
index 30a8b030..2a5d1b32 100644
--- a/builtins/target-generic-common.ll
+++ b/builtins/target-generic-common.ll
@@ -209,8 +209,8 @@ declare i64 @__count_leading_zeros_i64(i64) nounwind readnone
 ;; svml
 
 include(`svml.m4')
-svml_stubs(float,  WIDTH, f)
-svml_stubs(double, WIDTH, d)
+svml_stubs(float,f,WIDTH)
+svml_stubs(double,d,WIDTH)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; reductions
diff --git a/builtins/target-neon-common.ll b/builtins/target-neon-common.ll
index 92fc5ce3..1c0b421f 100644
--- a/builtins/target-neon-common.ll
+++ b/builtins/target-neon-common.ll
@@ -318,8 +318,8 @@ define void @__masked_store_blend_i64(<WIDTH x i64>* nocapture %ptr,
 
 
 include(`svml.m4')
-svmlf_stubs(WIDTH)
-svmld_stubs(WIDTH)
+svml_stubs(float,f,WIDTH)
+svml_stubs(double,d,WIDTH)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; gather
diff --git a/builtins/target-sse4-16.ll b/builtins/target-sse4-16.ll
index 3f8cd339..72b81ff0 100644
--- a/builtins/target-sse4-16.ll
+++ b/builtins/target-sse4-16.ll
@@ -210,8 +210,8 @@ define <8 x double> @__max_varying_double(<8 x double>, <8 x double>) nounwind r
 
 ; FIXME
 include(`svml.m4')
-svml_stubs(float,8,f)
-svml_stubs(double,8,d)
+svml_stubs(float,f,WIDTH)
+svml_stubs(double,d,WIDTH)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
diff --git a/builtins/target-sse4-8.ll b/builtins/target-sse4-8.ll
index f43cd940..69b355e3 100644
--- a/builtins/target-sse4-8.ll
+++ b/builtins/target-sse4-8.ll
@@ -223,8 +223,8 @@ define <16 x double> @__max_varying_double(<16 x double>, <16 x double>) nounwin
 ; FIXME
 
 include(`svml.m4')
-svml_stubs(float,16,f)
-svml_stubs(double,16,d)
+svml_stubs(float,f,WIDTH)
+svml_stubs(double,d,WIDTH)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
diff --git a/llvmutil.cpp b/llvmutil.cpp
index 64691498..275cf794 100644
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -189,7 +189,7 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target& target) {
         break;
     case 64:
         onMask = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*ctx), -1,
-                                    true /*signed*/); // 0xffffffff
+                                    true /*signed*/); // 0xffffffffffffffffull
         break;
     default:
         FATAL("Unhandled mask width for onMask");
diff --git a/run_tests.py b/run_tests.py
index 9729930f..180205a0 100755
--- a/run_tests.py
+++ b/run_tests.py
@@ -75,7 +75,7 @@ if not os.path.exists(ispc_exe):
     sys.stderr.write("Fatal error: missing ispc compiler: %s\n" % ispc_exe)
     sys.exit()
 
-ispc_exe += " " + options.ispc_flags
+ispc_exe += " -g " + options.ispc_flags
 
 if __name__ == '__main__':
     sys.stdout.write("ispc compiler: %s\n" % ispc_exe)