diff --git a/builtins.cpp b/builtins.cpp
index ef8eea0d..6583f7dd 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -104,6 +104,8 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
                                  AtomicType::UniformInt64, false);
     else if (t == LLVMTypes::FloatPointerType)
         return new ReferenceType(AtomicType::UniformFloat, false);
+    else if (t == LLVMTypes::DoublePointerType)
+        return new ReferenceType(AtomicType::UniformDouble, false);
     else if (t == LLVMTypes::Int32VectorPointerType)
         return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt32 :
                                  AtomicType::VaryingInt32, false);
@@ -112,6 +114,8 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
                                  AtomicType::VaryingInt64, false);
     else if (t == LLVMTypes::FloatVectorPointerType)
         return new ReferenceType(AtomicType::VaryingFloat, false);
+    else if (t == LLVMTypes::DoubleVectorPointerType)
+        return new ReferenceType(AtomicType::VaryingDouble, false);
     else if (llvm::isa<llvm::PointerType>(t)) {
         const llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(t);
diff --git a/llvmutil.cpp b/llvmutil.cpp
index 1f64e06b..755c375f 100644
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -50,6 +50,7 @@ const llvm::Type *LLVMTypes::Int64PointerType = NULL;
 const llvm::Type *LLVMTypes::FloatType = NULL;
 const llvm::Type *LLVMTypes::FloatPointerType = NULL;
 const llvm::Type *LLVMTypes::DoubleType = NULL;
+const llvm::Type *LLVMTypes::DoublePointerType = NULL;
 
 const llvm::VectorType *LLVMTypes::MaskType = NULL;
 const llvm::VectorType *LLVMTypes::BoolVectorType = NULL;
@@ -61,6 +62,7 @@ const llvm::Type *LLVMTypes::Int64VectorPointerType = NULL;
 const llvm::VectorType *LLVMTypes::FloatVectorType = NULL;
 const llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
 const llvm::VectorType *LLVMTypes::DoubleVectorType = NULL;
+const llvm::Type *LLVMTypes::DoubleVectorPointerType = NULL;
 const llvm::ArrayType *LLVMTypes::VoidPointerVectorType = NULL;
 
 llvm::Constant *LLVMTrue = NULL;
@@ -83,6 +85,7 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
     LLVMTypes::FloatType = llvm::Type::getFloatTy(*ctx);
     LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0);
     LLVMTypes::DoubleType = llvm::Type::getDoubleTy(*ctx);
+    LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0);
 
     // Note that both the mask and bool vectors are vector of int32s
     // (not i1s). LLVM ends up generating much better SSE code with
@@ -103,6 +106,7 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
     LLVMTypes::FloatVectorPointerType = llvm::PointerType::get(LLVMTypes::FloatVectorType, 0);
     LLVMTypes::DoubleVectorType = llvm::VectorType::get(LLVMTypes::DoubleType, target.vectorWidth);
+    LLVMTypes::DoubleVectorPointerType = llvm::PointerType::get(LLVMTypes::DoubleVectorType, 0);
 
     LLVMTypes::VoidPointerVectorType = llvm::ArrayType::get(LLVMTypes::VoidPointerType, target.vectorWidth);
diff --git a/llvmutil.h b/llvmutil.h
index 3a5a4e4c..1a26ae7e 100644
--- a/llvmutil.h
+++ b/llvmutil.h
@@ -61,6 +61,7 @@ struct LLVMTypes {
     static const llvm::Type *FloatType;
     static const llvm::Type *FloatPointerType;
     static const llvm::Type *DoubleType;
+    static const llvm::Type *DoublePointerType;
 
     static const llvm::VectorType *MaskType;
     static const llvm::VectorType *BoolVectorType;
@@ -72,6 +73,7 @@ struct LLVMTypes {
     static const llvm::VectorType *FloatVectorType;
     static const llvm::Type *FloatVectorPointerType;
    static const llvm::VectorType *DoubleVectorType;
+    static const llvm::Type *DoubleVectorPointerType;
     static const llvm::ArrayType *VoidPointerVectorType;
 };
diff --git a/stdlib-avx.ll b/stdlib-avx.ll
index 9bf223da..947248dc 100644
--- a/stdlib-avx.ll
+++ b/stdlib-avx.ll
@@ -42,6 +42,7 @@
 stdlib_core(8)
 packed_load_and_store(8)
 int8_16(8)
+int64minmax(8)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
@@ -77,7 +78,7 @@ define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline
 }
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; rounding
+;; rounding floats
 
 declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
 declare <4 x float> @llvm.x86.sse.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
@@ -141,6 +142,56 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin
     ret float %rs
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
+    round4to8double(%0, 8)
+}
+
+define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
+    %xi = insertelement <2 x double> undef, double %0, i32 0
+    %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
+    %rs = extractelement <2 x double> %xr, i32 0
+    ret double %rs
+}
+
+define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
+    ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+    round4to8double(%0, 9)
+}
+
+define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+    ; see above for round_ss intrinsic discussion...
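+    ; (reminder: the low two bits of the round immediate pick the mode --
+    ; 0b00 nearest, 0b01 down, 0b10 up, 0b11 truncate -- and bit 0b1000
+    ; suppresses the precision exception, giving the 8, 9, and 10 used here)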
+    %xi = insertelement <2 x double> undef, double %0, i32 0
+    ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+    %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
+    %rs = extractelement <2 x double> %xr, i32 0
+    ret double %rs
+}
+
+define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
+    ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+    round4to8double(%0, 10)
+}
+
+define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+    ; see above for round_ss intrinsic discussion...
+    %xi = insertelement <2 x double> undef, double %0, i32 0
+    ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+    %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
+    %rs = extractelement <2 x double> %xr, i32 0
+    ret double %rs
+}
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
 declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
 declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
@@ -318,11 +369,18 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli
 
 declare i32 @llvm.ctpop.i32(i32) nounwind readnone
 
-define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
+define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
     %call = call i32 @llvm.ctpop.i32(i32 %0)
     ret i32 %call
 }
 
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+
+define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
+    %call = call i64 @llvm.ctpop.i64(i64 %0)
+    ret i64 %call
+}
+
 declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
 
 define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
@@ -403,6 +461,81 @@ define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinli
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal double ops
+
+declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline {
+    %v0 = shufflevector <8 x double> %0, <8 x double> undef,
+                        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+    %v1 = shufflevector <8 x double> %0, <8 x double> undef,
+                        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+    %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
+    %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
+    %scalar1 = extractelement <4 x double> %sum1, i32 0
+    %scalar2 = extractelement <4 x double> %sum1, i32 2
+    %sum = fadd double %scalar1, %scalar2
+    ret double %sum
+}
+
+define internal double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline {
+    reduce8(double, @__min_varying_double, @__min_uniform_double)
+}
+
+
+define internal double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline {
+    reduce8(double, @__max_varying_double, @__max_uniform_double)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int64 ops
+
+define internal <8 x i64> @__add_varying_int64(<8 x i64>,
+                                               <8 x i64>) nounwind readnone alwaysinline {
+    %s = add <8 x i64> %0, %1
+    ret <8 x i64> %s
+}
+
+define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+    %s = add i64 %0, %1
+    ret i64 %s
+}
+
+define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline {
+    reduce8(i64, @__add_varying_int64, @__add_uniform_int64)
+}
+
+
+define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone alwaysinline {
+    reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+
+define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
+    reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; horizontal uint64 ops
+
+define internal i64 @__reduce_add_uint64(<8 x i64> %v) nounwind readnone alwaysinline {
+    %r = call i64 @__reduce_add_int64(<8 x i64> %v)
+    ret i64 %r
+}
+
+define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
+    reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+
+define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
+    reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
 
diff --git a/stdlib-sse.ll b/stdlib-sse.ll
index cb9ee295..77d52873 100644
--- a/stdlib-sse.ll
+++ b/stdlib-sse.ll
@@ -37,6 +37,7 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 int8_16(4)
+int64minmax(4)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
@@ -227,6 +228,54 @@ define internal float @__min_uniform_float(float, float) nounwind readonly alway
     ret float %ret
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
+    unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
+    ret <4 x double> %ret
+}
+
+
+define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
+    sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
+    ret double %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
+    binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
+    ret <4 x double> %ret
+}
+
+
+define internal double @__min_uniform_double(double, double) nounwind readnone {
+    sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
+    ret double %ret
+}
+
+
+define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
+    binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
+    ret <4 x double> %ret
+}
+
+
+define internal double @__max_uniform_double(double, double) nounwind readnone {
+    sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
+    ret double %ret
+}
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
 
@@ -279,6 +328,55 @@ define internal i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
 }
 
+define internal double @__reduce_add_double(<4 x double>) nounwind readnone {
+    %v0 = shufflevector <4 x double> %0, <4 x double> undef,
+                        <2 x i32> <i32 0, i32 1>
+    %v1 = shufflevector <4 x double> %0, <4 x double> undef,
+                        <2 x i32> <i32 2, i32 3>
+    %sum = fadd <2 x double> %v0, %v1
+    %e0 = extractelement <2 x double> %sum, i32 0
+    %e1 = extractelement <2 x double> %sum, i32 1
+    %m = fadd double %e0, %e1
+    ret double %m
+}
+
+define internal double @__reduce_min_double(<4 x double>) nounwind readnone {
+    reduce4(double, @__min_varying_double, @__min_uniform_double)
+}
+
+define internal double @__reduce_max_double(<4 x double>) nounwind readnone {
+    reduce4(double, @__max_varying_double, @__max_uniform_double)
+}
+
+define internal i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
+    %v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
+                        <2 x i32> <i32 0, i32 1>
+    %v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
+                        <2 x i32> <i32 2, i32 3>
+    %sum = add <2 x i64> %v0, %v1
+    %e0 = extractelement <2 x i64> %sum, i32 0
+    %e1 = extractelement <2 x i64> %sum, i32 1
+    %m = add i64 %e0, %e1
+    ret i64 %m
+}
+
+define internal i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
+    reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+define internal i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
+    reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+define internal i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
+    reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
+    reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store
 
@@ -389,53 +487,3 @@ gen_gather(4, i32)
 gen_gather(4, i64)
 gen_scatter(4, i32)
 gen_scatter(4, i64)
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision sqrt
-
-declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
-
-define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
-    unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
-    ret <4 x double> %ret
-}
-
-
-define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
-    sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
-    ret double %ret
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision min/max
-
-declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
-
-define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
-    binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
-    ret <4 x double> %ret
-}
-
-
-define internal double @__min_uniform_double(double, double) nounwind readnone {
-    sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
-    ret double %ret
-}
-
-
-define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
-    binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
-    ret <4 x double> %ret
-}
-
-
-define internal double @__max_uniform_double(double, double) nounwind readnone {
-    sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
-    ret double %ret
-}
diff --git a/stdlib-sse2.ll b/stdlib-sse2.ll
index d1573e21..99711181 100644
--- a/stdlib-sse2.ll
+++ b/stdlib-sse2.ll
@@ -152,6 +152,40 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin
     ret float %binop.i
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare double @round(double)
+declare double @floor(double)
+declare double @ceil(double)
+
+define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
+    unary1to4(double, @round)
+}
+
+define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
+    %r = call double @round(double %0)
+    ret double %r
+}
+
+define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
+    unary1to4(double, @floor)
+}
+
+define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+    %r = call double @floor(double %0)
+    ret double %r
+}
+
+define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
+    unary1to4(double, @ceil)
+}
+
+define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+    %r = call double @ceil(double %0)
+    ret double %r
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; min/max
 
@@ -252,7 +286,7 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli
 ; it does generate non-POPCNT code and in particular better code than
 ; the below does.)
 
-define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
+define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
 entry:
     br label %loop
@@ -269,6 +303,16 @@ exit:
     ret i32 %newcount
 }
 
+define internal i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
+    %vec = bitcast i64 %0 to <2 x i32>
+    %v0 = extractelement <2 x i32> %vec, i32 0
+    %v1 = extractelement <2 x i32> %vec, i32 1
+    %c0 = call i32 @__popcnt_int32(i32 %v0)
+    %c1 = call i32 @__popcnt_int32(i32 %v1)
+    %sum = add i32 %c0, %c1
+    %sum64 = zext i32 %sum to i64
+    ret i64 %sum64
+}
+
 define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
     %v1 = shufflevector <4 x float> %v, <4 x float> undef,
diff --git a/stdlib-sse4.ll b/stdlib-sse4.ll
index 30b6f43b..e0fcec4a 100644
--- a/stdlib-sse4.ll
+++ b/stdlib-sse4.ll
@@ -40,7 +40,7 @@ packed_load_and_store(4)
 include(`stdlib-sse.ll')
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; math
+;; rounding floats
 
 declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
 declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
@@ -106,7 +106,52 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin
 }
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; integer min/max
+;; rounding doubles
+
+declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
+    round2to4double(%0, 8)
+}
+
+define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
+    %xi = insertelement <2 x double> undef, double %0, i32 0
+    %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
+    %rs = extractelement <2 x double> %xr, i32 0
+    ret double %rs
+}
+
+define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
+    ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+    round2to4double(%0, 9)
+}
+
+define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+    ; see above for round_ss intrinsic discussion...
+    %xi = insertelement <2 x double> undef, double %0, i32 0
+    ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+    %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
+    %rs = extractelement <2 x double> %xr, i32 0
+    ret double %rs
+}
+
+define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
+    ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+    round2to4double(%0, 10)
+}
+
+define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+    ; see above for round_ss intrinsic discussion...
+    %xi = insertelement <2 x double> undef, double %0, i32 0
+    ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+    %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
+    %rs = extractelement <2 x double> %xr, i32 0
+    ret double %rs
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int32 min/max
 
 declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
 declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
@@ -163,11 +208,18 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli
 
 declare i32 @llvm.ctpop.i32(i32) nounwind readnone
 
-define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
+define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
     %call = call i32 @llvm.ctpop.i32(i32 %0)
     ret i32 %call
 }
 
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+
+define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
+    %call = call i64 @llvm.ctpop.i64(i64 %0)
+    ret i64 %call
+}
+
 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
 
 define internal float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
diff --git a/stdlib-sse4x2.ll b/stdlib-sse4x2.ll
index 009c1c5b..83baaecd 100644
--- a/stdlib-sse4x2.ll
+++ b/stdlib-sse4x2.ll
@@ -39,6 +39,7 @@
 stdlib_core(8)
 packed_load_and_store(8)
 int8_16(8)
+int64minmax(8)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
@@ -258,7 +259,7 @@ define internal float @__min_uniform_float(float, float) nounwind readonly alway
 }
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; int min/max
+;; int32 min/max
 
 declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
 declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
@@ -380,6 +381,60 @@ define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinli
     reduce8by4(i32, @llvm.x86.sse41.pmaxud, @__max_uniform_uint32)
 }
 
+define internal <4 x double> @__add_varying_double(<4 x double>,
+                                                   <4 x double>) nounwind readnone alwaysinline {
+    %r = fadd <4 x double> %0, %1
+    ret <4 x double> %r
+}
+
+define internal double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
+    %r = fadd double %0, %1
+    ret double %r
+}
+
+define internal double @__reduce_add_double(<8 x double>) nounwind readnone {
+    reduce8by4(double, @__add_varying_double, @__add_uniform_double)
+}
+
+define internal double @__reduce_min_double(<8 x double>) nounwind readnone {
+    reduce8(double, @__min_varying_double, @__min_uniform_double)
+}
+
+define internal double @__reduce_max_double(<8 x double>) nounwind readnone {
+    reduce8(double, @__max_varying_double, @__max_uniform_double)
+}
+
+define internal <4 x i64> @__add_varying_int64(<4 x i64>,
+                                               <4 x i64>) nounwind readnone alwaysinline {
+    %r = add <4 x i64> %0, %1
+    ret <4 x i64> %r
+}
+
+define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+    %r = add i64 %0, %1
+    ret i64 %r
+}
+
+define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
+    reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64)
+}
+
+define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
+    reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
+    reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
+    reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
+    reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store
 
@@ -465,7 +520,7 @@ gen_scatter(8, i32)
 gen_scatter(8, i64)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; math
+;; float rounding
 
 declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
 declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
@@ -526,16 +581,68 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin
     ret float %rs
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
+    round2to8double(%0, 8)
+}
+
+define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
+    %xi = insertelement <2 x double> undef, double %0, i32 0
+    %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
+    %rs = extractelement <2 x double> %xr, i32 0
+    ret double %rs
+}
+
+define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
+    ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+    round2to8double(%0, 9)
+}
+
+define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+    ; see above for round_ss intrinsic discussion...
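+    ; (roundsd writes only the low element: the rounded low lane of the
+    ; second operand lands in element 0 and element 1 passes through from
+    ; the first operand, which is why %xi is passed for both operands)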
+    %xi = insertelement <2 x double> undef, double %0, i32 0
+    ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+    %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
+    %rs = extractelement <2 x double> %xr, i32 0
+    ret double %rs
+}
+
+define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
+    ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+    round2to8double(%0, 10)
+}
+
+define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+    ; see above for round_ss intrinsic discussion...
+    %xi = insertelement <2 x double> undef, double %0, i32 0
+    ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+    %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
+    %rs = extractelement <2 x double> %xr, i32 0
+    ret double %rs
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
 
 declare i32 @llvm.ctpop.i32(i32) nounwind readnone
 
-define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
+define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
     %call = call i32 @llvm.ctpop.i32(i32 %0)
     ret i32 %call
 }
 
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+
+define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
+    %call = call i64 @llvm.ctpop.i64(i64 %0)
+    ret i64 %call
+}
+
 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
 
 define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
diff --git a/stdlib.ispc b/stdlib.ispc
index 432d7528..10fd828b 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -147,30 +147,57 @@ static inline int64 shuffle(int64 v0, int64 v1, int i) {
 
 // x[i]
 static inline uniform float extract(float x, uniform int i) {
-    return __extract(x, i);
+    return floatbits(__extract_int32((int)intbits(x), i));
+}
+
+static inline uniform int extract(int x, uniform int i) {
+    return __extract_int32(x, i);
+}
+
+static inline uniform unsigned int extract(unsigned int x, uniform int i) {
+    return __extract_int32(x, (unsigned int)i);
+}
+
+static inline uniform double extract(double x, uniform int i) {
+    return doublebits(__extract_int64((int64)intbits(x), i));
+}
+
+static inline uniform int64 extract(int64 x, uniform int i) {
+    return __extract_int64(x, i);
+}
+
+static inline uniform unsigned int64 extract(unsigned int64 x, uniform int i) {
+    return __extract_int64(x, (unsigned int)i);
 }
 
 // x[i] = v
 static inline float insert(float x, uniform int i, uniform float v) {
-    return __insert(x, i, v);
-}
-
-static inline uniform int extract(int x, uniform int i) {
-    return intbits(extract(floatbits(x), i));
+    return floatbits(__insert_int32((int)intbits(x), i, (int)intbits(v)));
 }
 
 static inline int insert(int x, uniform int i, uniform int v) {
-    return intbits(insert(floatbits(x), i, floatbits(v)));
+    return __insert_int32(x, i, v);
 }
 
-static inline uniform unsigned int extract(unsigned int x, uniform int i) {
-    return intbits(extract(floatbits(x), i));
+static inline unsigned int insert(unsigned int x, uniform int i,
+                                  uniform unsigned int v) {
+    return __insert_int32(x, (unsigned int)i, v);
 }
 
-static inline unsigned int insert(unsigned int x, uniform int i, uniform unsigned int v) {
-    return intbits(insert(floatbits(x), i, floatbits(v)));
+static inline double insert(double x, uniform int i, uniform double v) {
+    return doublebits(__insert_int64((int64)intbits(x), i, (int64)intbits(v)));
 }
 
+static inline int64 insert(int64 x, uniform int i, uniform int64 v) {
+    return __insert_int64(x, i, v);
+}
+
+static inline unsigned int64 insert(unsigned int64 x, uniform int i,
+                                    uniform unsigned int64 v) {
+    return __insert_int64(x, (unsigned int)i, v);
+}
+
+
 static inline uniform bool any(bool v) {
     // We only care about whether "any" is true for the active program instances,
     // so we have to mask v with the current program mask.
@@ -185,20 +212,30 @@ static inline uniform bool all(bool v) {
 }
 
 static inline uniform int popcnt(uniform int v) {
-    return __popcnt(v);
+    return __popcnt_int32(v);
+}
+
+static inline uniform int popcnt(uniform int64 v) {
+    return (int32)__popcnt_int64(v);
 }
 
 static inline int popcnt(int v) {
     int r;
-    uniform int i;
-    for (i = 0; i < programCount; ++i)
+    for (uniform int i = 0; i < programCount; ++i)
+        r = insert(r, i, popcnt(extract(v, i)));
+    return (r & __mask);
+}
+
+static inline int popcnt(int64 v) {
+    int r;
+    for (uniform int i = 0; i < programCount; ++i)
         r = insert(r, i, popcnt(extract(v, i)));
     return (r & __mask);
 }
 
 static inline uniform int popcnt(bool v) {
     // As with any() and all(), only count across the active lanes
-    return __popcnt(__movmsk(v & __mask));
+    return __popcnt_int32(__movmsk(v & __mask));
 }
 
 static inline uniform int lanemask() {
@@ -270,6 +307,64 @@ static inline uniform unsigned int reduce_max(unsigned int v) {
     return __reduce_max_uint32(__mask ? v : 0);
 }
 
+
+static inline uniform double reduce_add(double x) {
+    // zero the lanes where the mask is off
+    return __reduce_add_double(__mask ? x : 0.);
+}
+
+static inline uniform double reduce_min(double v) {
+    int64 iflt_max = 0x7ff0000000000000; // infinity
+    // Must use __doublebits_varying_int64, not doublebits(), since with the
+    // latter the current mask enters into the returned result...
+    return __reduce_min_double(__mask ? v : __doublebits_varying_int64(iflt_max));
+}
+
+static inline uniform double reduce_max(double v) {
+    const uniform int64 iflt_neg_max = 0xfff0000000000000; // -infinity
+    // Must use __doublebits_varying_int64, not doublebits(), since with the
+    // latter the current mask enters into the returned result...
+    return __reduce_max_double(__mask ? v : __doublebits_varying_int64(iflt_neg_max));
+}
+
+static inline uniform int64 reduce_add(int64 x) {
+    // Zero out the values for lanes that aren't running
+    return __reduce_add_int64(x & (int64)__mask);
+}
+
+static inline uniform int64 reduce_min(int64 v) {
+    // Set values for non-running lanes to the maximum integer value so
+    // they don't affect the result.
+    int64 int_max = 0x7fffffffffffffff;
+    return __reduce_min_int64(__mask ? v : int_max);
+}
+
+static inline uniform int64 reduce_max(int64 v) {
+    // Set values for non-running lanes to the minimum integer value so
+    // they don't affect the result.
+    int64 int_min = 0x8000000000000000;
+    return __reduce_max_int64(__mask ? v : int_min);
+}
+
+static inline uniform unsigned int64 reduce_add(unsigned int64 x) {
+    // Set values for non-running lanes to zero so they don't affect the
+    // result.
+    return __reduce_add_int64(x & (int64)__mask);
+}
+
+static inline uniform unsigned int64 reduce_min(unsigned int64 v) {
+    // Set values for non-running lanes to the maximum unsigned integer
+    // value so they don't affect the result.
+    unsigned int64 uint_max = 0xffffffffffffffff;
+    return __reduce_min_uint64(__mask ? v : uint_max);
+}
+
+static inline uniform unsigned int64 reduce_max(unsigned int64 v) {
+    // Set values for non-running lanes to zero so they don't affect the
+    // result.
+    return __reduce_max_uint64(__mask ? v : 0);
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // packed load, store
 
@@ -425,14 +520,37 @@ static inline uniform float abs(uniform float a) {
     return floatbits(i);
 }
 
+static inline double abs(double a) {
+    // zeroing the high bit clears the sign
+    unsigned int64 i = intbits(a);
+    i &= 0x7fffffffffffffff;
+    return doublebits(i);
+}
+
+static inline uniform double abs(uniform double a) {
+    uniform unsigned int64 i = intbits(a);
+    i &= 0x7fffffffffffffff;
+    return doublebits(i);
+}
+
 static inline unsigned int signbits(float x) {
     unsigned int i = intbits(x);
-    return (i & 0x80000000u);
+    return (i & 0x80000000);
 }
 
 static inline uniform unsigned int signbits(uniform float x) {
     uniform unsigned int i = intbits(x);
-    return (i & 0x80000000u);
+    return (i & 0x80000000);
+}
+
+static inline unsigned int64 signbits(double x) {
+    unsigned int64 i = intbits(x);
+    return (i & 0x8000000000000000);
+}
+
+static inline uniform unsigned int64 signbits(uniform double x) {
+    uniform unsigned int64 i = intbits(x);
+    return (i & 0x8000000000000000);
 }
 
 static inline float round(float x) {
@@ -443,6 +561,14 @@ static inline uniform float round(uniform float x) {
     return __round_uniform_float(x);
 }
 
+static inline double round(double x) {
+    return __round_varying_double(x);
+}
+
+static inline uniform double round(uniform double x) {
+    return __round_uniform_double(x);
+}
+
 static inline float floor(float x) {
     return __floor_varying_float(x);
 }
@@ -451,6 +577,14 @@ static inline uniform float floor(uniform float x) {
     return __floor_uniform_float(x);
 }
 
+static inline double floor(double x) {
+    return __floor_varying_double(x);
+}
+
+static inline uniform double floor(uniform double x) {
+    return __floor_uniform_double(x);
+}
+
 static inline float ceil(float x) {
     return __ceil_varying_float(x);
 }
@@ -459,6 +593,14 @@ static inline uniform float ceil(uniform float x) {
     return __ceil_uniform_float(x);
 }
 
+static inline double ceil(double x) {
+    return __ceil_varying_double(x);
+}
+
+static inline uniform double ceil(uniform double x) {
+    return __ceil_uniform_double(x);
+}
+
 static inline float rcp(float v) {
     return __rcp_varying_float(v);
 }
@@ -467,14 +609,6 @@ static inline uniform float rcp(uniform float v) {
     return __rcp_uniform_float(v);
 }
 
-static inline float sqrt(float v) {
-    return __sqrt_varying_float(v);
-}
-
-static inline uniform float sqrt(uniform float v) {
-    return __sqrt_uniform_float(v);
-}
-
 static inline float min(float a, float b) {
     return __min_varying_float(a, b);
 }
@@ -483,6 +617,14 @@ static inline uniform float min(uniform float a, uniform float b) {
     return __min_uniform_float(a, b);
 }
 
+static inline double min(double a, double b) {
+    return __min_varying_double(a, b);
+}
+
+static inline uniform double min(uniform double a, uniform double b) {
+    return __min_uniform_double(a, b);
+}
+
 static inline float max(float a, float b) {
     return __max_varying_float(a, b);
 }
@@ -491,6 +633,14 @@ static inline uniform float max(uniform float a, uniform float b) {
     return __max_uniform_float(a, b);
 }
 
+static inline double max(double a, double b) {
+    return __max_varying_double(a, b);
+}
+
+static inline uniform double max(uniform double a, uniform double b) {
+    return __max_uniform_double(a, b);
+}
+
 static inline unsigned int min(unsigned int a, unsigned int b) {
     return __min_varying_uint32(a, b);
 }
 
@@ -523,6 +673,38 @@ static inline uniform int max(uniform int a, uniform int b) {
     return __max_uniform_int32(a, b);
 }
 
+static inline unsigned int64 min(unsigned int64 a, unsigned int64 b) {
+    return __min_varying_uint64(a, b);
+}
+
+static inline uniform unsigned int64 min(uniform unsigned int64 a, uniform unsigned int64 b) {
+    return __min_uniform_uint64(a, b);
+}
+
+static inline unsigned int64 max(unsigned int64 a, unsigned int64 b) {
+    return __max_varying_uint64(a, b);
+}
+
+static inline uniform unsigned int64 max(uniform unsigned int64 a, uniform unsigned int64 b) {
+    return __max_uniform_uint64(a, b);
+}
+
+static inline int64 min(int64 a, int64 b) {
+    return __min_varying_int64(a, b);
+}
+
+static inline uniform int64 min(uniform int64 a, uniform int64 b) {
+    return __min_uniform_int64(a, b);
+}
+
+static inline int64 max(int64 a, int64 b) {
+    return __max_varying_int64(a, b);
+}
+
+static inline uniform int64 max(uniform int64 a, uniform int64 b) {
+    return __max_uniform_int64(a, b);
+}
+
 static inline float clamp(float v, float low, float high) {
     return min(max(v, low), high);
 }
@@ -536,7 +718,16 @@ static inline unsigned int clamp(unsigned int v, unsigned int low, unsigned int
 }
 
 static inline uniform unsigned int clamp(uniform unsigned int v, uniform unsigned int low,
-                                         uniform unsigned int high) {
+                                         uniform unsigned int high) {
     return min(max(v, low), high);
 }
 
+static inline unsigned int64 clamp(unsigned int64 v, unsigned int64 low, unsigned int64 high) {
+    return min(max(v, low), high);
+}
+
+static inline uniform unsigned int64 clamp(uniform unsigned int64 v, uniform unsigned int64 low,
+                                           uniform unsigned int64 high) {
+    return min(max(v, low), high);
+}
+
@@ -548,8 +739,24 @@ static inline uniform int clamp(uniform int v, uniform int low, uniform int high
     return min(max(v, low), high);
 }
 
+static inline int64 clamp(int64 v, int64 low, int64 high) {
+    return min(max(v, low), high);
+}
+
+static inline uniform int64 clamp(uniform int64 v, uniform int64 low, uniform int64 high) {
+    return min(max(v, low), high);
+}
+
 ///////////////////////////////////////////////////////////////////////////
-// Transcendentals
+// Transcendentals (float precision)
+
+static inline float sqrt(float v) {
+    return __sqrt_varying_float(v);
+}
+
+static inline uniform float sqrt(uniform float v) {
+    return __sqrt_uniform_float(v);
+}
 
 static inline float rsqrt(float v) {
     return __rsqrt_varying_float(v);
@@ -612,7 +819,7 @@ static inline float sin(float x_full) {
         for (uniform int i = 0; i < programCount; ++i) {
             if ((mask & (1 << i)) == 0)
                 continue;
-            uniform float r = __stdlib_sin(extract(x_full, i));
+            uniform float r = __stdlib_sinf(extract(x_full, i));
             ret = insert(ret, i, r);
         }
         return ret;
@@ -669,7 +876,7 @@ static inline float sin(float x_full) {
 
 static inline uniform float sin(uniform float x_full) {
     if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
-        return __stdlib_sin(x_full);
+        return __stdlib_sinf(x_full);
     }
     else if (__math_lib == __math_lib_ispc ||
              __math_lib == __math_lib_ispc_fast) {
@@ -741,7 +948,7 @@ static inline float cos(float x_full) {
         for (uniform int i = 0; i < programCount; ++i) {
             if ((mask & (1 << i)) == 0)
                 continue;
-            uniform float r = __stdlib_cos(extract(x_full, i));
+            uniform float r = __stdlib_cosf(extract(x_full, i));
             ret = insert(ret, i, r);
         }
         return ret;
@@ -797,7 +1004,7 @@ static inline float cos(float x_full) {
 
 static inline uniform float cos(uniform float x_full) {
     if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
-        return __stdlib_cos(x_full);
+        return __stdlib_cosf(x_full);
     }
     else if (__math_lib == __math_lib_ispc ||
              __math_lib == __math_lib_ispc_fast) {
@@ -868,7 +1075,7 @@ static inline void sincos(float x_full, reference float sin_result, reference fl
             if ((mask & (1 << i)) == 0)
                 continue;
             uniform float s, c;
-            __stdlib_sincos(extract(x_full, i), s, c);
+            __stdlib_sincosf(extract(x_full, i), s, c);
             sin_result = insert(sin_result, i, s);
             cos_result = insert(cos_result, i, c);
         }
@@ -930,10 +1137,10 @@ static inline void sincos(float x_full, reference float sin_result, reference fl
 
 static inline void sincos(uniform float x_full,
                           reference uniform float sin_result,
-                          reference uniform float cos_result) {
+                          reference uniform float cos_result) {
     if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
-        __stdlib_sincos(x_full, sin_result, cos_result);
+        __stdlib_sincosf(x_full, sin_result, cos_result);
     }
     else if (__math_lib == __math_lib_ispc ||
              __math_lib == __math_lib_ispc_fast) {
@@ -1001,7 +1208,7 @@ static inline float tan(float x_full) {
         for (uniform int i = 0; i < programCount; ++i) {
             if ((mask & (1 << i)) == 0)
                 continue;
-            uniform float r = __stdlib_tan(extract(x_full, i));
+            uniform float r = __stdlib_tanf(extract(x_full, i));
             ret = insert(ret, i, r);
         }
         return ret;
@@ -1075,7 +1282,7 @@ static inline float tan(float x_full) {
 
 static inline uniform float tan(uniform float x_full) {
     if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
-        return __stdlib_tan(x_full);
+        return __stdlib_tanf(x_full);
     }
     else if (__math_lib == __math_lib_ispc ||
             __math_lib == __math_lib_ispc_fast) {
@@ -1153,7 +1360,7 @@ static inline float atan(float x_full) {
        for (uniform int i = 0; i < programCount; ++i) {
            if ((mask & (1 << i)) == 0)
                continue;
-           uniform float r = __stdlib_atan(extract(x_full, i));
+           uniform float r = __stdlib_atanf(extract(x_full, i));
            ret = insert(ret, i, r);
        }
        return ret;
@@ -1199,7 +1406,7 @@ static inline float atan(float x_full) {
 
 static inline uniform float atan(uniform float x_full) {
     if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
-        return __stdlib_atan(x_full);
+        return __stdlib_atanf(x_full);
     }
     else if (__math_lib == __math_lib_ispc ||
              __math_lib == __math_lib_ispc_fast) {
@@ -1249,7 +1456,7 @@ static inline float atan2(float y, float x) {
         for (uniform int i = 0; i < programCount; ++i) {
             if ((mask & (1 << i)) == 0)
                 continue;
-            uniform float r = __stdlib_atan2(extract(y, i), extract(x, i));
+            uniform float r = __stdlib_atan2f(extract(y, i), extract(x, i));
             ret = insert(ret, i, r);
         }
         return ret;
@@ -1283,7 +1490,7 @@ static inline float atan2(float y, float x) {
 
 static inline uniform float atan2(uniform float y, uniform float x) {
     if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
-        return __stdlib_atan2(y, x);
+        return __stdlib_atan2f(y, x);
     }
     else if (__math_lib == __math_lib_ispc ||
              __math_lib == __math_lib_ispc_fast) {
@@ -1310,7 +1517,7 @@ static inline float exp(float x_full) {
         for (uniform int i = 0; i < programCount; ++i) {
             if ((mask & (1 << i)) == 0)
                 continue;
-            uniform float r = __stdlib_exp(extract(x_full, i));
+            uniform float r = __stdlib_expf(extract(x_full, i));
             ret = insert(ret, i, r);
         }
         return ret;
@@ -1384,7 +1591,7 @@ static inline float exp(float x_full) {
 
 static inline uniform float exp(uniform float x_full) {
     if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
-        return __stdlib_exp(x_full);
+        return __stdlib_expf(x_full);
     }
     else if (__math_lib == __math_lib_ispc_fast) {
         uniform float z = floor(1.44269504088896341f * x_full + 0.5f);
@@ -1485,7 +1692,7 @@ static inline void __range_reduce_log(float input, reference float reduced, refe
 
 static inline void __range_reduce_log(uniform float input,
                                       reference uniform float reduced,
-                                      reference uniform int exponent) {
+                                      reference uniform int exponent) {
     uniform int int_version = intbits(input);
     static const uniform int nonexponent_mask = 0x807FFFFF;
@@ -1509,7 +1716,7 @@ static inline float log(float x_full) {
         for (uniform int i = 0; i < programCount; ++i) {
             if ((mask & (1 << i)) == 0)
                 continue;
-            uniform float r = __stdlib_log(extract(x_full, i));
+            uniform float r = __stdlib_logf(extract(x_full, i));
             ret = insert(ret, i, r);
         }
         return ret;
@@ -1592,7 +1799,7 @@ static inline float log(float x_full) {
 
 static inline uniform float log(uniform float x_full) {
     if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
-        return __stdlib_log(x_full);
+        return __stdlib_logf(x_full);
     }
     else if (__math_lib == __math_lib_ispc_fast) {
         uniform int e;
@@ -1679,7 +1886,7 @@ static inline float pow(float a, float b) {
         for (uniform int i = 0; i < programCount; ++i) {
             if ((mask & (1 << i)) == 0)
                 continue;
-            uniform float r = __stdlib_pow(extract(a, i), extract(b, i));
+            uniform float r = __stdlib_powf(extract(a, i), extract(b, i));
             ret = insert(ret, i, r);
         }
         return ret;
@@ -1693,7 +1900,7 @@ static inline float pow(float a, float b) {
 
 static inline uniform float pow(uniform float a, uniform float b) {
     if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
-        return __stdlib_pow(a, b);
+        return __stdlib_powf(a, b);
     }
     else if (__math_lib == __math_lib_ispc ||
              __math_lib == __math_lib_ispc_fast) {
@@ -1701,6 +1908,273 @@ static inline uniform float pow(uniform float a, uniform float b) {
     }
 }
 
+///////////////////////////////////////////////////////////////////////////
+// Transcendentals (double precision)
+
+static inline double sqrt(double v) {
+    return __sqrt_varying_double(v);
+}
+
+static inline uniform double sqrt(uniform double v) {
+    return __sqrt_uniform_double(v);
+}
+
+static inline double ldexp(double x, int n) {
+    unsigned int64 ex = 0x7ff0000000000000;
+    unsigned int64 ix = intbits(x);
+    ex &= ix;
+    ix = ix & ~0x7ff0000000000000;      // clear exponent
+    int64 n64 = ((int64)n << 52) + ex;
+    ix |= n64;                          // insert new exponent
+    return doublebits(ix);
+}
+
+static inline uniform double ldexp(uniform double x, uniform int n) {
+    uniform unsigned int64 ex = 0x7ff0000000000000;
+    uniform unsigned int64 ix = intbits(x);
+    ex &= ix;
+    ix = ix & ~0x7ff0000000000000;      // clear exponent
+    uniform int64 n64 = ((int64)n << 52) + ex;
+    ix |= n64;                          // insert new exponent
+    return doublebits(ix);
+}
+
+static inline double frexp(double x, reference int pw2) {
+    unsigned int64 ex = 0x7ff0000000000000;   // exponent mask
+    unsigned int64 ix = intbits(x);
+    ex &= ix;
+    ix &= ~0x7ff0000000000000;          // clear exponent
+    pw2 = (int)(ex >> 52) - 1022;       // compute exponent
+    ix |= 0x3fe0000000000000;           // insert exponent +1 in x
+    return doublebits(ix);
+}
+
+static inline uniform double frexp(uniform double x, reference uniform int pw2) {
+    uniform unsigned int64 ex = 0x7ff0000000000000;   // exponent mask
+    uniform unsigned int64 ix = intbits(x);
+    ex &= ix;
+    ix &= ~0x7ff0000000000000;          // clear exponent
+    pw2 = (int)(ex >> 52) - 1022;       // compute exponent
+    ix |= 0x3fe0000000000000;           // insert exponent +1 in x
+    return doublebits(ix);
+}
+
+static inline double sin(double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return sin((float)x);
+    else {
+        double ret;
+        uniform int mask = lanemask();
+        for (uniform int i = 0; i < programCount; ++i) {
+            if ((mask & (1 << i)) == 0)
+                continue;
+            uniform double r = __stdlib_sin(extract(x, i));
+            ret = insert(ret, i, r);
+        }
+        return ret;
+    }
+}
+
+static inline uniform double sin(uniform double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return sin((float)x);
+    else
+        return __stdlib_sin(x);
+}
+
+static inline double cos(double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return cos((float)x);
+    else {
+        double ret;
+        uniform int mask = lanemask();
+        for (uniform int i = 0; i < programCount; ++i) {
+            if ((mask & (1 << i)) == 0)
+                continue;
+            uniform double r = __stdlib_cos(extract(x, i));
+            ret = insert(ret, i, r);
+        }
+        return ret;
+    }
+}
+
+static inline uniform double cos(uniform double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return cos((float)x);
+    else
+        return __stdlib_cos(x);
+}
+
+static inline void sincos(double x, reference double sin_result,
+                          reference double cos_result) {
+    if (__math_lib == __math_lib_ispc_fast) {
+        float sr, cr;
+        sincos((float)x, sr, cr);
+        sin_result = sr;
+        cos_result = cr;
+    }
+    else {
+        uniform int mask = lanemask();
+        for (uniform int i = 0; i < programCount; ++i) {
+            uniform double sr, cr;
+            if ((mask & (1 << i)) == 0)
+                continue;
+            __stdlib_sincos(extract(x, i), sr, cr);
+            sin_result = insert(sin_result, i, sr);
+            cos_result = insert(cos_result, i, cr);
+        }
+    }
+}
+
+static inline void sincos(uniform double x, reference uniform double sin_result,
+                          reference uniform double cos_result) {
+    if (__math_lib == __math_lib_ispc_fast) {
+        uniform float sr, cr;
+        sincos((uniform float)x, sr, cr);
+        sin_result = sr;
+        cos_result = cr;
+    }
+    else
+        __stdlib_sincos(x, sin_result, cos_result);
+}
+
+static inline double tan(double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return tan((float)x);
+    else {
+        double ret;
+        uniform int mask = lanemask();
+        for (uniform int i = 0; i < programCount; ++i) {
+            if ((mask & (1 << i)) == 0)
+                continue;
+            uniform double r = __stdlib_tan(extract(x, i));
+            ret = insert(ret, i, r);
+        }
+        return ret;
+    }
+}
+
+static inline uniform double tan(uniform double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return tan((float)x);
+    else
+        return __stdlib_tan(x);
+}
+
+static inline double atan(double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return atan((float)x);
+    else {
+        double ret;
+        uniform int mask = lanemask();
+        for (uniform int i = 0; i < programCount; ++i) {
+            if ((mask & (1 << i)) == 0)
+                continue;
+            uniform double r = __stdlib_atan(extract(x, i));
+            ret = insert(ret, i, r);
+        }
+        return ret;
+    }
+}
+
+static inline uniform double atan(uniform double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return atan((float)x);
+    else
+        return __stdlib_atan(x);
+}
+
+static inline double atan2(double y, double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return atan2((float)y, (float)x);
+    else {
+        double ret;
+        uniform int mask = lanemask();
+        for (uniform int i = 0; i < programCount; ++i) {
+            if ((mask & (1 << i)) == 0)
+                continue;
+            uniform double r = __stdlib_atan2(extract(y, i), extract(x, i));
+            ret = insert(ret, i, r);
+        }
+        return ret;
+    }
+}
+
+static inline uniform double atan2(uniform double y, uniform double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return atan2((float)y, (float)x);
+    else
+        return __stdlib_atan2(y, x);
+}
+
+static inline double exp(double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return exp((float)x);
+    else {
+        double ret;
+        uniform int mask = lanemask();
+        for (uniform int i = 0; i < programCount; ++i) {
+            if ((mask & (1 << i)) == 0)
+                continue;
+            uniform double r = __stdlib_exp(extract(x, i));
+            ret = insert(ret, i, r);
+        }
+        return ret;
+    }
+}
+
+static inline uniform double exp(uniform double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return exp((float)x);
+    else
+        return __stdlib_exp(x);
+}
+
+static inline double log(double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return log((float)x);
+    else {
+        double ret;
+        uniform int mask = lanemask();
+        for (uniform int i = 0; i < programCount; ++i) {
+            if ((mask & (1 << i)) == 0)
+                continue;
+            uniform double r = __stdlib_log(extract(x, i));
+            ret = insert(ret, i, r);
+        }
+        return ret;
+    }
+}
+
+static inline uniform double log(uniform double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return log((float)x);
+    else
+        return __stdlib_log(x);
+}
+
+static inline double pow(double a, double b) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return pow((float)a, (float)b);
+    else {
+        double ret;
+        uniform int mask = lanemask();
+        for (uniform int i = 0; i < programCount; ++i) {
+            if ((mask & (1 << i)) == 0)
+                continue;
+            uniform double r = __stdlib_pow(extract(a, i), extract(b, i));
+            ret = insert(ret, i, r);
+        }
+        return ret;
+    }
+}
+
+static inline uniform double pow(uniform double a, uniform double b) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return pow((float)a, (float)b);
+    else
+        return __stdlib_pow(a, b);
+}
 
 ///////////////////////////////////////////////////////////////////////////
 // RNG stuff
@@ -1709,7 +2183,7 @@ struct RNGState {
     unsigned int z1, z2, z3, z4;
 };
 
-static inline unsigned int random(reference uniform RNGState state)
+static inline unsigned int random(reference RNGState state)
 {
     unsigned int b;
@@ -1724,14 +2198,14 @@ static inline unsigned int random(reference uniform RNGState state)
     return (state.z1 ^ state.z2 ^ state.z3 ^ state.z4);
 }
 
-static inline float frandom(reference uniform RNGState state)
+static inline float frandom(reference RNGState state)
 {
     return ((int)(random(state) & ((1<<24)-1))) / (float)(1 << 24);
 }
 
-static inline uniform unsigned int __seed4(reference uniform RNGState state,
-                                           uniform int start,
-                                           uniform unsigned int seed) {
+static inline uniform unsigned int __seed4(reference RNGState state,
+                                           uniform int start,
+                                           uniform unsigned int seed) {
     uniform unsigned int c1 = 0xf0f0f0f0;
     uniform unsigned int c2 = 0x0f0f0f0f;
diff --git a/stdlib.m4 b/stdlib.m4
index 385d19e0..6b781b17 100644
--- a/stdlib.m4
+++ b/stdlib.m4
@@ -136,6 +136,26 @@ define(`reduce8by4', `
 )
 
+
+;; Apply a unary function to the 4-vector in %0, return the vector result.
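+;; (used by stdlib-sse2.ll, e.g. unary1to4(double, @floor), to scalarize
+;; the double-precision round/floor/ceil calls through libm)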
+;; $1: scalar type of result
+;; $2: name of scalar function to call
+
+define(`unary1to4', `
+  %v_0 = extractelement <4 x $1> %0, i32 0
+  %r_0 = call $1 $2($1 %v_0)
+  %ret_0 = insertelement <4 x $1> undef, $1 %r_0, i32 0
+  %v_1 = extractelement <4 x $1> %0, i32 1
+  %r_1 = call $1 $2($1 %v_1)
+  %ret_1 = insertelement <4 x $1> %ret_0, $1 %r_1, i32 1
+  %v_2 = extractelement <4 x $1> %0, i32 2
+  %r_2 = call $1 $2($1 %v_2)
+  %ret_2 = insertelement <4 x $1> %ret_1, $1 %r_2, i32 2
+  %v_3 = extractelement <4 x $1> %0, i32 3
+  %r_3 = call $1 $2($1 %v_3)
+  %ret_3 = insertelement <4 x $1> %ret_2, $1 %r_3, i32 3
+  ret <4 x $1> %ret_3
+')
+
 ;; Given a unary function that takes a 2-wide vector and a 4-wide vector
 ;; that we'd like to apply it to, extract 2 2-wide vectors from the 4-wide
 ;; vector, apply it, and return the corresponding 4-wide vector result
@@ -286,6 +306,49 @@
 ret <8 x float> %ret
 '
 )
 
+define(`round4to8double', `
+%v0 = shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+%v1 = shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+%r0 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v0, i32 $2)
+%r1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v1, i32 $2)
+%ret = shufflevector <4 x double> %r0, <4 x double> %r1,
+       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ret <8 x double> %ret
+'
+)
+
+; and similarly, building the wider double rounds out of 2-wide SSE4.1 roundpd...
+
+define(`round2to4double', `
+%v0 = shufflevector <4 x double> $1, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+%v1 = shufflevector <4 x double> $1, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+%r0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v0, i32 $2)
+%r1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v1, i32 $2)
+%ret = shufflevector <2 x double> %r0, <2 x double> %r1,
+       <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ret <4 x double> %ret
+'
+)
+
+define(`round2to8double', `
+%v0 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+%v1 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> <i32 2, i32 3>
+%v2 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> <i32 4, i32 5>
+%v3 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> <i32 6, i32 7>
+%r0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v0, i32 $2)
+%r1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v1, i32 $2)
+%r2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v2, i32 $2)
+%r3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v3, i32 $2)
+%ret0 = shufflevector <2 x double> %r0, <2 x double> %r1,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+%ret1 = shufflevector <2 x double> %r2, <2 x double> %r3,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+%ret = shufflevector <4 x double> %ret0, <4 x double> %ret1,
+       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ret <8 x double> %ret
+'
+)
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; forloop macro
 
@@ -503,15 +566,26 @@ declare i1 @__is_compile_time_constant_varying_int32(<$1 x i32>)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; vector ops
 
-define internal float @__extract(<$1 x float>, i32) nounwind readnone alwaysinline {
-  %extract = extractelement <$1 x float> %0, i32 %1
-  ret float %extract
+define internal i32 @__extract_int32(<$1 x i32>, i32) nounwind readnone alwaysinline {
+  %extract = extractelement <$1 x i32> %0, i32 %1
+  ret i32 %extract
 }
 
-define internal <$1 x float> @__insert(<$1 x float>, i32,
-                                       float) nounwind readnone alwaysinline {
-  %insert = insertelement <$1 x float> %0, float %2, i32 %1
-  ret <$1 x float> %insert
+define internal <$1 x i32> @__insert_int32(<$1 x i32>, i32,
+                                           i32) nounwind readnone alwaysinline {
+  %insert = insertelement <$1 x i32> %0, i32 %2, i32 %1
+
+define internal i64 @__extract_int64(<$1 x i64>, i32) nounwind readnone alwaysinline {
+  %extract = extractelement <$1 x i64> %0, i32 %1
+  ret i64 %extract
+}
+
+define internal <$1 x i64> @__insert_int64(<$1 x i64>, i32,
+                                           i64) nounwind readnone alwaysinline {
+  %insert = insertelement <$1 x i64> %0, i64 %2, i32 %1
+  ret <$1 x i64> %insert
 }
 
 shuffles($1, float, float, 4)
@@ -588,51 +662,106 @@
 declare float @expf(float) nounwind readnone
 declare float @logf(float) nounwind readnone
 declare float @powf(float, float) nounwind readnone
 
-define internal float @__stdlib_sin(float) nounwind readnone alwaysinline {
+define internal float @__stdlib_sinf(float) nounwind readnone alwaysinline {
   %r = call float @sinf(float %0)
   ret float %r
 }
 
-define internal float @__stdlib_cos(float) nounwind readnone alwaysinline {
+define internal float @__stdlib_cosf(float) nounwind readnone alwaysinline {
   %r = call float @cosf(float %0)
   ret float %r
 }
 
-define internal void @__stdlib_sincos(float, float *, float *) nounwind readnone alwaysinline {
+define internal void @__stdlib_sincosf(float, float *, float *) nounwind alwaysinline {
   call void @sincosf(float %0, float *%1, float *%2)
   ret void
 }
 
-define internal float @__stdlib_tan(float) nounwind readnone alwaysinline {
+define internal float @__stdlib_tanf(float) nounwind readnone alwaysinline {
   %r = call float @tanf(float %0)
   ret float %r
 }
 
-define internal float @__stdlib_atan(float) nounwind readnone alwaysinline {
+define internal float @__stdlib_atanf(float) nounwind readnone alwaysinline {
   %r = call float @atanf(float %0)
   ret float %r
 }
 
-define internal float @__stdlib_atan2(float, float) nounwind readnone alwaysinline {
+define internal float @__stdlib_atan2f(float, float) nounwind readnone alwaysinline {
   %r = call float @atan2f(float %0, float %1)
   ret float %r
 }
 
-define internal float @__stdlib_log(float) nounwind readnone alwaysinline {
+define internal float @__stdlib_logf(float) nounwind readnone alwaysinline {
   %r = call float @logf(float %0)
   ret float %r
 }
 
-define internal float @__stdlib_exp(float) nounwind readnone alwaysinline {
+define internal float @__stdlib_expf(float) nounwind readnone alwaysinline {
   %r = call float @expf(float %0)
   ret float %r
 }
 
-define internal float @__stdlib_pow(float, float) nounwind readnone alwaysinline {
+define internal float @__stdlib_powf(float, float) nounwind readnone alwaysinline {
   %r = call float @powf(float %0, float %1)
   ret float %r
 }
 
+declare double @sin(double) nounwind readnone
+declare double @cos(double) nounwind readnone
+declare void @sincos(double, double *, double *) nounwind
+declare double @tan(double) nounwind readnone
+declare double @atan(double) nounwind readnone
+declare double @atan2(double, double) nounwind readnone
+declare double @exp(double) nounwind readnone
+declare double @log(double) nounwind readnone
+declare double @pow(double, double) nounwind readnone
+
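+;; Double-precision versions of the libm wrappers; these mirror the float
+;; wrappers above (now suffixed with "f"), each forwarding directly to the
+;; corresponding libm entry point.
+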
+define internal double @__stdlib_sin(double) nounwind readnone alwaysinline {
+  %r = call double @sin(double %0)
+  ret double %r
+}
+
+define internal double @__stdlib_cos(double) nounwind readnone alwaysinline {
+  %r = call double @cos(double %0)
+  ret double %r
+}
+
+define internal void @__stdlib_sincos(double, double *, double *) nounwind alwaysinline {
+  call void @sincos(double %0, double *%1, double *%2)
+  ret void
+}
+
+define internal double @__stdlib_tan(double) nounwind readnone alwaysinline {
+  %r = call double @tan(double %0)
+  ret double %r
+}
+
+define internal double @__stdlib_atan(double) nounwind readnone alwaysinline {
+  %r = call double @atan(double %0)
+  ret double %r
+}
+
+define internal double @__stdlib_atan2(double, double) nounwind readnone alwaysinline {
+  %r = call double @atan2(double %0, double %1)
+  ret double %r
+}
+
+define internal double @__stdlib_log(double) nounwind readnone alwaysinline {
+  %r = call double @log(double %0)
+  ret double %r
+}
+
+define internal double @__stdlib_exp(double) nounwind readnone alwaysinline {
+  %r = call double @exp(double %0)
+  ret double %r
+}
+
+define internal double @__stdlib_pow(double, double) nounwind readnone alwaysinline {
+  %r = call double @pow(double %0, double %1)
+  ret double %r
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; atomics and memory barriers
 
@@ -676,6 +805,52 @@
 global_atomic_exchange($1, i64, int64)
 ')
 
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; 64-bit integer min and max functions
+
+;; utility macro used by int64minmax below.  This shouldn't be called by
+;; target .ll files directly.
+;; $1: target vector width
+;; $2: {min,max} (used in constructing function names)
+;; $3: {int64,uint64} (used in constructing function names)
+;; $4: {slt,sgt,ult,ugt} comparison operator to use
+
+define(`i64minmax', `
+define internal i64 @__$2_uniform_$3(i64, i64) nounwind alwaysinline readnone {
+  %c = icmp $4 i64 %0, %1
+  %r = select i1 %c, i64 %0, i64 %1
+  ret i64 %r
+}
+
+define internal <$1 x i64> @__$2_varying_$3(<$1 x i64>, <$1 x i64>) nounwind alwaysinline readnone {
+  %rptr = alloca <$1 x i64>
+  %r64ptr = bitcast <$1 x i64> * %rptr to i64 *
+
+  forloop(i, 0, eval($1-1), `
+  %v0_`'i = extractelement <$1 x i64> %0, i32 i
+  %v1_`'i = extractelement <$1 x i64> %1, i32 i
+  %c_`'i = icmp $4 i64 %v0_`'i, %v1_`'i
+  %v_`'i = select i1 %c_`'i, i64 %v0_`'i, i64 %v1_`'i
+  %ptr_`'i = getelementptr i64 * %r64ptr, i32 i
+  store i64 %v_`'i, i64 * %ptr_`'i
+')
+
+  %ret = load <$1 x i64> * %rptr
+  ret <$1 x i64> %ret
+}
+')
+
+;; this is the macro that target .ll files should call; it just takes the
+;; target vector width as a parameter
+
+define(`int64minmax', `
+i64minmax($1,min,int64,slt)
+i64minmax($1,max,int64,sgt)
+i64minmax($1,min,uint64,ult)
+i64minmax($1,max,uint64,ugt)
+')
+
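+;; For example, an 8-wide target invokes int64minmax(8), which expands into
+;; definitions of __{min,max}_{uniform,varying}_{int64,uint64} at that
+;; vector width.
+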
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Definitions of 8 and 16-bit load and store functions
 ;;
diff --git a/tests/double-abs-1.ispc b/tests/double-abs-1.ispc
new file mode 100644
index 00000000..f7a2ff28
--- /dev/null
+++ b/tests/double-abs-1.ispc
@@ -0,0 +1,11 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    double a = aFOO[programIndex];
+    RET[programIndex] = abs(-a);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/double-abs.ispc b/tests/double-abs.ispc
new file mode 100644
index 00000000..1ea03678
--- /dev/null
+++ b/tests/double-abs.ispc
@@ -0,0 +1,11 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    double a = aFOO[programIndex];
+    RET[programIndex] = abs(a);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/double-max-1.ispc b/tests/double-max-1.ispc
new file mode 100644
index 00000000..74ebce2c
--- /dev/null
+++ b/tests/double-max-1.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    double a = aFOO[programIndex];
+    double b = -2. * a;
+    RET[programIndex] = max(a,b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 * (1 + programIndex);
+}
diff --git a/tests/double-max.ispc b/tests/double-max.ispc
new file mode 100644
index 00000000..9f083fa7
--- /dev/null
+++ b/tests/double-max.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    double a = aFOO[programIndex];
+    double b = 2. * a;
+    RET[programIndex] = max(a,b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 2 * (1 + programIndex);
+}
diff --git a/tests/double-min-1.ispc b/tests/double-min-1.ispc
new file mode 100644
index 00000000..ffe373cb
--- /dev/null
+++ b/tests/double-min-1.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    double a = aFOO[programIndex];
+    double b = -2. * a;
+    RET[programIndex] = min(a,b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = -2 * (1 + programIndex);
+}
diff --git a/tests/double-min.ispc b/tests/double-min.ispc
new file mode 100644
index 00000000..664d410d
--- /dev/null
+++ b/tests/double-min.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    double a = aFOO[programIndex];
+    double b = 2. * a;
+    RET[programIndex] = min(a,b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/double-sqrt.ispc b/tests/double-sqrt.ispc
new file mode 100644
index 00000000..41cfa8b6
--- /dev/null
+++ b/tests/double-sqrt.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    double a = aFOO[programIndex];
+    if (programIndex & 1) {
+        a *= a;
+        RET[programIndex] = sqrt(a);
+    }
+    else
+        RET[programIndex] = a;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/extract-1.ispc b/tests/extract-1.ispc
new file mode 100644
index 00000000..220107cb
--- /dev/null
+++ b/tests/extract-1.ispc
@@ -0,0 +1,11 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    double a = programIndex;
+    RET[programIndex] = extract(a, 3);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 3;
+}
diff --git a/tests/frexp-double-1.ispc b/tests/frexp-double-1.ispc
new file mode 100644
index 00000000..96890dd9
--- /dev/null
+++ b/tests/frexp-double-1.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; }
+
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    double a = (1<
diff --git a/tests/reduce-max-double.ispc b/tests/reduce-max-double.ispc
new file mode 100644
--- /dev/null
+++ b/tests/reduce-max-double.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    double v = aFOO[programIndex];
+    uniform float m;
+    if (v >= 3)
+        m = reduce_max(-v);
+    RET[programIndex] = m;
+}
+
+export void result(uniform float RET[]) { RET[programIndex] = -3; }
diff --git a/tests/reduce-max-int64.ispc b/tests/reduce-max-int64.ispc
new file mode 100644
index 00000000..08641a9a
--- /dev/null
+++ b/tests/reduce-max-int64.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float v = aFOO[programIndex];
+    uniform float m;
+    if (v >= 3)
+        m = reduce_max(-(int64)v);
+    RET[programIndex] = m;
+}
+
+export void result(uniform float RET[]) { RET[programIndex] = -3; }
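+// (the other tests show aFOO[] holding 1 + programIndex, so the lanes with
+// v >= 3 see -(int64)v values of -3, -4, ..., whose maximum is -3)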
diff --git a/tests/reduce-max-uint64.ispc b/tests/reduce-max-uint64.ispc
new file mode 100644
index 00000000..ce5e52d4
--- /dev/null
+++ b/tests/reduce-max-uint64.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float v = aFOO[programIndex];
+    uniform float m;
+    if (v < 3)
+        m = reduce_max((unsigned int64)v);
+    RET[programIndex] = m;
+}
+
+export void result(uniform float RET[]) { RET[programIndex] = 2; }
diff --git a/tests/reduce-min-double.ispc b/tests/reduce-min-double.ispc
new file mode 100644
index 00000000..866aa57e
--- /dev/null
+++ b/tests/reduce-min-double.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    double v = aFOO[programIndex];
+    uniform float m;
+    if (v > 0 && v < 3)
+        m = reduce_min(-v);
+    RET[programIndex] = m;
+}
+
+export void result(uniform float RET[]) { RET[programIndex] = -2; }
diff --git a/tests/reduce-min-int.ispc b/tests/reduce-min-int.ispc
index 4f7bf0b8..b943f323 100644
--- a/tests/reduce-min-int.ispc
+++ b/tests/reduce-min-int.ispc
@@ -7,7 +7,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float v = aFOO[programIndex];
     uniform float m;
     if (v >= 0 && v < 2)
-        m = reduce_min(-v);
+        m = reduce_min(-(int)v);
     RET[programIndex] = m;
 }
diff --git a/tests/reduce-min-int64.ispc b/tests/reduce-min-int64.ispc
new file mode 100644
index 00000000..0fa74e61
--- /dev/null
+++ b/tests/reduce-min-int64.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float v = aFOO[programIndex];
+    uniform float m;
+    if (v >= 0 && v < 2)
+        m = reduce_min(-(int64)v);
+    RET[programIndex] = m;
+}
+
+export void result(uniform float RET[]) { RET[programIndex] = -1; }
diff --git a/tests/reduce-min-uint64.ispc b/tests/reduce-min-uint64.ispc
new file mode 100644
index 00000000..2290dfd6
--- /dev/null
+++ b/tests/reduce-min-uint64.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float v = aFOO[programIndex];
+    uniform float m;
+    if (v >= 3)
+        m = reduce_min((unsigned int64)v);
+    RET[programIndex] = m;
+}
+
+export void result(uniform float RET[]) { RET[programIndex] = 3; }
diff --git a/tests/uint64-max-1.ispc b/tests/uint64-max-1.ispc
new file mode 100644
index 00000000..086fc5ff
--- /dev/null
+++ b/tests/uint64-max-1.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    unsigned int64 a = aFOO[programIndex];
+    unsigned int64 b = 2;
+    RET[programIndex] = max(a,b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = max(2, 1 + programIndex);
+}
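+// (aFOO[] holds small positive integers, so the float-to-unsigned-int64
+// conversion of a above is exact)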
diff --git a/tests/uint64-max.ispc b/tests/uint64-max.ispc
new file mode 100644
index 00000000..c765ef7f
--- /dev/null
+++ b/tests/uint64-max.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    unsigned int64 a = aFOO[programIndex];
+    unsigned int64 b = 2. * a;
+    RET[programIndex] = max(a,b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 2 * (1 + programIndex);
+}
diff --git a/tests/uint64-min-1.ispc b/tests/uint64-min-1.ispc
new file mode 100644
index 00000000..750098d2
--- /dev/null
+++ b/tests/uint64-min-1.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    unsigned int64 a = aFOO[programIndex];
+    unsigned int64 b = 2 * a;
+    RET[programIndex] = min(a,b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (1 + programIndex);
+}
diff --git a/tests/uint64-min.ispc b/tests/uint64-min.ispc
new file mode 100644
index 00000000..254846eb
--- /dev/null
+++ b/tests/uint64-min.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    unsigned int64 a = aFOO[programIndex];
+    unsigned int64 b = 2. * a;
+    RET[programIndex] = min(a,b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
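+// (b = 2. * a is computed in double precision and truncated back to
+// unsigned int64; since a >= 1 here, min(a, 2*a) is exactly a)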