Finish support for 64-bit types in stdlib. Fixes issue #14.

Add much more support for double and int64 types in the standard library, basically supporting everything for them that is supported for floats and int32s.  (The notable exceptions being the approximate rcp() and rsqrt() functions, which don't really have sensible analogs for doubles (or at least no built-in instructions).)
This commit is contained in:
Matt Pharr
2011-07-07 13:25:55 +01:00
parent f1aaf0115e
commit 5a53a43ed0
49 changed files with 1727 additions and 128 deletions

View File

@@ -104,6 +104,8 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
AtomicType::UniformInt64, false);
else if (t == LLVMTypes::FloatPointerType)
return new ReferenceType(AtomicType::UniformFloat, false);
else if (t == LLVMTypes::DoublePointerType)
return new ReferenceType(AtomicType::UniformDouble, false);
else if (t == LLVMTypes::Int32VectorPointerType)
return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt32 :
AtomicType::VaryingInt32, false);
@@ -112,6 +114,8 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
AtomicType::VaryingInt64, false);
else if (t == LLVMTypes::FloatVectorPointerType)
return new ReferenceType(AtomicType::VaryingFloat, false);
else if (t == LLVMTypes::DoubleVectorPointerType)
return new ReferenceType(AtomicType::VaryingDouble, false);
else if (llvm::isa<const llvm::PointerType>(t)) {
const llvm::PointerType *pt = llvm::dyn_cast<const llvm::PointerType>(t);

View File

@@ -50,6 +50,7 @@ const llvm::Type *LLVMTypes::Int64PointerType = NULL;
const llvm::Type *LLVMTypes::FloatType = NULL;
const llvm::Type *LLVMTypes::FloatPointerType = NULL;
const llvm::Type *LLVMTypes::DoubleType = NULL;
const llvm::Type *LLVMTypes::DoublePointerType = NULL;
const llvm::VectorType *LLVMTypes::MaskType = NULL;
const llvm::VectorType *LLVMTypes::BoolVectorType = NULL;
@@ -61,6 +62,7 @@ const llvm::Type *LLVMTypes::Int64VectorPointerType = NULL;
const llvm::VectorType *LLVMTypes::FloatVectorType = NULL;
const llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
const llvm::VectorType *LLVMTypes::DoubleVectorType = NULL;
const llvm::Type *LLVMTypes::DoubleVectorPointerType = NULL;
const llvm::ArrayType *LLVMTypes::VoidPointerVectorType = NULL;
llvm::Constant *LLVMTrue = NULL;
@@ -83,6 +85,7 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
LLVMTypes::FloatType = llvm::Type::getFloatTy(*ctx);
LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0);
LLVMTypes::DoubleType = llvm::Type::getDoubleTy(*ctx);
LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0);
// Note that both the mask and bool vectors are vector of int32s
// (not i1s). LLVM ends up generating much better SSE code with
@@ -103,6 +106,7 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
LLVMTypes::FloatVectorPointerType = llvm::PointerType::get(LLVMTypes::FloatVectorType, 0);
LLVMTypes::DoubleVectorType =
llvm::VectorType::get(LLVMTypes::DoubleType, target.vectorWidth);
LLVMTypes::DoubleVectorPointerType = llvm::PointerType::get(LLVMTypes::DoubleVectorType, 0);
LLVMTypes::VoidPointerVectorType =
llvm::ArrayType::get(LLVMTypes::VoidPointerType, target.vectorWidth);

View File

@@ -61,6 +61,7 @@ struct LLVMTypes {
static const llvm::Type *FloatType;
static const llvm::Type *FloatPointerType;
static const llvm::Type *DoubleType;
static const llvm::Type *DoublePointerType;
static const llvm::VectorType *MaskType;
static const llvm::VectorType *BoolVectorType;
@@ -72,6 +73,7 @@ struct LLVMTypes {
static const llvm::VectorType *FloatVectorType;
static const llvm::Type *FloatVectorPointerType;
static const llvm::VectorType *DoubleVectorType;
static const llvm::Type *DoubleVectorPointerType;
static const llvm::ArrayType *VoidPointerVectorType;
};

View File

@@ -42,6 +42,7 @@
stdlib_core(8)
packed_load_and_store(8)
int8_16(8)
int64minmax(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
@@ -77,7 +78,7 @@ define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding
;; rounding floats
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
declare <4 x float> @llvm.x86.sse.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
@@ -141,6 +142,56 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin
ret float %rs
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
round4to8double(%0, 8)
}
;; Round a uniform double to the nearest integer using the scalar SSE4.1
;; round.sd instruction; mode 8 = 0b1000 = round-to-nearest (0b00) |
;; don't signal precision exceptions (0b1000).
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
    ; round.sd operates on <2 x double> vectors, so widen the scalar into
    ; lane 0 (upper lane left undef), round, and pull lane 0 back out.
    %xi = insertelement <2 x double> undef, double %0, i32 0
    %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
    %rs = extractelement <2 x double> %xr, i32 0
    ret double %rs
}
define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
round4to8double(%0, 9)
}
;; floor() of a uniform double via the scalar SSE4.1 round.sd instruction.
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
    ; see above for round_ss intrinsic discussion...
    %xi = insertelement <2 x double> undef, double %0, i32 0
    ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
    %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
    %rs = extractelement <2 x double> %xr, i32 0
    ret double %rs
}
define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
round4to8double(%0, 10)
}
;; ceil() of a uniform double via the scalar SSE4.1 round.sd instruction.
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
    ; see above for round_ss intrinsic discussion...
    %xi = insertelement <2 x double> undef, double %0, i32 0
    ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
    %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
    %rs = extractelement <2 x double> %xr, i32 0
    ret double %rs
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
@@ -318,11 +369,18 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
%call = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %call
}
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
;; Population count (number of set bits) of a 64-bit value, via the
;; llvm.ctpop.i64 intrinsic declared above.
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
    %call = call i64 @llvm.ctpop.i64(i64 %0)
    ret i64 %call
}
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
@@ -403,6 +461,81 @@ define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinli
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal double ops
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
;; Horizontal add of the 8 lanes of a varying double using AVX haddpd.
;; hadd.pd.256(a, b) = [a0+a1, b0+b1, a2+a3, b2+b3], so:
;;   sum0 = [x0+x1, x4+x5, x2+x3, x6+x7]
;;   sum1 = [x0+x1+x4+x5, x0+x1+x4+x5, x2+x3+x6+x7, x2+x3+x6+x7]
;; and the full reduction is sum1[0] + sum1[2].
;; (The previous code added sum0[0] + sum1[1] = 2*(x0+x1) + x4+x5, which
;; double-counts lanes 0-1 and drops lanes 2, 3, 6 and 7 entirely.)
define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline {
    %v0 = shufflevector <8 x double> %0, <8 x double> undef,
            <4 x i32> <i32 0, i32 1, i32 2, i32 3>
    %v1 = shufflevector <8 x double> %0, <8 x double> undef,
            <4 x i32> <i32 4, i32 5, i32 6, i32 7>
    %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
    %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
    %scalar1 = extractelement <4 x double> %sum1, i32 0
    %scalar2 = extractelement <4 x double> %sum1, i32 2
    %sum = fadd double %scalar1, %scalar2
    ret double %sum
}
define internal double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline {
reduce8(double, @__min_varying_double, @__min_uniform_double)
}
define internal double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline {
reduce8(double, @__max_varying_double, @__max_uniform_double)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int64 ops
;; Lane-wise add of two varying int64 values; used (with the uniform
;; variant below) by the reduce8() macro to build the add reduction.
define internal <8 x i64> @__add_varying_int64(<8 x i64>,
                                               <8 x i64>) nounwind readnone alwaysinline {
    %s = add <8 x i64> %0, %1
    ret <8 x i64> %s
}
;; Scalar add of two uniform int64 values, for the reduce8() macro.
define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
    %s = add i64 %0, %1
    ret i64 %s
}
define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline {
reduce8(i64, @__add_varying_int64, @__add_uniform_int64)
}
define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone alwaysinline {
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
}
define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint64 ops
define internal i64 @__reduce_add_uint64(<8 x i64> %v) nounwind readnone alwaysinline {
%r = call i64 @__reduce_add_int64(<8 x i64> %v)
ret i64 %r
}
define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unaligned loads/loads+broadcasts

View File

@@ -37,6 +37,7 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
int8_16(4)
int64minmax(4)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
@@ -227,6 +228,54 @@ define internal float @__min_uniform_float(float, float) nounwind readonly alway
ret float %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
ret <4 x double> %ret
}
define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
ret <4 x double> %ret
}
define internal double @__min_uniform_double(double, double) nounwind readnone {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
ret double %ret
}
define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <4 x double> %ret
}
define internal double @__max_uniform_double(double, double) nounwind readnone {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
@@ -279,6 +328,55 @@ define internal i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
}
define internal double @__reduce_add_double(<4 x double>) nounwind readnone {
%v0 = shufflevector <4 x double> %0, <4 x double> undef,
<2 x i32> <i32 0, i32 1>
%v1 = shufflevector <4 x double> %0, <4 x double> undef,
<2 x i32> <i32 2, i32 3>
%sum = fadd <2 x double> %v0, %v1
%e0 = extractelement <2 x double> %sum, i32 0
%e1 = extractelement <2 x double> %sum, i32 1
%m = fadd double %e0, %e1
ret double %m
}
define internal double @__reduce_min_double(<4 x double>) nounwind readnone {
reduce4(double, @__min_varying_double, @__min_uniform_double)
}
define internal double @__reduce_max_double(<4 x double>) nounwind readnone {
reduce4(double, @__max_varying_double, @__max_uniform_double)
}
define internal i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
%v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
<2 x i32> <i32 0, i32 1>
%v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
<2 x i32> <i32 2, i32 3>
%sum = add <2 x i64> %v0, %v1
%e0 = extractelement <2 x i64> %sum, i32 0
%e1 = extractelement <2 x i64> %sum, i32 1
%m = add i64 %e0, %e1
ret i64 %m
}
define internal i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
}
define internal i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
}
define internal i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
@@ -389,53 +487,3 @@ gen_gather(4, i32)
gen_gather(4, i64)
gen_scatter(4, i32)
gen_scatter(4, i64)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
ret <4 x double> %ret
}
define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
ret <4 x double> %ret
}
define internal double @__min_uniform_double(double, double) nounwind readnone {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
ret double %ret
}
define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
ret <4 x double> %ret
}
define internal double @__max_uniform_double(double, double) nounwind readnone {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
ret double %ret
}

View File

@@ -152,6 +152,40 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin
ret float %binop.i
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
declare double @round(double)
declare double @floor(double)
declare double @ceil(double)
define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
unary1to4(double, @round)
}
;; Round a uniform double to the nearest integer by deferring to the
;; externally-declared round() library function (declared above).
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
    %r = call double @round(double %0)
    ret double %r
}
define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
unary1to4(double, @floor)
}
;; floor() of a uniform double via the external floor() library function.
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
    %r = call double @floor(double %0)
    ret double %r
}
define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
unary1to4(double, @ceil)
}
;; ceil() of a uniform double via the external ceil() library function.
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
    %r = call double @ceil(double %0)
    ret double %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; min/max
@@ -252,7 +286,7 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli
; it does generate non-POPCNT code and in particular better code than
; the below does.)
define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
entry:
br label %loop
@@ -269,6 +303,16 @@ exit:
ret i32 %newcount
}
;; Population count of a 64-bit value, computed as the sum of the
;; popcounts of its two 32-bit halves (this target has no ctpop.i64 path;
;; the sum of the halves' counts equals the count of the whole).
;; NOTE(review): this variant returns i32 while the SSE4/AVX targets
;; define __popcnt_int64 returning i64 -- confirm the stdlib binding
;; tolerates the differing return type across targets.
define internal i32 @__popcnt_int64(i64) nounwind readnone alwaysinline {
    ; split the i64 into two i32 lanes
    %vec = bitcast i64 %0 to <2 x i32>
    %v0 = extractelement <2 x i32> %vec, i32 0
    %v1 = extractelement <2 x i32> %vec, i32 1
    %c0 = call i32 @__popcnt_int32(i32 %v0)
    %c1 = call i32 @__popcnt_int32(i32 %v1)
    %sum = add i32 %c0, %c1
    ret i32 %sum
}
define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
%v1 = shufflevector <4 x float> %v, <4 x float> undef,

View File

@@ -40,7 +40,7 @@ packed_load_and_store(4)
include(`stdlib-sse.ll')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math
;; rounding floats
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
@@ -106,7 +106,52 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; integer min/max
;; rounding doubles
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
round2to4double(%0, 8)
}
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
%xi = insertelement <2 x double> undef, double %0, i32 0
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs
}
define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
round2to4double(%0, 9)
}
;; floor() of a uniform double via the scalar SSE4.1 round.sd instruction.
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
    ; see above for round_ss intrinsic discussion...
    %xi = insertelement <2 x double> undef, double %0, i32 0
    ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
    %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
    %rs = extractelement <2 x double> %xr, i32 0
    ret double %rs
}
define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
round2to4double(%0, 10)
}
;; ceil() of a uniform double via the scalar SSE4.1 round.sd instruction.
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
    ; see above for round_ss intrinsic discussion...
    %xi = insertelement <2 x double> undef, double %0, i32 0
    ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
    %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
    %rs = extractelement <2 x double> %xr, i32 0
    ret double %rs
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 min/max
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
@@ -163,11 +208,18 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
%call = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %call
}
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
%call = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %call
}
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
define internal float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {

View File

@@ -39,6 +39,7 @@
stdlib_core(8)
packed_load_and_store(8)
int8_16(8)
int64minmax(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
@@ -258,7 +259,7 @@ define internal float @__min_uniform_float(float, float) nounwind readonly alway
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
;; int32 min/max
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
@@ -380,6 +381,60 @@ define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinli
reduce8by4(i32, @llvm.x86.sse41.pmaxud, @__max_uniform_uint32)
}
define internal <4 x double> @__add_varying_double(<4 x double>,
<4 x double>) nounwind readnone alwaysinline {
%r = fadd <4 x double> %0, %1
ret <4 x double> %r
}
define internal double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
%r = fadd double %0, %1
ret double %r
}
define internal double @__reduce_add_double(<8 x double>) nounwind readnone {
reduce8by4(double, @__add_varying_double, @__add_uniform_double)
}
define internal double @__reduce_min_double(<8 x double>) nounwind readnone {
reduce8(double, @__min_varying_double, @__min_uniform_double)
}
define internal double @__reduce_max_double(<8 x double>) nounwind readnone {
reduce8(double, @__max_varying_double, @__max_uniform_double)
}
define internal <4 x i64> @__add_varying_int64(<4 x i64>,
<4 x i64>) nounwind readnone alwaysinline {
%r = add <4 x i64> %0, %1
ret <4 x i64> %r
}
define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
%r = add i64 %0, %1
ret i64 %r
}
define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64)
}
define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
}
define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
}
define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; masked store
@@ -465,7 +520,7 @@ gen_scatter(8, i32)
gen_scatter(8, i64)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; math
;; float rounding
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
@@ -526,16 +581,68 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin
ret float %rs
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding doubles
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
round2to8double(%0, 8)
}
define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
%xi = insertelement <2 x double> undef, double %0, i32 0
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs
}
define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
round2to8double(%0, 9)
}
define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0
; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs
}
define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
round2to8double(%0, 10)
}
define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0
; roundps, round up 0b10 | don't signal precision exceptions 0b1000 = 10
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
%rs = extractelement <2 x double> %xr, i32 0
ret double %rs
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions
declare i32 @llvm.ctpop.i32(i32) nounwind readnone
define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
%call = call i32 @llvm.ctpop.i32(i32 %0)
ret i32 %call
}
declare i64 @llvm.ctpop.i64(i64) nounwind readnone
define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
%call = call i64 @llvm.ctpop.i64(i64 %0)
ret i64 %call
}
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {

View File

@@ -147,30 +147,57 @@ static inline int64 shuffle(int64 v0, int64 v1, int i) {
// x[i]
static inline uniform float extract(float x, uniform int i) {
return __extract(x, i);
return floatbits(__extract_int32((int)intbits(x), i));
}
static inline uniform int extract(int x, uniform int i) {
return __extract_int32(x, i);
}
static inline uniform unsigned int extract(unsigned int x, uniform int i) {
return __extract_int32(x, (unsigned int)i);
}
static inline uniform double extract(double x, uniform int i) {
return doublebits(__extract_int64((int64)intbits(x), i));
}
static inline uniform int64 extract(int64 x, uniform int i) {
return __extract_int64(x, i);
}
static inline uniform unsigned int64 extract(unsigned int64 x, uniform int i) {
return __extract_int64(x, (unsigned int)i);
}
// x[i] = v
static inline float insert(float x, uniform int i, uniform float v) {
return __insert(x, i, v);
}
static inline uniform int extract(int x, uniform int i) {
return intbits(extract(floatbits(x), i));
return floatbits(__insert_int32((int)intbits(x), i, (int)intbits(v)));
}
static inline int insert(int x, uniform int i, uniform int v) {
return intbits(insert(floatbits(x), i, floatbits(v)));
return __insert_int32(x, i, v);
}
static inline uniform unsigned int extract(unsigned int x, uniform int i) {
return intbits(extract(floatbits(x), i));
static inline unsigned int insert(unsigned int x, uniform int i,
uniform unsigned int v) {
return __insert_int32(x, (unsigned int)i, v);
}
static inline unsigned int insert(unsigned int x, uniform int i, uniform unsigned int v) {
return intbits(insert(floatbits(x), i, floatbits(v)));
static inline double insert(double x, uniform int i, uniform double v) {
return doublebits(__insert_int64((int64)intbits(x), i, (int64)intbits(v)));
}
static inline int64 insert(int64 x, uniform int i, uniform int64 v) {
return __insert_int64(x, i, v);
}
static inline unsigned int64 insert(unsigned int64 x, uniform int i,
uniform unsigned int64 v) {
return __insert_int64(x, (unsigned int)i, v);
}
static inline uniform bool any(bool v) {
// We only care about whether "any" is true for the active program instances,
// so we have to make v with the current program mask.
@@ -185,20 +212,30 @@ static inline uniform bool all(bool v) {
}
static inline uniform int popcnt(uniform int v) {
return __popcnt(v);
return __popcnt_int32(v);
}
// Count the set bits of a uniform int64.  The builtin returns a 64-bit
// count, which always fits in (and is narrowed to) an int32 (max 64).
static inline uniform int popcnt(uniform int64 v) {
    return (int32)__popcnt_int64(v);
}
static inline int popcnt(int v) {
int r;
uniform int i;
for (i = 0; i < programCount; ++i)
for (uniform int i = 0; i < programCount; ++i)
r = insert(r, i, popcnt(extract(v, i)));
return (r & __mask);
}
// Per-lane popcount of a varying int64, computed by scalarizing over the
// gang with extract/insert and the uniform popcnt() overload.
static inline int popcnt(int64 v) {
    int r;
    for (uniform int i = 0; i < programCount; ++i)
        r = insert(r, i, popcnt(extract(v, i)));
    // mask off lanes where execution is inactive
    return (r & __mask);
}
static inline uniform int popcnt(bool v) {
// As with any() and all(), only count across the active lanes
return __popcnt(__movmsk(v & __mask));
return __popcnt_int32(__movmsk(v & __mask));
}
static inline uniform int lanemask() {
@@ -270,6 +307,64 @@ static inline uniform unsigned int reduce_max(unsigned int v) {
return __reduce_max_uint32(__mask ? v : 0);
}
// Horizontal add of the active lanes of a varying double.
static inline uniform double reduce_add(double x) {
    // zero the lanes where the mask is off so they don't contribute
    return __reduce_add_double(__mask ? x : 0.);
}
// Horizontal min of the active lanes of a varying double; inactive lanes
// are set to +infinity so they can't win the min.
static inline uniform double reduce_min(double v) {
    int64 iflt_max = 0x7ff0000000000000; // infinity
    // Must use __doublebits_varying_int64, not doublebits(), since with the
    // latter the current mask enters into the returned result...
    return __reduce_min_double(__mask ? v : __doublebits_varying_int64(iflt_max));
}
// Horizontal max of the active lanes of a varying double; inactive lanes
// are set to -infinity so they can't win the max.
static inline uniform double reduce_max(double v) {
    const uniform int64 iflt_neg_max = 0xfff0000000000000; // -infinity
    // Must use __doublebits_varying_int64, not doublebits(), since with the
    // latter the current mask enters into the returned result...
    return __reduce_max_double(__mask ? v : __doublebits_varying_int64(iflt_neg_max));
}
// Horizontal add of the active lanes of a varying int64.
static inline uniform int64 reduce_add(int64 x) {
    // Zero out the values for lanes that aren't running.  (Mask lanes are
    // all-ones when active, so the AND leaves active values unchanged.)
    return __reduce_add_int64(x & (int64)__mask);
}
// Horizontal min of the active lanes of a varying int64.
static inline uniform int64 reduce_min(int64 v) {
    // Set values for non-running lanes to the maximum integer value so
    // they don't affect the result.
    int64 int_max = 0x7fffffffffffffff;
    return __reduce_min_int64(__mask ? v : int_max);
}
// Horizontal max of the active lanes of a varying int64.
static inline uniform int64 reduce_max(int64 v) {
    // Set values for non-running lanes to the minimum integer value so
    // they don't affect the result.
    int64 int_min = 0x8000000000000000;
    return __reduce_max_int64(__mask ? v : int_min);
}
// Horizontal add of the active lanes of a varying unsigned int64.
// (Reuses the signed add reduction: wrap-around addition is identical
// for signed and unsigned 64-bit values.)
static inline uniform unsigned int64 reduce_add(unsigned int64 x) {
    // Set values for non-running lanes to zero so they don't affect the
    // result.
    return __reduce_add_int64(x & (int64)__mask);
}
// Horizontal min of the active lanes of a varying unsigned int64.
static inline uniform unsigned int64 reduce_min(unsigned int64 v) {
    // Set values for non-running lanes to the maximum unsigned integer
    // value so they don't affect the result.
    unsigned int64 uint_max = 0xffffffffffffffff;
    return __reduce_min_uint64(__mask ? v : uint_max);
}
// Horizontal max of the active lanes of a varying unsigned int64.
static inline uniform unsigned int64 reduce_max(unsigned int64 v) {
    // Set values for non-running lanes to zero (the unsigned minimum) so
    // they don't affect the result.
    return __reduce_max_uint64(__mask ? v : 0);
}
///////////////////////////////////////////////////////////////////////////
// packed load, store
@@ -425,14 +520,37 @@ static inline uniform float abs(uniform float a) {
return floatbits(i);
}
// Absolute value of a varying double, computed by bit manipulation of the
// IEEE representation.
static inline double abs(double a) {
    // zeroing the high bit clears the sign
    unsigned int64 i = intbits(a);
    i &= 0x7fffffffffffffff;
    return doublebits(i);
}
static inline uniform double abs(uniform double a) {
    // Absolute value of a uniform double: mask off the IEEE sign bit
    // (bit 63) and reinterpret the remaining bits as a double.
    uniform unsigned int64 bits = intbits(a);
    return doublebits(bits & 0x7fffffffffffffff);
}
static inline unsigned int signbits(float x) {
unsigned int i = intbits(x);
return (i & 0x80000000u);
return (i & 0x80000000);
}
static inline uniform unsigned int signbits(uniform float x) {
uniform unsigned int i = intbits(x);
return (i & 0x80000000u);
return (i & 0x80000000);
}
// Return just the IEEE sign bit (bit 63) of each lane's double value;
// all other bits of the result are zero.
static inline unsigned int64 signbits(double x) {
    return (intbits(x) & 0x8000000000000000);
}
// Uniform variant: the sign bit of a single double value.
static inline uniform unsigned int64 signbits(uniform double x) {
    return (intbits(x) & 0x8000000000000000);
}
static inline float round(float x) {
@@ -443,6 +561,14 @@ static inline uniform float round(uniform float x) {
return __round_uniform_float(x);
}
// Round a varying double to the nearest integer value.
static inline double round(double x) {
    return __round_varying_double(x);
}
// Round a uniform double to the nearest integer value.
static inline uniform double round(uniform double x) {
    return __round_uniform_double(x);
}
static inline float floor(float x) {
return __floor_varying_float(x);
}
@@ -451,6 +577,14 @@ static inline uniform float floor(uniform float x) {
return __floor_uniform_float(x);
}
// Largest integer value not greater than x (varying double).
static inline double floor(double x) {
    return __floor_varying_double(x);
}
// Largest integer value not greater than x (uniform double).
static inline uniform double floor(uniform double x) {
    return __floor_uniform_double(x);
}
static inline float ceil(float x) {
return __ceil_varying_float(x);
}
@@ -459,6 +593,14 @@ static inline uniform float ceil(uniform float x) {
return __ceil_uniform_float(x);
}
// Smallest integer value not less than x (varying double).
static inline double ceil(double x) {
    return __ceil_varying_double(x);
}
// Smallest integer value not less than x (uniform double).
static inline uniform double ceil(uniform double x) {
    return __ceil_uniform_double(x);
}
static inline float rcp(float v) {
return __rcp_varying_float(v);
}
@@ -467,14 +609,6 @@ static inline uniform float rcp(uniform float v) {
return __rcp_uniform_float(v);
}
static inline float sqrt(float v) {
return __sqrt_varying_float(v);
}
static inline uniform float sqrt(uniform float v) {
return __sqrt_uniform_float(v);
}
static inline float min(float a, float b) {
return __min_varying_float(a, b);
}
@@ -483,6 +617,14 @@ static inline uniform float min(uniform float a, uniform float b) {
return __min_uniform_float(a, b);
}
// Element-wise minimum of two varying doubles.
static inline double min(double a, double b) {
    return __min_varying_double(a, b);
}
// Minimum of two uniform doubles.
static inline uniform double min(uniform double a, uniform double b) {
    return __min_uniform_double(a, b);
}
static inline float max(float a, float b) {
return __max_varying_float(a, b);
}
@@ -491,6 +633,14 @@ static inline uniform float max(uniform float a, uniform float b) {
return __max_uniform_float(a, b);
}
// Element-wise maximum of two varying doubles.
static inline double max(double a, double b) {
    return __max_varying_double(a, b);
}
// Maximum of two uniform doubles.
static inline uniform double max(uniform double a, uniform double b) {
    return __max_uniform_double(a, b);
}
static inline unsigned int min(unsigned int a, unsigned int b) {
return __min_varying_uint32(a, b);
}
@@ -523,6 +673,38 @@ static inline uniform int max(uniform int a, uniform int b) {
return __max_uniform_int32(a, b);
}
// 64-bit integer min and max, in unsigned and signed variants, each for
// both varying and uniform operands.  All forward to the corresponding
// target builtins.
static inline unsigned int64 min(unsigned int64 a, unsigned int64 b) {
    return __min_varying_uint64(a, b);
}
static inline uniform unsigned int64 min(uniform unsigned int64 a, uniform unsigned int64 b) {
    return __min_uniform_uint64(a, b);
}
static inline unsigned int64 max(unsigned int64 a, unsigned int64 b) {
    return __max_varying_uint64(a, b);
}
static inline uniform unsigned int64 max(uniform unsigned int64 a, uniform unsigned int64 b) {
    return __max_uniform_uint64(a, b);
}
static inline int64 min(int64 a, int64 b) {
    return __min_varying_int64(a, b);
}
static inline uniform int64 min(uniform int64 a, uniform int64 b) {
    return __min_uniform_int64(a, b);
}
static inline int64 max(int64 a, int64 b) {
    return __max_varying_int64(a, b);
}
static inline uniform int64 max(uniform int64 a, uniform int64 b) {
    return __max_uniform_int64(a, b);
}
static inline float clamp(float v, float low, float high) {
return min(max(v, low), high);
}
@@ -536,7 +718,16 @@ static inline unsigned int clamp(unsigned int v, unsigned int low, unsigned int
}
static inline uniform unsigned int clamp(uniform unsigned int v, uniform unsigned int low,
uniform unsigned int high) {
uniform unsigned int high) {
return min(max(v, low), high);
}
// Clamp v to the range [low, high] (unsigned 64-bit, varying and uniform).
static inline unsigned int64 clamp(unsigned int64 v, unsigned int64 low, unsigned int64 high) {
    return min(max(v, low), high);
}
static inline uniform unsigned int64 clamp(uniform unsigned int64 v, uniform unsigned int64 low,
                                           uniform unsigned int64 high) {
    return min(max(v, low), high);
}
@@ -548,8 +739,24 @@ static inline uniform int clamp(uniform int v, uniform int low, uniform int high
return min(max(v, low), high);
}
// Clamp v to the range [low, high] (signed 64-bit, varying and uniform).
static inline int64 clamp(int64 v, int64 low, int64 high) {
    return min(max(v, low), high);
}
static inline uniform int64 clamp(uniform int64 v, uniform int64 low, uniform int64 high) {
    return min(max(v, low), high);
}
///////////////////////////////////////////////////////////////////////////
// Transcendentals
// Transcendentals (float precision)
// Single-precision square root (varying and uniform).
static inline float sqrt(float v) {
    return __sqrt_varying_float(v);
}
static inline uniform float sqrt(uniform float v) {
    return __sqrt_uniform_float(v);
}
static inline float rsqrt(float v) {
return __rsqrt_varying_float(v);
@@ -612,7 +819,7 @@ static inline float sin(float x_full) {
for (uniform int i = 0; i < programCount; ++i) {
if ((mask & (1 << i)) == 0)
continue;
uniform float r = __stdlib_sin(extract(x_full, i));
uniform float r = __stdlib_sinf(extract(x_full, i));
ret = insert(ret, i, r);
}
return ret;
@@ -669,7 +876,7 @@ static inline float sin(float x_full) {
static inline uniform float sin(uniform float x_full) {
if (__math_lib == __math_lib_system ||
__math_lib == __math_lib_svml) {
return __stdlib_sin(x_full);
return __stdlib_sinf(x_full);
}
else if (__math_lib == __math_lib_ispc ||
__math_lib == __math_lib_ispc_fast) {
@@ -741,7 +948,7 @@ static inline float cos(float x_full) {
for (uniform int i = 0; i < programCount; ++i) {
if ((mask & (1 << i)) == 0)
continue;
uniform float r = __stdlib_cos(extract(x_full, i));
uniform float r = __stdlib_cosf(extract(x_full, i));
ret = insert(ret, i, r);
}
return ret;
@@ -797,7 +1004,7 @@ static inline float cos(float x_full) {
static inline uniform float cos(uniform float x_full) {
if (__math_lib == __math_lib_system ||
__math_lib == __math_lib_svml) {
return __stdlib_cos(x_full);
return __stdlib_cosf(x_full);
}
else if (__math_lib == __math_lib_ispc ||
__math_lib == __math_lib_ispc_fast) {
@@ -868,7 +1075,7 @@ static inline void sincos(float x_full, reference float sin_result, reference fl
if ((mask & (1 << i)) == 0)
continue;
uniform float s, c;
__stdlib_sincos(extract(x_full, i), s, c);
__stdlib_sincosf(extract(x_full, i), s, c);
sin_result = insert(sin_result, i, s);
cos_result = insert(cos_result, i, c);
}
@@ -930,10 +1137,10 @@ static inline void sincos(float x_full, reference float sin_result, reference fl
static inline void sincos(uniform float x_full, reference uniform float sin_result,
reference uniform float cos_result) {
reference uniform float cos_result) {
if (__math_lib == __math_lib_system ||
__math_lib == __math_lib_svml) {
__stdlib_sincos(x_full, sin_result, cos_result);
__stdlib_sincosf(x_full, sin_result, cos_result);
}
else if (__math_lib == __math_lib_ispc ||
__math_lib == __math_lib_ispc_fast) {
@@ -1001,7 +1208,7 @@ static inline float tan(float x_full) {
for (uniform int i = 0; i < programCount; ++i) {
if ((mask & (1 << i)) == 0)
continue;
uniform float r = __stdlib_tan(extract(x_full, i));
uniform float r = __stdlib_tanf(extract(x_full, i));
ret = insert(ret, i, r);
}
return ret;
@@ -1075,7 +1282,7 @@ static inline float tan(float x_full) {
static inline uniform float tan(uniform float x_full) {
if (__math_lib == __math_lib_system ||
__math_lib == __math_lib_svml) {
return __stdlib_tan(x_full);
return __stdlib_tanf(x_full);
}
else if (__math_lib == __math_lib_ispc ||
__math_lib == __math_lib_ispc_fast) {
@@ -1153,7 +1360,7 @@ static inline float atan(float x_full) {
for (uniform int i = 0; i < programCount; ++i) {
if ((mask & (1 << i)) == 0)
continue;
uniform float r = __stdlib_atan(extract(x_full, i));
uniform float r = __stdlib_atanf(extract(x_full, i));
ret = insert(ret, i, r);
}
return ret;
@@ -1199,7 +1406,7 @@ static inline float atan(float x_full) {
static inline uniform float atan(uniform float x_full) {
if (__math_lib == __math_lib_system ||
__math_lib == __math_lib_svml) {
return __stdlib_atan(x_full);
return __stdlib_atanf(x_full);
}
else if (__math_lib == __math_lib_ispc ||
__math_lib == __math_lib_ispc_fast) {
@@ -1249,7 +1456,7 @@ static inline float atan2(float y, float x) {
for (uniform int i = 0; i < programCount; ++i) {
if ((mask & (1 << i)) == 0)
continue;
uniform float r = __stdlib_atan2(extract(y, i), extract(x, i));
uniform float r = __stdlib_atan2f(extract(y, i), extract(x, i));
ret = insert(ret, i, r);
}
return ret;
@@ -1283,7 +1490,7 @@ static inline float atan2(float y, float x) {
static inline uniform float atan2(uniform float y, uniform float x) {
if (__math_lib == __math_lib_system ||
__math_lib == __math_lib_svml) {
return __stdlib_atan2(y, x);
return __stdlib_atan2f(y, x);
}
else if (__math_lib == __math_lib_ispc ||
__math_lib == __math_lib_ispc_fast) {
@@ -1310,7 +1517,7 @@ static inline float exp(float x_full) {
for (uniform int i = 0; i < programCount; ++i) {
if ((mask & (1 << i)) == 0)
continue;
uniform float r = __stdlib_exp(extract(x_full, i));
uniform float r = __stdlib_expf(extract(x_full, i));
ret = insert(ret, i, r);
}
return ret;
@@ -1384,7 +1591,7 @@ static inline float exp(float x_full) {
static inline uniform float exp(uniform float x_full) {
if (__math_lib == __math_lib_system ||
__math_lib == __math_lib_svml) {
return __stdlib_exp(x_full);
return __stdlib_expf(x_full);
}
else if (__math_lib == __math_lib_ispc_fast) {
uniform float z = floor(1.44269504088896341f * x_full + 0.5f);
@@ -1485,7 +1692,7 @@ static inline void __range_reduce_log(float input, reference float reduced, refe
static inline void __range_reduce_log(uniform float input, reference uniform float reduced,
reference uniform int exponent) {
reference uniform int exponent) {
uniform int int_version = intbits(input);
static const uniform int nonexponent_mask = 0x807FFFFF;
@@ -1509,7 +1716,7 @@ static inline float log(float x_full) {
for (uniform int i = 0; i < programCount; ++i) {
if ((mask & (1 << i)) == 0)
continue;
uniform float r = __stdlib_log(extract(x_full, i));
uniform float r = __stdlib_logf(extract(x_full, i));
ret = insert(ret, i, r);
}
return ret;
@@ -1592,7 +1799,7 @@ static inline float log(float x_full) {
static inline uniform float log(uniform float x_full) {
if (__math_lib == __math_lib_system ||
__math_lib == __math_lib_svml) {
return __stdlib_log(x_full);
return __stdlib_logf(x_full);
}
else if (__math_lib == __math_lib_ispc_fast) {
uniform int e;
@@ -1679,7 +1886,7 @@ static inline float pow(float a, float b) {
for (uniform int i = 0; i < programCount; ++i) {
if ((mask & (1 << i)) == 0)
continue;
uniform float r = __stdlib_pow(extract(a, i), extract(b, i));
uniform float r = __stdlib_powf(extract(a, i), extract(b, i));
ret = insert(ret, i, r);
}
return ret;
@@ -1693,7 +1900,7 @@ static inline float pow(float a, float b) {
static inline uniform float pow(uniform float a, uniform float b) {
if (__math_lib == __math_lib_system ||
__math_lib == __math_lib_svml) {
return __stdlib_pow(a, b);
return __stdlib_powf(a, b);
}
else if (__math_lib == __math_lib_ispc ||
__math_lib == __math_lib_ispc_fast) {
@@ -1701,6 +1908,273 @@ static inline uniform float pow(uniform float a, uniform float b) {
}
}
///////////////////////////////////////////////////////////////////////////
// Transcendentals (double precision)
// Double-precision square root (varying and uniform).
static inline double sqrt(double v) {
    return __sqrt_varying_double(v);
}
static inline uniform double sqrt(uniform double v) {
    return __sqrt_uniform_double(v);
}
// Compute x * 2^n by adding n directly to the biased exponent field of the
// IEEE double representation.
// NOTE(review): no handling of exponent overflow/underflow or of
// zero/denormal/inf/NaN inputs -- presumably callers only pass values for
// which the adjusted exponent stays in range; confirm.
static inline double ldexp(double x, int n) {
    unsigned int64 ex = 0x7ff0000000000000;
    unsigned int64 ix = intbits(x);
    ex &= ix;                      // extract the biased exponent bits
    ix = ix & ~0x7ff0000000000000; // clear exponent
    int64 n64 = ((int64)n << 52) + ex;
    ix |= n64; // insert new exponent
    return doublebits(ix);
}
// Compute x * 2^n for uniform values by adding n directly to the biased
// exponent field of the IEEE double representation (same technique as the
// varying ldexp() above; no overflow/underflow handling).
static inline uniform double ldexp(uniform double x, uniform int n) {
    uniform unsigned int64 ex = 0x7ff0000000000000;
    uniform unsigned int64 ix = intbits(x);
    ex &= ix;                      // extract the biased exponent bits
    ix = ix & ~0x7ff0000000000000; // clear exponent
    // Fix: the shifted exponent must be held in a 64-bit value; the
    // previous "uniform int" declaration truncated ((int64)n << 52) + ex
    // to 32 bits, which always yielded a zero exponent.
    uniform int64 n64 = ((int64)n << 52) + ex;
    ix |= n64; // insert new exponent
    return doublebits(ix);
}
// Break x into a normalized fraction in [0.5, 1) (the return value) and an
// integral power of two (stored into pw2), such that x == ret * 2^pw2.
// NOTE(review): assumes a normal number; zero/denormal/inf/NaN inputs are
// not special-cased -- confirm callers are ok with that.
static inline double frexp(double x, reference int pw2) {
    unsigned int64 ex = 0x7ff0000000000000; // exponent mask
    unsigned int64 ix = intbits(x);
    ex &= ix;
    ix &= ~0x7ff0000000000000; // clear exponent
    pw2 = (int)(ex >> 52) - 1022; // compute exponent
    ix |= 0x3fe0000000000000; // insert exponent +1 in x
    return doublebits(ix);
}
// Uniform variant of frexp(); identical bit manipulation.
static inline uniform double frexp(uniform double x, reference uniform int pw2) {
    uniform unsigned int64 ex = 0x7ff0000000000000; // exponent mask
    uniform unsigned int64 ix = intbits(x);
    ex &= ix;
    ix &= ~0x7ff0000000000000; // clear exponent
    pw2 = (int)(ex >> 52) - 1022; // compute exponent
    ix |= 0x3fe0000000000000; // insert exponent +1 in x
    return doublebits(ix);
}
// Double-precision sine.  With the "fast" math library the computation is
// done in single precision; otherwise the system library's sin() is called
// once per running program instance.
static inline double sin(double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return sin((float)x);
    else {
        // NOTE(review): ret is uninitialized, so lanes that aren't running
        // hold undefined values -- presumably fine since those lanes'
        // results are ignored under the execution mask; confirm.
        double ret;
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            // skip lanes that aren't currently executing
            if ((mask & (1 << i)) == 0)
                continue;
            uniform double r = __stdlib_sin(extract(x, i));
            ret = insert(ret, i, r);
        }
        return ret;
    }
}
// Uniform double-precision sine.
static inline uniform double sin(uniform double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return sin((float)x);
    else
        return __stdlib_sin(x);
}
// Double-precision cosine: single precision with the "fast" math library,
// otherwise one system-library cos() call per running program instance.
static inline double cos(double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return cos((float)x);
    else {
        double ret;
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            // skip lanes that aren't currently executing
            if ((mask & (1 << i)) == 0)
                continue;
            uniform double r = __stdlib_cos(extract(x, i));
            ret = insert(ret, i, r);
        }
        return ret;
    }
}
// Uniform double-precision cosine.
static inline uniform double cos(uniform double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return cos((float)x);
    else
        return __stdlib_cos(x);
}
// Simultaneously compute sine and cosine of x, storing them into
// sin_result and cos_result.  Falls back to single precision with the
// "fast" math library; otherwise calls the system sincos() per lane.
static inline void sincos(double x, reference double sin_result,
                          reference double cos_result) {
    if (__math_lib == __math_lib_ispc_fast) {
        float sr, cr;
        sincos((float)x, sr, cr);
        sin_result = sr;
        cos_result = cr;
    }
    else {
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            uniform double sr, cr;
            // skip lanes that aren't currently executing
            if ((mask & (1 << i)) == 0)
                continue;
            __stdlib_sincos(extract(x, i), sr, cr);
            sin_result = insert(sin_result, i, sr);
            cos_result = insert(cos_result, i, cr);
        }
    }
}
// Uniform variant of double-precision sincos().
static inline void sincos(uniform double x, reference uniform double sin_result,
                          reference uniform double cos_result) {
    if (__math_lib == __math_lib_ispc_fast) {
        uniform float sr, cr;
        sincos((uniform float)x, sr, cr);
        sin_result = sr;
        cos_result = cr;
    }
    else
        __stdlib_sincos(x, sin_result, cos_result);
}
// Double-precision tangent: single precision with the "fast" math library,
// otherwise one system-library tan() call per running program instance.
static inline double tan(double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return tan((float)x);
    else {
        double ret;
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            // skip lanes that aren't currently executing
            if ((mask & (1 << i)) == 0)
                continue;
            uniform double r = __stdlib_tan(extract(x, i));
            ret = insert(ret, i, r);
        }
        return ret;
    }
}
// Uniform double-precision tangent.
static inline uniform double tan(uniform double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return tan((float)x);
    else
        return __stdlib_tan(x);
}
// Double-precision arctangent: single precision with the "fast" math
// library, otherwise one system-library atan() call per running instance.
static inline double atan(double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return atan((float)x);
    else {
        double ret;
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            // skip lanes that aren't currently executing
            if ((mask & (1 << i)) == 0)
                continue;
            uniform double r = __stdlib_atan(extract(x, i));
            ret = insert(ret, i, r);
        }
        return ret;
    }
}
// Uniform double-precision arctangent.
static inline uniform double atan(uniform double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return atan((float)x);
    else
        return __stdlib_atan(x);
}
// Double-precision two-argument arctangent of y/x: single precision with
// the "fast" math library, otherwise one atan2() call per running instance.
static inline double atan2(double y, double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return atan2((float)y, (float)x);
    else {
        double ret;
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            // skip lanes that aren't currently executing
            if ((mask & (1 << i)) == 0)
                continue;
            uniform double r = __stdlib_atan2(extract(y, i), extract(x, i));
            ret = insert(ret, i, r);
        }
        return ret;
    }
}
// Uniform double-precision two-argument arctangent.
static inline uniform double atan2(uniform double y, uniform double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return atan2((float)y, (float)x);
    else
        return __stdlib_atan2(y, x);
}
// Double-precision exponential: single precision with the "fast" math
// library, otherwise one system-library exp() call per running instance.
static inline double exp(double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return exp((float)x);
    else {
        double ret;
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            // skip lanes that aren't currently executing
            if ((mask & (1 << i)) == 0)
                continue;
            uniform double r = __stdlib_exp(extract(x, i));
            ret = insert(ret, i, r);
        }
        return ret;
    }
}
// Uniform double-precision exponential.
static inline uniform double exp(uniform double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return exp((float)x);
    else
        return __stdlib_exp(x);
}
// Double-precision natural logarithm: single precision with the "fast"
// math library, otherwise one system-library log() call per running
// instance.
static inline double log(double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return log((float)x);
    else {
        double ret;
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            // skip lanes that aren't currently executing
            if ((mask & (1 << i)) == 0)
                continue;
            uniform double r = __stdlib_log(extract(x, i));
            ret = insert(ret, i, r);
        }
        return ret;
    }
}
// Uniform double-precision natural logarithm.
static inline uniform double log(uniform double x) {
    if (__math_lib == __math_lib_ispc_fast)
        return log((float)x);
    else
        return __stdlib_log(x);
}
// Double-precision a raised to the power b: single precision with the
// "fast" math library, otherwise one pow() call per running instance.
static inline double pow(double a, double b) {
    if (__math_lib == __math_lib_ispc_fast)
        return pow((float)a, (float)b);
    else {
        double ret;
        uniform int mask = lanemask();
        for (uniform int i = 0; i < programCount; ++i) {
            // skip lanes that aren't currently executing
            if ((mask & (1 << i)) == 0)
                continue;
            uniform double r = __stdlib_pow(extract(a, i), extract(b, i));
            ret = insert(ret, i, r);
        }
        return ret;
    }
}
// Uniform double-precision power function.
static inline uniform double pow(uniform double a, uniform double b) {
    if (__math_lib == __math_lib_ispc_fast)
        return pow((float)a, (float)b);
    else
        return __stdlib_pow(a, b);
}
///////////////////////////////////////////////////////////////////////////
// RNG stuff
@@ -1709,7 +2183,7 @@ struct RNGState {
unsigned int z1, z2, z3, z4;
};
static inline unsigned int random(reference uniform RNGState state)
static inline unsigned int random(reference RNGState state)
{
unsigned int b;
@@ -1724,14 +2198,14 @@ static inline unsigned int random(reference uniform RNGState state)
return (state.z1 ^ state.z2 ^ state.z3 ^ state.z4);
}
static inline float frandom(reference uniform RNGState state)
static inline float frandom(reference RNGState state)
{
return ((int)(random(state) & ((1<<24)-1))) / (float)(1 << 24);
}
static inline uniform unsigned int __seed4(reference uniform RNGState state,
uniform int start,
uniform unsigned int seed) {
static inline uniform unsigned int __seed4(reference RNGState state,
uniform int start,
uniform unsigned int seed) {
uniform unsigned int c1 = 0xf0f0f0f0;
uniform unsigned int c2 = 0x0f0f0f0f;

207
stdlib.m4
View File

@@ -136,6 +136,26 @@ define(`reduce8by4', `
)
;; Apply a unary function to the 4-vector in %0, return the vector result.
;; $1: scalar type of result
;; $2: name of scalar function to call
define(`unary1to4', `
%v_0 = extractelement <4 x $1> %0, i32 0
%r_0 = call $1 $2($1 %v_0)
%ret_0 = insertelement <4 x $1> undef, $1 %r_0, i32 0
%v_1 = extractelement <4 x $1> %0, i32 1
%r_1 = call $1 $2($1 %v_1)
%ret_1 = insertelement <4 x $1> %ret_0, $1 %r_1, i32 1
%v_2 = extractelement <4 x $1> %0, i32 2
%r_2 = call $1 $2($1 %v_2)
%ret_2 = insertelement <4 x $1> %ret_1, $1 %r_2, i32 2
%v_3 = extractelement <4 x $1> %0, i32 3
%r_3 = call $1 $2($1 %v_3)
%ret_3 = insertelement <4 x $1> %ret_2, $1 %r_3, i32 3
ret <4 x $1> %ret_3
')
;; Given a unary function that takes a 2-wide vector and a 4-wide vector
;; that we'd like to apply it to, extract 2 2-wide vectors from the 4-wide
;; vector, apply it, and return the corresponding 4-wide vector result
@@ -286,6 +306,49 @@ ret <8 x float> %ret
'
)
define(`round4to8double', `
%v0 = shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v1 = shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r0 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v0, i32 $2)
%r1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v1, i32 $2)
%ret = shufflevector <4 x double> %r0, <4 x double> %r1,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %ret
'
)
; and similarly for doubles...
define(`round2to4double', `
%v0 = shufflevector <4 x double> $1, <4 x double> undef, <2 x i32> <i32 0, i32 1>
%v1 = shufflevector <4 x double> $1, <4 x double> undef, <2 x i32> <i32 2, i32 3>
%r0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v0, i32 $2)
%r1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v1, i32 $2)
%ret = shufflevector <2 x double> %r0, <2 x double> %r1,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x double> %ret
'
)
define(`round2to8double', `
%v0 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> <i32 0, i32 1>
%v1 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> <i32 2, i32 3>
%v2 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> <i32 4, i32 5>
%v3 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> <i32 6, i32 7>
%r0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v0, i32 $2)
%r1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v1, i32 $2)
%r2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v2, i32 $2)
%r3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v3, i32 $2)
%ret0 = shufflevector <2 x double> %r0, <2 x double> %r1,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%ret1 = shufflevector <2 x double> %r2, <2 x double> %r3,
<4 x i32> <i32 0, i32 1, i32 2, i32 3>
%ret = shufflevector <4 x double> %ret0, <4 x double> %ret1,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %ret
'
)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; forloop macro
@@ -503,15 +566,26 @@ declare i1 @__is_compile_time_constant_varying_int32(<$1 x i32>)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; vector ops
define internal float @__extract(<$1 x float>, i32) nounwind readnone alwaysinline {
%extract = extractelement <$1 x float> %0, i32 %1
ret float %extract
define internal i32 @__extract_int32(<$1 x i32>, i32) nounwind readnone alwaysinline {
%extract = extractelement <$1 x i32> %0, i32 %1
ret i32 %extract
}
define internal <$1 x float> @__insert(<$1 x float>, i32,
float) nounwind readnone alwaysinline {
%insert = insertelement <$1 x float> %0, float %2, i32 %1
ret <$1 x float> %insert
define internal <$1 x i32> @__insert_int32(<$1 x i32>, i32,
i32) nounwind readnone alwaysinline {
%insert = insertelement <$1 x i32> %0, i32 %2, i32 %1
ret <$1 x i32> %insert
}
define internal i64 @__extract_int64(<$1 x i64>, i32) nounwind readnone alwaysinline {
%extract = extractelement <$1 x i64> %0, i32 %1
ret i64 %extract
}
define internal <$1 x i64> @__insert_int64(<$1 x i64>, i32,
i64) nounwind readnone alwaysinline {
%insert = insertelement <$1 x i64> %0, i64 %2, i32 %1
ret <$1 x i64> %insert
}
shuffles($1, float, float, 4)
@@ -588,51 +662,106 @@ declare float @expf(float) nounwind readnone
declare float @logf(float) nounwind readnone
declare float @powf(float, float) nounwind readnone
define internal float @__stdlib_sin(float) nounwind readnone alwaysinline {
define internal float @__stdlib_sinf(float) nounwind readnone alwaysinline {
%r = call float @sinf(float %0)
ret float %r
}
define internal float @__stdlib_cos(float) nounwind readnone alwaysinline {
define internal float @__stdlib_cosf(float) nounwind readnone alwaysinline {
%r = call float @cosf(float %0)
ret float %r
}
define internal void @__stdlib_sincos(float, float *, float *) nounwind readnone alwaysinline {
define internal void @__stdlib_sincosf(float, float *, float *) nounwind readnone alwaysinline {
call void @sincosf(float %0, float *%1, float *%2)
ret void
}
define internal float @__stdlib_tan(float) nounwind readnone alwaysinline {
define internal float @__stdlib_tanf(float) nounwind readnone alwaysinline {
%r = call float @tanf(float %0)
ret float %r
}
define internal float @__stdlib_atan(float) nounwind readnone alwaysinline {
define internal float @__stdlib_atanf(float) nounwind readnone alwaysinline {
%r = call float @atanf(float %0)
ret float %r
}
define internal float @__stdlib_atan2(float, float) nounwind readnone alwaysinline {
define internal float @__stdlib_atan2f(float, float) nounwind readnone alwaysinline {
%r = call float @atan2f(float %0, float %1)
ret float %r
}
define internal float @__stdlib_log(float) nounwind readnone alwaysinline {
define internal float @__stdlib_logf(float) nounwind readnone alwaysinline {
%r = call float @logf(float %0)
ret float %r
}
define internal float @__stdlib_exp(float) nounwind readnone alwaysinline {
define internal float @__stdlib_expf(float) nounwind readnone alwaysinline {
%r = call float @expf(float %0)
ret float %r
}
define internal float @__stdlib_pow(float, float) nounwind readnone alwaysinline {
define internal float @__stdlib_powf(float, float) nounwind readnone alwaysinline {
%r = call float @powf(float %0, float %1)
ret float %r
}
;; Declarations of the double-precision libm entry points used by the
;; __stdlib_* wrappers below.
declare double @sin(double) nounwind readnone
declare double @cos(double) nounwind readnone
;; NOTE(review): sincos writes through its pointer parameters, so the
;; "readnone" attribute looks incorrect here (the float sincosf
;; declaration has the same pattern) -- confirm the optimizer cannot
;; eliminate these calls as side-effect-free.
declare void @sincos(double, double *, double *) nounwind readnone
declare double @tan(double) nounwind readnone
declare double @atan(double) nounwind readnone
declare double @atan2(double, double) nounwind readnone
declare double @exp(double) nounwind readnone
declare double @log(double) nounwind readnone
declare double @pow(double, double) nounwind readnone
;; Always-inlined wrappers that give the stdlib stable __stdlib_* names
;; for the double-precision math functions (parallel to the __stdlib_*f
;; single-precision wrappers above).
define internal double @__stdlib_sin(double) nounwind readnone alwaysinline {
    %r = call double @sin(double %0)
    ret double %r
}
define internal double @__stdlib_cos(double) nounwind readnone alwaysinline {
    %r = call double @cos(double %0)
    ret double %r
}
define internal void @__stdlib_sincos(double, double *, double *) nounwind readnone alwaysinline {
    call void @sincos(double %0, double *%1, double *%2)
    ret void
}
define internal double @__stdlib_tan(double) nounwind readnone alwaysinline {
    %r = call double @tan(double %0)
    ret double %r
}
define internal double @__stdlib_atan(double) nounwind readnone alwaysinline {
    %r = call double @atan(double %0)
    ret double %r
}
define internal double @__stdlib_atan2(double, double) nounwind readnone alwaysinline {
    %r = call double @atan2(double %0, double %1)
    ret double %r
}
define internal double @__stdlib_log(double) nounwind readnone alwaysinline {
    %r = call double @log(double %0)
    ret double %r
}
define internal double @__stdlib_exp(double) nounwind readnone alwaysinline {
    %r = call double @exp(double %0)
    ret double %r
}
define internal double @__stdlib_pow(double, double) nounwind readnone alwaysinline {
    %r = call double @pow(double %0, double %1)
    ret double %r
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; atomics and memory barriers
@@ -676,6 +805,52 @@ global_atomic_exchange($1, i64, int64)
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; 64-bit integer min and max functions
;; utility function used by int64minmax below. This shouldn't be called by
;; target .ll files directly.
;; $1: target vector width
;; $2: {min,max} (used in constructing function names)
;; $3: {int64,uint64} (used in constructing function names)
;; $4: {slt,sgt} comparison operator to used
define(`i64minmax', `
define internal i64 @__$2_uniform_$3(i64, i64) nounwind alwaysinline readnone {
%c = icmp $4 i64 %0, %1
%r = select i1 %c, i64 %0, i64 %1
ret i64 %r
}
define internal <$1 x i64> @__$2_varying_$3(<$1 x i64>, <$1 x i64>) nounwind alwaysinline readnone {
%rptr = alloca <$1 x i64>
%r64ptr = bitcast <$1 x i64> * %rptr to i64 *
forloop(i, 0, eval($1-1), `
%v0_`'i = extractelement <$1 x i64> %0, i32 i
%v1_`'i = extractelement <$1 x i64> %1, i32 i
%c_`'i = icmp $4 i64 %v0_`'i, %v1_`'i
%v_`'i = select i1 %c_`'i, i64 %v0_`'i, i64 %v1_`'i
%ptr_`'i = getelementptr i64 * %r64ptr, i32 i
store i64 %v_`'i, i64 * %ptr_`'i
')
%ret = load <$1 x i64> * %rptr
ret <$1 x i64> %ret
}
')
;; this is the function that target .ll files should call; it just takes the target
;; vector width as a parameter
define(`int64minmax', `
i64minmax($1,min,int64,slt)
i64minmax($1,max,int64,sgt)
i64minmax($1,min,uint64,ult)
i64minmax($1,max,uint64,ugt)
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Definitions of 8 and 16-bit load and store functions
;;

11
tests/double-abs-1.ispc Normal file
View File

@@ -0,0 +1,11 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
double a = aFOO[programIndex];
RET[programIndex] = abs(-a);
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

11
tests/double-abs.ispc Normal file
View File

@@ -0,0 +1,11 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
double a = aFOO[programIndex];
RET[programIndex] = abs(a);
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

12
tests/double-max-1.ispc Normal file
View File

@@ -0,0 +1,12 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
double a = aFOO[programIndex];
double b = -2. * a;
RET[programIndex] = max(a,b);
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 * (1 + programIndex);
}

12
tests/double-max.ispc Normal file
View File

@@ -0,0 +1,12 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
double a = aFOO[programIndex];
double b = 2. * a;
RET[programIndex] = max(a,b);
}
export void result(uniform float RET[]) {
RET[programIndex] = 2 * (1 + programIndex);
}

12
tests/double-min-1.ispc Normal file
View File

@@ -0,0 +1,12 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
double a = aFOO[programIndex];
double b = -2. * a;
RET[programIndex] = min(a,b);
}
export void result(uniform float RET[]) {
RET[programIndex] = -2 * (1 + programIndex);
}

12
tests/double-min.ispc Normal file
View File

@@ -0,0 +1,12 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
double a = aFOO[programIndex];
double b = 2. * a;
RET[programIndex] = min(a,b);
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

16
tests/double-sqrt.ispc Normal file
View File

@@ -0,0 +1,16 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
double a = aFOO[programIndex];
if (programIndex & 1) {
a *= a;
RET[programIndex] = sqrt(a);
}
else
RET[programIndex] = a;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

11
tests/extract-1.ispc Normal file
View File

@@ -0,0 +1,11 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
double a = programIndex;
RET[programIndex] = extract(a, 3);
}
export void result(uniform float RET[]) {
RET[programIndex] = 3;
}

16
tests/frexp-double-1.ispc Normal file
View File

@@ -0,0 +1,16 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
double a = (1<<programIndex) * 1.5;
if (programIndex & 1)
a = -a;
int exponent;
frexp(a, exponent);
RET[programIndex] = exponent;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1+programIndex;
}

17
tests/frexp-double.ispc Normal file
View File

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
double a = (1<<programIndex) * 1.5;
if (programIndex & 1)
a = -a;
int exponent;
RET[programIndex] = frexp(a, exponent);
}
export void result(uniform float RET[]) {
RET[programIndex] = 0.75;
if (programIndex & 1)
RET[programIndex] = -RET[programIndex];
}

16
tests/frexp-float-1.ispc Normal file
View File

@@ -0,0 +1,16 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = (1<<programIndex) * 1.5;
if (programIndex & 1)
a = -a;
int exponent;
frexp(a, exponent);
RET[programIndex] = exponent;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1+programIndex;
}

17
tests/frexp-float.ispc Normal file
View File

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = (1<<programIndex) * 1.5;
if (programIndex & 1)
a = -a;
int exponent;
RET[programIndex] = frexp(a, exponent);
}
export void result(uniform float RET[]) {
RET[programIndex] = 0.75;
if (programIndex & 1)
RET[programIndex] = -RET[programIndex];
}

13
tests/insert-1.ispc Normal file
View File

@@ -0,0 +1,13 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
int64 a;
for (uniform int i = 0; i < programCount; ++i)
a = insert(a, i, (int64)i);
RET[programIndex] = a;
}
export void result(uniform float RET[]) {
RET[programIndex] = programIndex;
}

13
tests/insert-2.ispc Normal file
View File

@@ -0,0 +1,13 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
double a;
for (uniform int i = 0; i < programCount; ++i)
a = insert(a, i, (double)i+10);
RET[programIndex] = a;
}
export void result(uniform float RET[]) {
RET[programIndex] = programIndex+10;
}

12
tests/int64-max-1.ispc Normal file
View File

@@ -0,0 +1,12 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
int64 a = aFOO[programIndex];
int64 b = -2. * a;
RET[programIndex] = max(a,b);
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 * (1 + programIndex);
}

12
tests/int64-max.ispc Normal file
View File

@@ -0,0 +1,12 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
int64 a = aFOO[programIndex];
int64 b = 2. * a;
RET[programIndex] = max(a,b);
}
export void result(uniform float RET[]) {
RET[programIndex] = 2 * (1 + programIndex);
}

12
tests/int64-min-1.ispc Normal file
View File

@@ -0,0 +1,12 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
int64 a = aFOO[programIndex];
int64 b = -2. * a;
RET[programIndex] = min(a,b);
}
export void result(uniform float RET[]) {
RET[programIndex] = -2 * (1 + programIndex);
}

12
tests/int64-min.ispc Normal file
View File

@@ -0,0 +1,12 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
int64 a = aFOO[programIndex];
int64 b = 2. * a;
RET[programIndex] = min(a,b);
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

16
tests/ldexp-double.ispc Normal file
View File

@@ -0,0 +1,16 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
double a = 1 << (programIndex);
if (programIndex & 1)
a = -a;
RET[programIndex] = ldexp(a, 2);
}
export void result(uniform float RET[]) {
RET[programIndex] = (1 << (programIndex + 2));
if (programIndex & 1)
RET[programIndex] = -RET[programIndex];
}

16
tests/ldexp-float.ispc Normal file
View File

@@ -0,0 +1,16 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
float a = 1 << (programIndex);
if (programIndex & 1)
a = -a;
RET[programIndex] = ldexp(a, 2);
}
export void result(uniform float RET[]) {
RET[programIndex] = (1 << (programIndex + 2));
if (programIndex & 1)
RET[programIndex] = -RET[programIndex];
}

20
tests/popcnt-4.ispc Normal file
View File

@@ -0,0 +1,20 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[4], uniform float aFOO[]) {
int64 a = aFOO[programIndex];
a = (a < 3) ? 1 : 0;
if ((programIndex & 1) != 0)
a |= ((int64)1 << 36);
RET[programIndex] = popcnt(a);
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
if (programIndex & 1)
++RET[programIndex];
if (programIndex < 2)
++RET[programIndex];
}

View File

@@ -0,0 +1,22 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
double v = aFOO[programIndex];
uniform float m;
int iv = (int)v;
if (iv & 1)
m = reduce_add((double)iv);
RET[programIndex] = m;
}
export void result(uniform float RET[]) {
uniform int x = -1234;
if (programCount == 4) x = 4;
else if (programCount == 8) x = 16;
else if (programCount == 16) x = 64;
RET[programIndex] = x;
}

View File

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float v = aFOO[programIndex];
uniform float m;
int iv = (int)v;
m = reduce_add((double)iv);
RET[programIndex] = m;
}
export void result(uniform float RET[]) {
uniform int x = -1234;
if (programCount == 4) x = 10;
else if (programCount == 8) x = 36;
else if (programCount == 16) x = 124;
RET[programIndex] = x;
}

View File

@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
double v = aFOO[programIndex];
uniform float m;
if (v < 3)
m = reduce_add(-v);
RET[programIndex] = m;
}
export void result(uniform float RET[]) { RET[programIndex] = -3; }

View File

@@ -0,0 +1,22 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float v = aFOO[programIndex];
uniform float m;
int64 iv = (int64)v;
if (iv & 1)
m = reduce_add(iv);
RET[programIndex] = m;
}
export void result(uniform float RET[]) {
uniform int x = -1234;
if (programCount == 4) x = 4;
else if (programCount == 8) x = 16;
else if (programCount == 16) x = 64;
RET[programIndex] = x;
}

View File

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float v = aFOO[programIndex];
uniform float m;
int64 iv = (int)v;
m = reduce_add(iv);
RET[programIndex] = m;
}
export void result(uniform float RET[]) {
uniform int x = -1234;
if (programCount == 4) x = 10;
else if (programCount == 8) x = 36;
else if (programCount == 16) x = 136;
RET[programIndex] = x;
}

View File

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float v = aFOO[programIndex];
uniform float m;
unsigned int64 iv = (unsigned int)v;
if (iv & 1)
m = reduce_add(iv);
RET[programIndex] = m;
}
export void result(uniform float RET[]) {
uniform int x = -1234;
if (programCount == 4) x = 4;
else if (programCount == 8) x = 16;
else if (programCount == 16) x = 64;
RET[programIndex] = x;
}

View File

@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float v = aFOO[programIndex & 0x3];
uniform float m;
unsigned int64 iv = (unsigned int)v;
m = reduce_add(iv);
RET[programIndex] = m;
}
export void result(uniform float RET[]) { RET[programIndex] = 10 * programCount/4; }

View File

@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
double v = aFOO[programIndex];
uniform float m;
if (v >= 3)
m = reduce_max(-v);
RET[programIndex] = m;
}
export void result(uniform float RET[]) { RET[programIndex] = -3; }

View File

@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float v = aFOO[programIndex];
uniform float m;
if (v >= 3)
m = reduce_max(-(int64)v);
RET[programIndex] = m;
}
export void result(uniform float RET[]) { RET[programIndex] = -3; }

View File

@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float v = aFOO[programIndex];
uniform float m;
if (v < 3)
m = reduce_max((unsigned int64)v);
RET[programIndex] = m;
}
export void result(uniform float RET[]) { RET[programIndex] = 2; }

View File

@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
double v = aFOO[programIndex];
uniform float m;
if (v > 0 && v < 3)
m = reduce_min(-v);
RET[programIndex] = m;
}
export void result(uniform float RET[]) { RET[programIndex] = -2; }

View File

@@ -7,7 +7,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float v = aFOO[programIndex];
uniform float m;
if (v >= 0 && v < 2)
m = reduce_min(-v);
m = reduce_min(-(int)v);
RET[programIndex] = m;
}

View File

@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float v = aFOO[programIndex];
uniform float m;
if (v >= 0 && v < 2)
m = reduce_min(-(int64)v);
RET[programIndex] = m;
}
export void result(uniform float RET[]) { RET[programIndex] = -1; }

View File

@@ -0,0 +1,14 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float v = aFOO[programIndex];
uniform float m;
if (v >= 3)
m = reduce_min((unsigned int64)v);
RET[programIndex] = m;
}
export void result(uniform float RET[]) { RET[programIndex] = 3; }

12
tests/uint64-max-1.ispc Normal file
View File

@@ -0,0 +1,12 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
unsigned int64 a = aFOO[programIndex];
unsigned int64 b = 2;
RET[programIndex] = max(a,b);
}
export void result(uniform float RET[]) {
RET[programIndex] = max(2, 1 + programIndex);
}

12
tests/uint64-max.ispc Normal file
View File

@@ -0,0 +1,12 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
unsigned int64 a = aFOO[programIndex];
unsigned int64 b = 2. * a;
RET[programIndex] = max(a,b);
}
export void result(uniform float RET[]) {
RET[programIndex] = 2 * (1 + programIndex);
}

12
tests/uint64-min-1.ispc Normal file
View File

@@ -0,0 +1,12 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
unsigned int64 a = aFOO[programIndex];
unsigned int64 b = 2 * a;
RET[programIndex] = min(a,b);
}
export void result(uniform float RET[]) {
RET[programIndex] = (1 + programIndex);
}

12
tests/uint64-min.ispc Normal file
View File

@@ -0,0 +1,12 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
unsigned int64 a = aFOO[programIndex];
unsigned int64 b = 2. * a;
RET[programIndex] = min(a,b);
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}