diff --git a/builtins.cpp b/builtins.cpp
index ef8eea0d..6583f7dd 100644
--- a/builtins.cpp
+++ b/builtins.cpp
@@ -104,6 +104,8 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
                                  AtomicType::UniformInt64, false);
     else if (t == LLVMTypes::FloatPointerType)
         return new ReferenceType(AtomicType::UniformFloat, false);
+    else if (t == LLVMTypes::DoublePointerType)
+        return new ReferenceType(AtomicType::UniformDouble, false);
     else if (t == LLVMTypes::Int32VectorPointerType)
         return new ReferenceType(intAsUnsigned ? AtomicType::VaryingUInt32 :
                                  AtomicType::VaryingInt32, false);
@@ -112,6 +114,8 @@ lLLVMTypeToISPCType(const llvm::Type *t, bool intAsUnsigned) {
                                  AtomicType::VaryingInt64, false);
     else if (t == LLVMTypes::FloatVectorPointerType)
         return new ReferenceType(AtomicType::VaryingFloat, false);
+    else if (t == LLVMTypes::DoubleVectorPointerType)
+        return new ReferenceType(AtomicType::VaryingDouble, false);
     else if (llvm::isa<llvm::PointerType>(t)) {
         const llvm::PointerType *pt = llvm::dyn_cast<llvm::PointerType>(t);
diff --git a/llvmutil.cpp b/llvmutil.cpp
index 1f64e06b..755c375f 100644
--- a/llvmutil.cpp
+++ b/llvmutil.cpp
@@ -50,6 +50,7 @@ const llvm::Type *LLVMTypes::Int64PointerType = NULL;
 const llvm::Type *LLVMTypes::FloatType = NULL;
 const llvm::Type *LLVMTypes::FloatPointerType = NULL;
 const llvm::Type *LLVMTypes::DoubleType = NULL;
+const llvm::Type *LLVMTypes::DoublePointerType = NULL;
 
 const llvm::VectorType *LLVMTypes::MaskType = NULL;
 const llvm::VectorType *LLVMTypes::BoolVectorType = NULL;
@@ -61,6 +62,7 @@ const llvm::Type *LLVMTypes::Int64VectorPointerType = NULL;
 const llvm::VectorType *LLVMTypes::FloatVectorType = NULL;
 const llvm::Type *LLVMTypes::FloatVectorPointerType = NULL;
 const llvm::VectorType *LLVMTypes::DoubleVectorType = NULL;
+const llvm::Type *LLVMTypes::DoubleVectorPointerType = NULL;
 const llvm::ArrayType *LLVMTypes::VoidPointerVectorType = NULL;
 
 llvm::Constant *LLVMTrue = NULL;
@@ -83,6 +85,7 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
     LLVMTypes::FloatType = llvm::Type::getFloatTy(*ctx);
     LLVMTypes::FloatPointerType = llvm::PointerType::get(LLVMTypes::FloatType, 0);
     LLVMTypes::DoubleType = llvm::Type::getDoubleTy(*ctx);
+    LLVMTypes::DoublePointerType = llvm::PointerType::get(LLVMTypes::DoubleType, 0);
 
     // Note that both the mask and bool vectors are vector of int32s
     // (not i1s). LLVM ends up generating much better SSE code with
@@ -103,6 +106,7 @@ InitLLVMUtil(llvm::LLVMContext *ctx, Target target) {
     LLVMTypes::FloatVectorPointerType = llvm::PointerType::get(LLVMTypes::FloatVectorType, 0);
     LLVMTypes::DoubleVectorType = llvm::VectorType::get(LLVMTypes::DoubleType, target.vectorWidth);
+    LLVMTypes::DoubleVectorPointerType = llvm::PointerType::get(LLVMTypes::DoubleVectorType, 0);
 
     LLVMTypes::VoidPointerVectorType = llvm::ArrayType::get(LLVMTypes::VoidPointerType, target.vectorWidth);
diff --git a/llvmutil.h b/llvmutil.h
index 3a5a4e4c..1a26ae7e 100644
--- a/llvmutil.h
+++ b/llvmutil.h
@@ -61,6 +61,7 @@ struct LLVMTypes {
     static const llvm::Type *FloatType;
     static const llvm::Type *FloatPointerType;
     static const llvm::Type *DoubleType;
+    static const llvm::Type *DoublePointerType;
 
     static const llvm::VectorType *MaskType;
     static const llvm::VectorType *BoolVectorType;
@@ -72,6 +73,7 @@ struct LLVMTypes {
     static const llvm::VectorType *FloatVectorType;
     static const llvm::Type *FloatVectorPointerType;
    static const llvm::VectorType *DoubleVectorType;
+    static const llvm::Type *DoubleVectorPointerType;
     static const llvm::ArrayType *VoidPointerVectorType;
 };
diff --git a/stdlib-avx.ll b/stdlib-avx.ll
index 9bf223da..947248dc 100644
--- a/stdlib-avx.ll
+++ b/stdlib-avx.ll
@@ -42,6 +42,7 @@
 stdlib_core(8)
 packed_load_and_store(8)
 int8_16(8)
+int64minmax(8)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
@@ -77,7 +78,7 @@ define internal float @__rcp_uniform_float(float) nounwind readonly alwaysinline
 }
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; rounding
+;; rounding floats
 
 declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
 declare <4 x float> @llvm.x86.sse.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
@@ -141,6 +142,56 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin
     ret float %rs
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
+    round4to8double(%0, 8)
+}
+
+define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
+    %xi = insertelement <2 x double> undef, double %0, i32 0
+    %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
+    %rs = extractelement <2 x double> %xr, i32 0
+    ret double %rs
+}
+
+define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
+    ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+    round4to8double(%0, 9)
+}
+
+define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+    ; see above for round_ss intrinsic discussion...
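+    ; (reminder: the low two bits of the round immediate pick the mode --
+    ; 0b00 nearest, 0b01 down, 0b10 up, 0b11 truncate -- and bit 0b1000
+    ; suppresses the precision exception, giving the 8, 9, and 10 used here)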
+    %xi = insertelement <2 x double> undef, double %0, i32 0
+    ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+    %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
+    %rs = extractelement <2 x double> %xr, i32 0
+    ret double %rs
+}
+
+define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
+    ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+    round4to8double(%0, 10)
+}
+
+define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+    ; see above for round_ss intrinsic discussion...
+    %xi = insertelement <2 x double> undef, double %0, i32 0
+    ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+    %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
+    %rs = extractelement <2 x double> %xr, i32 0
+    ret double %rs
+}
+
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rsqrt
+
 declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
 declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
@@ -318,11 +369,18 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli
 
 declare i32 @llvm.ctpop.i32(i32) nounwind readnone
 
-define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
+define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
     %call = call i32 @llvm.ctpop.i32(i32 %0)
     ret i32 %call
 }
 
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+
+define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
+    %call = call i64 @llvm.ctpop.i64(i64 %0)
+    ret i64 %call
+}
+
 declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
 
 define internal i32 @__movmsk(<8 x i32>) nounwind readnone alwaysinline {
@@ -403,6 +461,81 @@ define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinli
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal double ops
+
+declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
+
+define internal double @__reduce_add_double(<8 x double>) nounwind readonly alwaysinline {
+    %v0 = shufflevector <8 x double> %0, <8 x double> undef,
+                        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+    %v1 = shufflevector <8 x double> %0, <8 x double> undef,
+                        <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+    %sum0 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %v0, <4 x double> %v1)
+    %sum1 = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %sum0, <4 x double> %sum0)
+    %scalar1 = extractelement <4 x double> %sum1, i32 0
+    %scalar2 = extractelement <4 x double> %sum1, i32 2
+    %sum = fadd double %scalar1, %scalar2
+    ret double %sum
+}
+
+define internal double @__reduce_min_double(<8 x double>) nounwind readnone alwaysinline {
+    reduce8(double, @__min_varying_double, @__min_uniform_double)
+}
+
+
+define internal double @__reduce_max_double(<8 x double>) nounwind readnone alwaysinline {
+    reduce8(double, @__max_varying_double, @__max_uniform_double)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; horizontal int64 ops
+
+define internal <8 x i64> @__add_varying_int64(<8 x i64>,
+                                               <8 x i64>) nounwind readnone alwaysinline {
+    %s = add <8 x i64> %0, %1
+    ret <8 x i64> %s
+}
+
+define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+    %s = add i64 %0, %1
+    ret i64 %s
+}
+
+define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone alwaysinline {
+    reduce8(i64, @__add_varying_int64, @__add_uniform_int64)
+}
+
+
+define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone alwaysinline {
+    reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+
+define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
+    reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;; horizontal uint64 ops
+
+define internal i64 @__reduce_add_uint64(<8 x i64> %v) nounwind readnone alwaysinline {
+    %r = call i64 @__reduce_add_int64(<8 x i64> %v)
+    ret i64 %r
+}
+
+define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
+    reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+
+define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone alwaysinline {
+    reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; unaligned loads/loads+broadcasts
 
diff --git a/stdlib-sse.ll b/stdlib-sse.ll
index cb9ee295..77d52873 100644
--- a/stdlib-sse.ll
+++ b/stdlib-sse.ll
@@ -37,6 +37,7 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 int8_16(4)
+int64minmax(4)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
@@ -227,6 +228,54 @@ define internal float @__min_uniform_float(float, float) nounwind readonly alway
     ret float %ret
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision sqrt
+
+declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
+
+define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
+    unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
+    ret <4 x double> %ret
+}
+
+
+define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
+    sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
+    ret double %ret
+}
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; double precision min/max
+
+declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
+declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
+    binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
+    ret <4 x double> %ret
+}
+
+
+define internal double @__min_uniform_double(double, double) nounwind readnone {
+    sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
+    ret double %ret
+}
+
+
+define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
+    binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
+    ret <4 x double> %ret
+}
+
+
+define internal double @__max_uniform_double(double, double) nounwind readnone {
+    sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
+    ret double %ret
+}
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
 
@@ -279,6 +328,55 @@ define internal i32 @__reduce_max_uint32(<4 x i32>) nounwind readnone {
 }
 
+define internal double @__reduce_add_double(<4 x double>) nounwind readnone {
+    %v0 = shufflevector <4 x double> %0, <4 x double> undef,
+                        <2 x i32> <i32 0, i32 1>
+    %v1 = shufflevector <4 x double> %0, <4 x double> undef,
+                        <2 x i32> <i32 2, i32 3>
+    %sum = fadd <2 x double> %v0, %v1
+    %e0 = extractelement <2 x double> %sum, i32 0
+    %e1 = extractelement <2 x double> %sum, i32 1
+    %m = fadd double %e0, %e1
+    ret double %m
+}
+
+define internal double @__reduce_min_double(<4 x double>) nounwind readnone {
+    reduce4(double, @__min_varying_double, @__min_uniform_double)
+}
+
+define internal double @__reduce_max_double(<4 x double>) nounwind readnone {
+    reduce4(double, @__max_varying_double, @__max_uniform_double)
+}
+
+define internal i64 @__reduce_add_int64(<4 x i64>) nounwind readnone {
+    %v0 = shufflevector <4 x i64> %0, <4 x i64> undef,
+                        <2 x i32> <i32 0, i32 1>
+    %v1 = shufflevector <4 x i64> %0, <4 x i64> undef,
+                        <2 x i32> <i32 2, i32 3>
+    %sum = add <2 x i64> %v0, %v1
+    %e0 = extractelement <2 x i64> %sum, i32 0
+    %e1 = extractelement <2 x i64> %sum, i32 1
+    %m = add i64 %e0, %e1
+    ret i64 %m
+}
+
+define internal i64 @__reduce_min_int64(<4 x i64>) nounwind readnone {
+    reduce4(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+define internal i64 @__reduce_max_int64(<4 x i64>) nounwind readnone {
+    reduce4(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+define internal i64 @__reduce_min_uint64(<4 x i64>) nounwind readnone {
+    reduce4(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+define internal i64 @__reduce_max_uint64(<4 x i64>) nounwind readnone {
+    reduce4(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store
 
@@ -389,53 +487,3 @@ gen_gather(4, i32)
 gen_gather(4, i64)
 gen_scatter(4, i32)
 gen_scatter(4, i64)
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision sqrt
-
-declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
-
-define internal <4 x double> @__sqrt_varying_double(<4 x double>) nounwind alwaysinline {
-    unary2to4(ret, double, @llvm.x86.sse2.sqrt.pd, %0)
-    ret <4 x double> %ret
-}
-
-
-define internal double @__sqrt_uniform_double(double) nounwind alwaysinline {
-    sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
-    ret double %ret
-}
-
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; double precision min/max
-
-declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
-declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
-
-define internal <4 x double> @__min_varying_double(<4 x double>, <4 x double>) nounwind readnone {
-    binary2to4(ret, double, @llvm.x86.sse2.min.pd, %0, %1)
-    ret <4 x double> %ret
-}
-
-
-define internal double @__min_uniform_double(double, double) nounwind readnone {
-    sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
-    ret double %ret
-}
-
-
-define internal <4 x double> @__max_varying_double(<4 x double>, <4 x double>) nounwind readnone {
-    binary2to4(ret, double, @llvm.x86.sse2.max.pd, %0, %1)
-    ret <4 x double> %ret
-}
-
-
-define internal double @__max_uniform_double(double, double) nounwind readnone {
-    sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
-    ret double %ret
-}
diff --git a/stdlib-sse2.ll b/stdlib-sse2.ll
index d1573e21..99711181 100644
--- a/stdlib-sse2.ll
+++ b/stdlib-sse2.ll
@@ -152,6 +152,40 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin
     ret float %binop.i
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare double @round(double)
+declare double @floor(double)
+declare double @ceil(double)
+
+define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
+    unary1to4(double, @round)
+}
+
+define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
+    %r = call double @round(double %0)
+    ret double %r
+}
+
+define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
+    unary1to4(double, @floor)
+}
+
+define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+    %r = call double @floor(double %0)
+    ret double %r
+}
+
+define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
+    unary1to4(double, @ceil)
+}
+
+define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+    %r = call double @ceil(double %0)
+    ret double %r
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; min/max
 
@@ -252,7 +286,7 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli
 ; it does generate non-POPCNT code and in particular better code than
 ; the below does.)
 
-define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
+define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
 entry:
     br label %loop
@@ -269,6 +303,16 @@ exit:
     ret i32 %newcount
 }
 
+define internal i64 @__popcnt_int64(i64) nounwind readnone alwaysinline {
+    %vec = bitcast i64 %0 to <2 x i32>
+    %v0 = extractelement <2 x i32> %vec, i32 0
+    %v1 = extractelement <2 x i32> %vec, i32 1
+    %c0 = call i32 @__popcnt_int32(i32 %v0)
+    %c1 = call i32 @__popcnt_int32(i32 %v1)
+    %sum = add i32 %c0, %c1
+    %sum64 = zext i32 %sum to i64
+    ret i64 %sum64
+}
+
 define internal float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
     %v1 = shufflevector <4 x float> %v, <4 x float> undef,
diff --git a/stdlib-sse4.ll b/stdlib-sse4.ll
index 30b6f43b..e0fcec4a 100644
--- a/stdlib-sse4.ll
+++ b/stdlib-sse4.ll
@@ -40,7 +40,7 @@ packed_load_and_store(4)
 include(`stdlib-sse.ll')
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; math
+;; rounding floats
 
 declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
 declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
@@ -106,7 +106,52 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin
 }
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; integer min/max
+;; rounding doubles
+
+declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+define internal <4 x double> @__round_varying_double(<4 x double>) nounwind readonly alwaysinline {
+    round2to4double(%0, 8)
+}
+
+define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
+    %xi = insertelement <2 x double> undef, double %0, i32 0
+    %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
+    %rs = extractelement <2 x double> %xr, i32 0
+    ret double %rs
+}
+
+define internal <4 x double> @__floor_varying_double(<4 x double>) nounwind readonly alwaysinline {
+    ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+    round2to4double(%0, 9)
+}
+
+define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+    ; see above for round_ss intrinsic discussion...
+    %xi = insertelement <2 x double> undef, double %0, i32 0
+    ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+    %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
+    %rs = extractelement <2 x double> %xr, i32 0
+    ret double %rs
+}
+
+define internal <4 x double> @__ceil_varying_double(<4 x double>) nounwind readonly alwaysinline {
+    ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+    round2to4double(%0, 10)
+}
+
+define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+    ; see above for round_ss intrinsic discussion...
+    %xi = insertelement <2 x double> undef, double %0, i32 0
+    ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+    %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
+    %rs = extractelement <2 x double> %xr, i32 0
+    ret double %rs
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; int32 min/max
 
 declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
 declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
@@ -163,11 +208,18 @@ define internal i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinli
 
 declare i32 @llvm.ctpop.i32(i32) nounwind readnone
 
-define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
+define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
     %call = call i32 @llvm.ctpop.i32(i32 %0)
     ret i32 %call
 }
 
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+
+define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
+    %call = call i64 @llvm.ctpop.i64(i64 %0)
+    ret i64 %call
+}
+
 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
 
 define internal float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {
diff --git a/stdlib-sse4x2.ll b/stdlib-sse4x2.ll
index 009c1c5b..83baaecd 100644
--- a/stdlib-sse4x2.ll
+++ b/stdlib-sse4x2.ll
@@ -39,6 +39,7 @@
 stdlib_core(8)
 packed_load_and_store(8)
 int8_16(8)
+int64minmax(8)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; rcp
@@ -258,7 +259,7 @@ define internal float @__min_uniform_float(float, float) nounwind readonly alway
 }
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; int min/max
+;; int32 min/max
 
 declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
 declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
@@ -380,6 +381,60 @@ define internal i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinli
     reduce8by4(i32, @llvm.x86.sse41.pmaxud, @__max_uniform_uint32)
 }
 
+define internal <4 x double> @__add_varying_double(<4 x double>,
+                                                   <4 x double>) nounwind readnone alwaysinline {
+    %r = fadd <4 x double> %0, %1
+    ret <4 x double> %r
+}
+
+define internal double @__add_uniform_double(double, double) nounwind readnone alwaysinline {
+    %r = fadd double %0, %1
+    ret double %r
+}
+
+define internal double @__reduce_add_double(<8 x double>) nounwind readnone {
+    reduce8by4(double, @__add_varying_double, @__add_uniform_double)
+}
+
+define internal double @__reduce_min_double(<8 x double>) nounwind readnone {
+    reduce8(double, @__min_varying_double, @__min_uniform_double)
+}
+
+define internal double @__reduce_max_double(<8 x double>) nounwind readnone {
+    reduce8(double, @__max_varying_double, @__max_uniform_double)
+}
+
+define internal <4 x i64> @__add_varying_int64(<4 x i64>,
+                                               <4 x i64>) nounwind readnone alwaysinline {
+    %r = add <4 x i64> %0, %1
+    ret <4 x i64> %r
+}
+
+define internal i64 @__add_uniform_int64(i64, i64) nounwind readnone alwaysinline {
+    %r = add i64 %0, %1
+    ret i64 %r
+}
+
+define internal i64 @__reduce_add_int64(<8 x i64>) nounwind readnone {
+    reduce8by4(i64, @__add_varying_int64, @__add_uniform_int64)
+}
+
+define internal i64 @__reduce_min_int64(<8 x i64>) nounwind readnone {
+    reduce8(i64, @__min_varying_int64, @__min_uniform_int64)
+}
+
+define internal i64 @__reduce_max_int64(<8 x i64>) nounwind readnone {
+    reduce8(i64, @__max_varying_int64, @__max_uniform_int64)
+}
+
+define internal i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone {
+    reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
+}
+
+define internal i64 @__reduce_max_uint64(<8 x i64>) nounwind readnone {
+    reduce8(i64, @__max_varying_uint64, @__max_uniform_uint64)
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; masked store
 
@@ -465,7 +520,7 @@ gen_scatter(8, i32)
 gen_scatter(8, i64)
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; math
+;; float rounding
 
 declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
 declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
@@ -526,16 +581,68 @@ define internal float @__ceil_uniform_float(float) nounwind readonly alwaysinlin
     ret float %rs
 }
 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; rounding doubles
+
+declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
+declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
+
+define internal <8 x double> @__round_varying_double(<8 x double>) nounwind readonly alwaysinline {
+    round2to8double(%0, 8)
+}
+
+define internal double @__round_uniform_double(double) nounwind readonly alwaysinline {
+    %xi = insertelement <2 x double> undef, double %0, i32 0
+    %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 8)
+    %rs = extractelement <2 x double> %xr, i32 0
+    ret double %rs
+}
+
+define internal <8 x double> @__floor_varying_double(<8 x double>) nounwind readonly alwaysinline {
+    ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+    round2to8double(%0, 9)
+}
+
+define internal double @__floor_uniform_double(double) nounwind readonly alwaysinline {
+    ; see above for round_ss intrinsic discussion...
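+    ; (roundsd writes only the low element: the rounded low lane of the
+    ; second operand lands in element 0 and element 1 passes through from
+    ; the first operand, which is why %xi is passed for both operands)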
+    %xi = insertelement <2 x double> undef, double %0, i32 0
+    ; roundpd, round down 0b01 | don't signal precision exceptions 0b1000 = 9
+    %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
+    %rs = extractelement <2 x double> %xr, i32 0
+    ret double %rs
+}
+
+define internal <8 x double> @__ceil_varying_double(<8 x double>) nounwind readonly alwaysinline {
+    ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+    round2to8double(%0, 10)
+}
+
+define internal double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
+    ; see above for round_ss intrinsic discussion...
+    %xi = insertelement <2 x double> undef, double %0, i32 0
+    ; roundpd, round up 0b10 | don't signal precision exceptions 0b1000 = 10
+    %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
+    %rs = extractelement <2 x double> %xr, i32 0
+    ret double %rs
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; horizontal ops / reductions
 
 declare i32 @llvm.ctpop.i32(i32) nounwind readnone
 
-define internal i32 @__popcnt(i32) nounwind readonly alwaysinline {
+define internal i32 @__popcnt_int32(i32) nounwind readonly alwaysinline {
     %call = call i32 @llvm.ctpop.i32(i32 %0)
     ret i32 %call
 }
 
+declare i64 @llvm.ctpop.i64(i64) nounwind readnone
+
+define internal i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
+    %call = call i64 @llvm.ctpop.i64(i64 %0)
+    ret i64 %call
+}
+
 declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
 
 define internal float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
diff --git a/stdlib.ispc b/stdlib.ispc
index 432d7528..10fd828b 100644
--- a/stdlib.ispc
+++ b/stdlib.ispc
@@ -147,30 +147,57 @@ static inline int64 shuffle(int64 v0, int64 v1, int i) {
 
 // x[i]
 static inline uniform float extract(float x, uniform int i) {
-    return __extract(x, i);
+    return floatbits(__extract_int32((int)intbits(x), i));
+}
+
+static inline uniform int extract(int x, uniform int i) {
+    return __extract_int32(x, i);
+}
+
+static inline uniform unsigned int extract(unsigned int x, uniform int i) {
+    return __extract_int32(x, (unsigned int)i);
+}
+
+static inline uniform double extract(double x, uniform int i) {
+    return doublebits(__extract_int64((int64)intbits(x), i));
+}
+
+static inline uniform int64 extract(int64 x, uniform int i) {
+    return __extract_int64(x, i);
+}
+
+static inline uniform unsigned int64 extract(unsigned int64 x, uniform int i) {
+    return __extract_int64(x, (unsigned int)i);
 }
 
 // x[i] = v
 static inline float insert(float x, uniform int i, uniform float v) {
-    return __insert(x, i, v);
-}
-
-static inline uniform int extract(int x, uniform int i) {
-    return intbits(extract(floatbits(x), i));
+    return floatbits(__insert_int32((int)intbits(x), i, (int)intbits(v)));
 }
 
 static inline int insert(int x, uniform int i, uniform int v) {
-    return intbits(insert(floatbits(x), i, floatbits(v)));
+    return __insert_int32(x, i, v);
 }
 
-static inline uniform unsigned int extract(unsigned int x, uniform int i) {
-    return intbits(extract(floatbits(x), i));
+static inline unsigned int insert(unsigned int x, uniform int i,
+                                  uniform unsigned int v) {
+    return __insert_int32(x, (unsigned int)i, v);
 }
 
-static inline unsigned int insert(unsigned int x, uniform int i, uniform unsigned int v) {
-    return intbits(insert(floatbits(x), i, floatbits(v)));
+static inline double insert(double x, uniform int i, uniform double v) {
+    return doublebits(__insert_int64((int64)intbits(x), i, (int64)intbits(v)));
 }
 
+static inline int64 insert(int64 x, uniform int i, uniform int64 v) {
+    return __insert_int64(x, i, v);
+}
+
+static inline unsigned int64 insert(unsigned int64 x, uniform int i,
+                                    uniform unsigned int64 v) {
+    return __insert_int64(x, (unsigned int)i, v);
+}
+
+
 static inline uniform bool any(bool v) {
     // We only care about whether "any" is true for the active program instances,
     // so we have to mask v with the current program mask.
@@ -185,20 +212,30 @@ static inline uniform bool all(bool v) {
 }
 
 static inline uniform int popcnt(uniform int v) {
-    return __popcnt(v);
+    return __popcnt_int32(v);
+}
+
+static inline uniform int popcnt(uniform int64 v) {
+    return (int32)__popcnt_int64(v);
 }
 
 static inline int popcnt(int v) {
     int r;
-    uniform int i;
-    for (i = 0; i < programCount; ++i)
+    for (uniform int i = 0; i < programCount; ++i)
+        r = insert(r, i, popcnt(extract(v, i)));
+    return (r & __mask);
+}
+
+static inline int popcnt(int64 v) {
+    int r;
+    for (uniform int i = 0; i < programCount; ++i)
         r = insert(r, i, popcnt(extract(v, i)));
     return (r & __mask);
 }
 
 static inline uniform int popcnt(bool v) {
     // As with any() and all(), only count across the active lanes
-    return __popcnt(__movmsk(v & __mask));
+    return __popcnt_int32(__movmsk(v & __mask));
 }
 
 static inline uniform int lanemask() {
@@ -270,6 +307,64 @@ static inline uniform unsigned int reduce_max(unsigned int v) {
     return __reduce_max_uint32(__mask ? v : 0);
 }
 
+
+static inline uniform double reduce_add(double x) {
+    // zero the lanes where the mask is off
+    return __reduce_add_double(__mask ? x : 0.);
+}
+
+static inline uniform double reduce_min(double v) {
+    int64 iflt_max = 0x7ff0000000000000; // infinity
+    // Must use __doublebits_varying_int64, not doublebits(), since with the
+    // latter the current mask enters into the returned result...
+    return __reduce_min_double(__mask ? v : __doublebits_varying_int64(iflt_max));
+}
+
+static inline uniform double reduce_max(double v) {
+    const uniform int64 iflt_neg_max = 0xfff0000000000000; // -infinity
+    // Must use __doublebits_varying_int64, not doublebits(), since with the
+    // latter the current mask enters into the returned result...
+    return __reduce_max_double(__mask ? v : __doublebits_varying_int64(iflt_neg_max));
+}
+
+static inline uniform int64 reduce_add(int64 x) {
+    // Zero out the values for lanes that aren't running
+    return __reduce_add_int64(x & (int64)__mask);
+}
+
+static inline uniform int64 reduce_min(int64 v) {
+    // Set values for non-running lanes to the maximum integer value so
+    // they don't affect the result.
+    int64 int_max = 0x7fffffffffffffff;
+    return __reduce_min_int64(__mask ? v : int_max);
+}
+
+static inline uniform int64 reduce_max(int64 v) {
+    // Set values for non-running lanes to the minimum integer value so
+    // they don't affect the result.
+    int64 int_min = 0x8000000000000000;
+    return __reduce_max_int64(__mask ? v : int_min);
+}
+
+static inline uniform unsigned int64 reduce_add(unsigned int64 x) {
+    // Set values for non-running lanes to zero so they don't affect the
+    // result.
+    return __reduce_add_int64(x & (int64)__mask);
+}
+
+static inline uniform unsigned int64 reduce_min(unsigned int64 v) {
+    // Set values for non-running lanes to the maximum unsigned integer
+    // value so they don't affect the result.
+    unsigned int64 uint_max = 0xffffffffffffffff;
+    return __reduce_min_uint64(__mask ? v : uint_max);
+}
+
+static inline uniform unsigned int64 reduce_max(unsigned int64 v) {
+    // Set values for non-running lanes to zero so they don't affect the
+    // result.
+    return __reduce_max_uint64(__mask ? v : 0);
+}
+
 ///////////////////////////////////////////////////////////////////////////
 // packed load, store
 
@@ -425,14 +520,37 @@ static inline uniform float abs(uniform float a) {
     return floatbits(i);
 }
 
+static inline double abs(double a) {
+    // zeroing the high bit clears the sign
+    unsigned int64 i = intbits(a);
+    i &= 0x7fffffffffffffff;
+    return doublebits(i);
+}
+
+static inline uniform double abs(uniform double a) {
+    uniform unsigned int64 i = intbits(a);
+    i &= 0x7fffffffffffffff;
+    return doublebits(i);
+}
+
 static inline unsigned int signbits(float x) {
     unsigned int i = intbits(x);
-    return (i & 0x80000000u);
+    return (i & 0x80000000);
 }
 
 static inline uniform unsigned int signbits(uniform float x) {
     uniform unsigned int i = intbits(x);
-    return (i & 0x80000000u);
+    return (i & 0x80000000);
+}
+
+static inline unsigned int64 signbits(double x) {
+    unsigned int64 i = intbits(x);
+    return (i & 0x8000000000000000);
+}
+
+static inline uniform unsigned int64 signbits(uniform double x) {
+    uniform unsigned int64 i = intbits(x);
+    return (i & 0x8000000000000000);
 }
 
 static inline float round(float x) {
@@ -443,6 +561,14 @@ static inline uniform float round(uniform float x) {
     return __round_uniform_float(x);
 }
 
+static inline double round(double x) {
+    return __round_varying_double(x);
+}
+
+static inline uniform double round(uniform double x) {
+    return __round_uniform_double(x);
+}
+
 static inline float floor(float x) {
     return __floor_varying_float(x);
 }
@@ -451,6 +577,14 @@ static inline uniform float floor(uniform float x) {
     return __floor_uniform_float(x);
 }
 
+static inline double floor(double x) {
+    return __floor_varying_double(x);
+}
+
+static inline uniform double floor(uniform double x) {
+    return __floor_uniform_double(x);
+}
+
 static inline float ceil(float x) {
     return __ceil_varying_float(x);
 }
@@ -459,6 +593,14 @@ static inline uniform float ceil(uniform float x) {
     return __ceil_uniform_float(x);
 }
 
+static inline double ceil(double x) {
+    return __ceil_varying_double(x);
+}
+
+static inline uniform double ceil(uniform double x) {
+    return __ceil_uniform_double(x);
+}
+
 static inline float rcp(float v) {
     return __rcp_varying_float(v);
 }
@@ -467,14 +609,6 @@ static inline uniform float rcp(uniform float v) {
     return __rcp_uniform_float(v);
 }
 
-static inline float sqrt(float v) {
-    return __sqrt_varying_float(v);
-}
-
-static inline uniform float sqrt(uniform float v) {
-    return __sqrt_uniform_float(v);
-}
-
 static inline float min(float a, float b) {
     return __min_varying_float(a, b);
 }
@@ -483,6 +617,14 @@ static inline uniform float min(uniform float a, uniform float b) {
     return __min_uniform_float(a, b);
 }
 
+static inline double min(double a, double b) {
+    return __min_varying_double(a, b);
+}
+
+static inline uniform double min(uniform double a, uniform double b) {
+    return __min_uniform_double(a, b);
+}
+
 static inline float max(float a, float b) {
     return __max_varying_float(a, b);
 }
@@ -491,6 +633,14 @@ static inline uniform float max(uniform float a, uniform float b) {
     return __max_uniform_float(a, b);
 }
 
+static inline double max(double a, double b) {
+    return __max_varying_double(a, b);
+}
+
+static inline uniform double max(uniform double a, uniform double b) {
+    return __max_uniform_double(a, b);
+}
+
 static inline unsigned int min(unsigned int a, unsigned int b) {
     return __min_varying_uint32(a, b);
 }
 
@@ -523,6 +673,38 @@ static inline uniform int max(uniform int a, uniform int b) {
     return __max_uniform_int32(a, b);
 }
 
+static inline unsigned int64 min(unsigned int64 a, unsigned int64 b) {
+    return __min_varying_uint64(a, b);
+}
+
+static inline uniform unsigned int64 min(uniform unsigned int64 a, uniform unsigned int64 b) {
+    return __min_uniform_uint64(a, b);
+}
+
+static inline unsigned int64 max(unsigned int64 a, unsigned int64 b) {
+    return __max_varying_uint64(a, b);
+}
+
+static inline uniform unsigned int64 max(uniform unsigned int64 a, uniform unsigned int64 b) {
+    return __max_uniform_uint64(a, b);
+}
+
+static inline int64 min(int64 a, int64 b) {
+    return __min_varying_int64(a, b);
+}
+
+static inline uniform int64 min(uniform int64 a, uniform int64 b) {
+    return __min_uniform_int64(a, b);
+}
+
+static inline int64 max(int64 a, int64 b) {
+    return __max_varying_int64(a, b);
+}
+
+static inline uniform int64 max(uniform int64 a, uniform int64 b) {
+    return __max_uniform_int64(a, b);
+}
+
 static inline float clamp(float v, float low, float high) {
     return min(max(v, low), high);
 }
@@ -536,7 +718,16 @@ static inline unsigned int clamp(unsigned int v, unsigned int low, unsigned int
 }
 
 static inline uniform unsigned int clamp(uniform unsigned int v, uniform unsigned int low,
-                                         uniform unsigned int high) {
+                                         uniform unsigned int high) {
     return min(max(v, low), high);
 }
 
+static inline unsigned int64 clamp(unsigned int64 v, unsigned int64 low, unsigned int64 high) {
+    return min(max(v, low), high);
+}
+
+static inline uniform unsigned int64 clamp(uniform unsigned int64 v, uniform unsigned int64 low,
+                                           uniform unsigned int64 high) {
+    return min(max(v, low), high);
+}
+
@@ -548,8 +739,24 @@ static inline uniform int clamp(uniform int v, uniform int low, uniform int high
     return min(max(v, low), high);
 }
 
+static inline int64 clamp(int64 v, int64 low, int64 high) {
+    return min(max(v, low), high);
+}
+
+static inline uniform int64 clamp(uniform int64 v, uniform int64 low, uniform int64 high) {
+    return min(max(v, low), high);
+}
+
 ///////////////////////////////////////////////////////////////////////////
-// Transcendentals
+// Transcendentals (float precision)
+
+static inline float sqrt(float v) {
+    return __sqrt_varying_float(v);
+}
+
+static inline uniform float sqrt(uniform float v) {
+    return __sqrt_uniform_float(v);
+}
 
 static inline float rsqrt(float v) {
     return __rsqrt_varying_float(v);
@@ -612,7 +819,7 @@ static inline float sin(float x_full) {
         for (uniform int i = 0; i < programCount; ++i) {
             if ((mask & (1 << i)) == 0)
                 continue;
-            uniform float r = __stdlib_sin(extract(x_full, i));
+            uniform float r = __stdlib_sinf(extract(x_full, i));
             ret = insert(ret, i, r);
         }
         return ret;
@@ -669,7 +876,7 @@ static inline float sin(float x_full) {
 
 static inline uniform float sin(uniform float x_full) {
     if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
-        return __stdlib_sin(x_full);
+        return __stdlib_sinf(x_full);
     }
     else if (__math_lib == __math_lib_ispc ||
              __math_lib == __math_lib_ispc_fast) {
@@ -741,7 +948,7 @@ static inline float cos(float x_full) {
         for (uniform int i = 0; i < programCount; ++i) {
             if ((mask & (1 << i)) == 0)
                 continue;
-            uniform float r = __stdlib_cos(extract(x_full, i));
+            uniform float r = __stdlib_cosf(extract(x_full, i));
             ret = insert(ret, i, r);
         }
         return ret;
@@ -797,7 +1004,7 @@ static inline float cos(float x_full) {
 
 static inline uniform float cos(uniform float x_full) {
     if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
-        return __stdlib_cos(x_full);
+        return __stdlib_cosf(x_full);
     }
     else if (__math_lib == __math_lib_ispc ||
              __math_lib == __math_lib_ispc_fast) {
@@ -868,7 +1075,7 @@ static inline void sincos(float x_full, reference float sin_result, reference fl
             if ((mask & (1 << i)) == 0)
                 continue;
             uniform float s, c;
-            __stdlib_sincos(extract(x_full, i), s, c);
+            __stdlib_sincosf(extract(x_full, i), s, c);
             sin_result = insert(sin_result, i, s);
             cos_result = insert(cos_result, i, c);
         }
@@ -930,10 +1137,10 @@ static inline void sincos(float x_full, reference float sin_result, reference fl
 
 static inline void sincos(uniform float x_full,
                           reference uniform float sin_result,
-                          reference uniform float cos_result) {
+                          reference uniform float cos_result) {
     if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
-        __stdlib_sincos(x_full, sin_result, cos_result);
+        __stdlib_sincosf(x_full, sin_result, cos_result);
     }
     else if (__math_lib == __math_lib_ispc ||
              __math_lib == __math_lib_ispc_fast) {
@@ -1001,7 +1208,7 @@ static inline float tan(float x_full) {
         for (uniform int i = 0; i < programCount; ++i) {
             if ((mask & (1 << i)) == 0)
                 continue;
-            uniform float r = __stdlib_tan(extract(x_full, i));
+            uniform float r = __stdlib_tanf(extract(x_full, i));
             ret = insert(ret, i, r);
         }
         return ret;
@@ -1075,7 +1282,7 @@ static inline float tan(float x_full) {
 
 static inline uniform float tan(uniform float x_full) {
     if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
-        return __stdlib_tan(x_full);
+        return __stdlib_tanf(x_full);
     }
     else if (__math_lib == __math_lib_ispc ||
             __math_lib == __math_lib_ispc_fast) {
@@ -1153,7 +1360,7 @@ static inline float atan(float x_full) {
        for (uniform int i = 0; i < programCount; ++i) {
            if ((mask & (1 << i)) == 0)
                continue;
-           uniform float r = __stdlib_atan(extract(x_full, i));
+           uniform float r = __stdlib_atanf(extract(x_full, i));
            ret = insert(ret, i, r);
        }
        return ret;
@@ -1199,7 +1406,7 @@ static inline float atan(float x_full) {
 
 static inline uniform float atan(uniform float x_full) {
     if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
-        return __stdlib_atan(x_full);
+        return __stdlib_atanf(x_full);
     }
     else if (__math_lib == __math_lib_ispc ||
              __math_lib == __math_lib_ispc_fast) {
@@ -1249,7 +1456,7 @@ static inline float atan2(float y, float x) {
         for (uniform int i = 0; i < programCount; ++i) {
             if ((mask & (1 << i)) == 0)
                 continue;
-            uniform float r = __stdlib_atan2(extract(y, i), extract(x, i));
+            uniform float r = __stdlib_atan2f(extract(y, i), extract(x, i));
             ret = insert(ret, i, r);
         }
         return ret;
@@ -1283,7 +1490,7 @@ static inline float atan2(float y, float x) {
 
 static inline uniform float atan2(uniform float y, uniform float x) {
     if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
-        return __stdlib_atan2(y, x);
+        return __stdlib_atan2f(y, x);
     }
     else if (__math_lib == __math_lib_ispc ||
              __math_lib == __math_lib_ispc_fast) {
@@ -1310,7 +1517,7 @@ static inline float exp(float x_full) {
         for (uniform int i = 0; i < programCount; ++i) {
             if ((mask & (1 << i)) == 0)
                 continue;
-            uniform float r = __stdlib_exp(extract(x_full, i));
+            uniform float r = __stdlib_expf(extract(x_full, i));
             ret = insert(ret, i, r);
         }
         return ret;
@@ -1384,7 +1591,7 @@ static inline float exp(float x_full) {
 
 static inline uniform float exp(uniform float x_full) {
     if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
-        return __stdlib_exp(x_full);
+        return __stdlib_expf(x_full);
     }
     else if (__math_lib == __math_lib_ispc_fast) {
         uniform float z = floor(1.44269504088896341f * x_full + 0.5f);
@@ -1485,7 +1692,7 @@ static inline void __range_reduce_log(float input, reference float reduced, refe
 
 static inline void __range_reduce_log(uniform float input,
                                       reference uniform float reduced,
-                                      reference uniform int exponent) {
+                                      reference uniform int exponent) {
     uniform int int_version = intbits(input);
     static const uniform int nonexponent_mask = 0x807FFFFF;
@@ -1509,7 +1716,7 @@ static inline float log(float x_full) {
         for (uniform int i = 0; i < programCount; ++i) {
             if ((mask & (1 << i)) == 0)
                 continue;
-            uniform float r = __stdlib_log(extract(x_full, i));
+            uniform float r = __stdlib_logf(extract(x_full, i));
             ret = insert(ret, i, r);
         }
         return ret;
@@ -1592,7 +1799,7 @@ static inline float log(float x_full) {
 
 static inline uniform float log(uniform float x_full) {
     if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
-        return __stdlib_log(x_full);
+        return __stdlib_logf(x_full);
     }
     else if (__math_lib == __math_lib_ispc_fast) {
         uniform int e;
@@ -1679,7 +1886,7 @@ static inline float pow(float a, float b) {
         for (uniform int i = 0; i < programCount; ++i) {
             if ((mask & (1 << i)) == 0)
                 continue;
-            uniform float r = __stdlib_pow(extract(a, i), extract(b, i));
+            uniform float r = __stdlib_powf(extract(a, i), extract(b, i));
             ret = insert(ret, i, r);
         }
         return ret;
@@ -1693,7 +1900,7 @@ static inline float pow(float a, float b) {
 
 static inline uniform float pow(uniform float a, uniform float b) {
     if (__math_lib == __math_lib_system ||
         __math_lib == __math_lib_svml) {
-        return __stdlib_pow(a, b);
+        return __stdlib_powf(a, b);
     }
     else if (__math_lib == __math_lib_ispc ||
              __math_lib == __math_lib_ispc_fast) {
@@ -1701,6 +1908,273 @@ static inline uniform float pow(uniform float a, uniform float b) {
     }
 }
 
+///////////////////////////////////////////////////////////////////////////
+// Transcendentals (double precision)
+
+static inline double sqrt(double v) {
+    return __sqrt_varying_double(v);
+}
+
+static inline uniform double sqrt(uniform double v) {
+    return __sqrt_uniform_double(v);
+}
+
+static inline double ldexp(double x, int n) {
+    unsigned int64 ex = 0x7ff0000000000000;
+    unsigned int64 ix = intbits(x);
+    ex &= ix;
+    ix = ix & ~0x7ff0000000000000;      // clear exponent
+    int64 n64 = ((int64)n << 52) + ex;
+    ix |= n64;                          // insert new exponent
+    return doublebits(ix);
+}
+
+static inline uniform double ldexp(uniform double x, uniform int n) {
+    uniform unsigned int64 ex = 0x7ff0000000000000;
+    uniform unsigned int64 ix = intbits(x);
+    ex &= ix;
+    ix = ix & ~0x7ff0000000000000;      // clear exponent
+    uniform int64 n64 = ((int64)n << 52) + ex;
+    ix |= n64;                          // insert new exponent
+    return doublebits(ix);
+}
+
+static inline double frexp(double x, reference int pw2) {
+    unsigned int64 ex = 0x7ff0000000000000;   // exponent mask
+    unsigned int64 ix = intbits(x);
+    ex &= ix;
+    ix &= ~0x7ff0000000000000;          // clear exponent
+    pw2 = (int)(ex >> 52) - 1022;       // compute exponent
+    ix |= 0x3fe0000000000000;           // insert exponent +1 in x
+    return doublebits(ix);
+}
+
+static inline uniform double frexp(uniform double x, reference uniform int pw2) {
+    uniform unsigned int64 ex = 0x7ff0000000000000;   // exponent mask
+    uniform unsigned int64 ix = intbits(x);
+    ex &= ix;
+    ix &= ~0x7ff0000000000000;          // clear exponent
+    pw2 = (int)(ex >> 52) - 1022;       // compute exponent
+    ix |= 0x3fe0000000000000;           // insert exponent +1 in x
+    return doublebits(ix);
+}
+
+static inline double sin(double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return sin((float)x);
+    else {
+        double ret;
+        uniform int mask = lanemask();
+        for (uniform int i = 0; i < programCount; ++i) {
+            if ((mask & (1 << i)) == 0)
+                continue;
+            uniform double r = __stdlib_sin(extract(x, i));
+            ret = insert(ret, i, r);
+        }
+        return ret;
+    }
+}
+
+static inline uniform double sin(uniform double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return sin((float)x);
+    else
+        return __stdlib_sin(x);
+}
+
+static inline double cos(double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return cos((float)x);
+    else {
+        double ret;
+        uniform int mask = lanemask();
+        for (uniform int i = 0; i < programCount; ++i) {
+            if ((mask & (1 << i)) == 0)
+                continue;
+            uniform double r = __stdlib_cos(extract(x, i));
+            ret = insert(ret, i, r);
+        }
+        return ret;
+    }
+}
+
+static inline uniform double cos(uniform double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return cos((float)x);
+    else
+        return __stdlib_cos(x);
+}
+
+static inline void sincos(double x, reference double sin_result,
+                          reference double cos_result) {
+    if (__math_lib == __math_lib_ispc_fast) {
+        float sr, cr;
+        sincos((float)x, sr, cr);
+        sin_result = sr;
+        cos_result = cr;
+    }
+    else {
+        uniform int mask = lanemask();
+        for (uniform int i = 0; i < programCount; ++i) {
+            uniform double sr, cr;
+            if ((mask & (1 << i)) == 0)
+                continue;
+            __stdlib_sincos(extract(x, i), sr, cr);
+            sin_result = insert(sin_result, i, sr);
+            cos_result = insert(cos_result, i, cr);
+        }
+    }
+}
+
+static inline void sincos(uniform double x, reference uniform double sin_result,
+                          reference uniform double cos_result) {
+    if (__math_lib == __math_lib_ispc_fast) {
+        uniform float sr, cr;
+        sincos((uniform float)x, sr, cr);
+        sin_result = sr;
+        cos_result = cr;
+    }
+    else
+        __stdlib_sincos(x, sin_result, cos_result);
+}
+
+static inline double tan(double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return tan((float)x);
+    else {
+        double ret;
+        uniform int mask = lanemask();
+        for (uniform int i = 0; i < programCount; ++i) {
+            if ((mask & (1 << i)) == 0)
+                continue;
+            uniform double r = __stdlib_tan(extract(x, i));
+            ret = insert(ret, i, r);
+        }
+        return ret;
+    }
+}
+
+static inline uniform double tan(uniform double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return tan((float)x);
+    else
+        return __stdlib_tan(x);
+}
+
+static inline double atan(double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return atan((float)x);
+    else {
+        double ret;
+        uniform int mask = lanemask();
+        for (uniform int i = 0; i < programCount; ++i) {
+            if ((mask & (1 << i)) == 0)
+                continue;
+            uniform double r = __stdlib_atan(extract(x, i));
+            ret = insert(ret, i, r);
+        }
+        return ret;
+    }
+}
+
+static inline uniform double atan(uniform double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return atan((float)x);
+    else
+        return __stdlib_atan(x);
+}
+
+static inline double atan2(double y, double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return atan2((float)y, (float)x);
+    else {
+        double ret;
+        uniform int mask = lanemask();
+        for (uniform int i = 0; i < programCount; ++i) {
+            if ((mask & (1 << i)) == 0)
+                continue;
+            uniform double r = __stdlib_atan2(extract(y, i), extract(x, i));
+            ret = insert(ret, i, r);
+        }
+        return ret;
+    }
+}
+
+static inline uniform double atan2(uniform double y, uniform double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return atan2((float)y, (float)x);
+    else
+        return __stdlib_atan2(y, x);
+}
+
+static inline double exp(double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return exp((float)x);
+    else {
+        double ret;
+        uniform int mask = lanemask();
+        for (uniform int i = 0; i < programCount; ++i) {
+            if ((mask & (1 << i)) == 0)
+                continue;
+            uniform double r = __stdlib_exp(extract(x, i));
+            ret = insert(ret, i, r);
+        }
+        return ret;
+    }
+}
+
+static inline uniform double exp(uniform double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return exp((float)x);
+    else
+        return __stdlib_exp(x);
+}
+
+static inline double log(double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return log((float)x);
+    else {
+        double ret;
+        uniform int mask = lanemask();
+        for (uniform int i = 0; i < programCount; ++i) {
+            if ((mask & (1 << i)) == 0)
+                continue;
+            uniform double r = __stdlib_log(extract(x, i));
+            ret = insert(ret, i, r);
+        }
+        return ret;
+    }
+}
+
+static inline uniform double log(uniform double x) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return log((float)x);
+    else
+        return __stdlib_log(x);
+}
+
+static inline double pow(double a, double b) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return pow((float)a, (float)b);
+    else {
+        double ret;
+        uniform int mask = lanemask();
+        for (uniform int i = 0; i < programCount; ++i) {
+            if ((mask & (1 << i)) == 0)
+                continue;
+            uniform double r = __stdlib_pow(extract(a, i), extract(b, i));
+            ret = insert(ret, i, r);
+        }
+        return ret;
+    }
+}
+
+static inline uniform double pow(uniform double a, uniform double b) {
+    if (__math_lib == __math_lib_ispc_fast)
+        return pow((float)a, (float)b);
+    else
+        return __stdlib_pow(a, b);
+}
 
 ///////////////////////////////////////////////////////////////////////////
 // RNG stuff
@@ -1709,7 +2183,7 @@ struct RNGState {
     unsigned int z1, z2, z3, z4;
 };
 
-static inline unsigned int random(reference uniform RNGState state)
+static inline unsigned int random(reference RNGState state)
 {
     unsigned int b;
@@ -1724,14 +2198,14 @@ static inline unsigned int random(reference uniform RNGState state)
     return (state.z1 ^ state.z2 ^ state.z3 ^ state.z4);
 }
 
-static inline float frandom(reference uniform RNGState state)
+static inline float frandom(reference RNGState state)
 {
     return ((int)(random(state) & ((1<<24)-1))) / (float)(1 << 24);
 }
 
-static inline uniform unsigned int __seed4(reference uniform RNGState state,
-                                           uniform int start,
-                                           uniform unsigned int seed) {
+static inline uniform unsigned int __seed4(reference RNGState state,
+                                           uniform int start,
+                                           uniform unsigned int seed) {
     uniform unsigned int c1 = 0xf0f0f0f0;
     uniform unsigned int c2 = 0x0f0f0f0f;
diff --git a/stdlib.m4 b/stdlib.m4
index 385d19e0..6b781b17 100644
--- a/stdlib.m4
+++ b/stdlib.m4
@@ -136,6 +136,26 @@ define(`reduce8by4', `
 )
 
+
+;; Apply a unary function to the 4-vector in %0, return the vector result.
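+;; (used by stdlib-sse2.ll, e.g. unary1to4(double, @floor), to scalarize
+;; the double-precision round/floor/ceil calls through libm)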
+;; $1: scalar type of result
+;; $2: name of scalar function to call
+
+define(`unary1to4', `
+  %v_0 = extractelement <4 x $1> %0, i32 0
+  %r_0 = call $1 $2($1 %v_0)
+  %ret_0 = insertelement <4 x $1> undef, $1 %r_0, i32 0
+  %v_1 = extractelement <4 x $1> %0, i32 1
+  %r_1 = call $1 $2($1 %v_1)
+  %ret_1 = insertelement <4 x $1> %ret_0, $1 %r_1, i32 1
+  %v_2 = extractelement <4 x $1> %0, i32 2
+  %r_2 = call $1 $2($1 %v_2)
+  %ret_2 = insertelement <4 x $1> %ret_1, $1 %r_2, i32 2
+  %v_3 = extractelement <4 x $1> %0, i32 3
+  %r_3 = call $1 $2($1 %v_3)
+  %ret_3 = insertelement <4 x $1> %ret_2, $1 %r_3, i32 3
+  ret <4 x $1> %ret_3
+')
+
 ;; Given a unary function that takes a 2-wide vector and a 4-wide vector
 ;; that we'd like to apply it to, extract 2 2-wide vectors from the 4-wide
 ;; vector, apply it, and return the corresponding 4-wide vector result
@@ -286,6 +306,49 @@
 ret <8 x float> %ret
 '
 )
 
+define(`round4to8double', `
+%v0 = shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+%v1 = shufflevector <8 x double> $1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+%r0 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v0, i32 $2)
+%r1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %v1, i32 $2)
+%ret = shufflevector <4 x double> %r0, <4 x double> %r1,
+       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ret <8 x double> %ret
+'
+)
+
+; and similarly, building the wider double rounds out of 2-wide SSE4.1 roundpd...
+
+define(`round2to4double', `
+%v0 = shufflevector <4 x double> $1, <4 x double> undef, <2 x i32> <i32 0, i32 1>
+%v1 = shufflevector <4 x double> $1, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+%r0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v0, i32 $2)
+%r1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v1, i32 $2)
+%ret = shufflevector <2 x double> %r0, <2 x double> %r1,
+       <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ret <4 x double> %ret
+'
+)
+
+define(`round2to8double', `
+%v0 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+%v1 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> <i32 2, i32 3>
+%v2 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> <i32 4, i32 5>
+%v3 = shufflevector <8 x double> $1, <8 x double> undef, <2 x i32> <i32 6, i32 7>
+%r0 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v0, i32 $2)
+%r1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v1, i32 $2)
+%r2 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v2, i32 $2)
+%r3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %v3, i32 $2)
+%ret0 = shufflevector <2 x double> %r0, <2 x double> %r1,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+%ret1 = shufflevector <2 x double> %r2, <2 x double> %r3,
+        <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+%ret = shufflevector <4 x double> %ret0, <4 x double> %ret1,
+       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ret <8 x double> %ret
+'
+)
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; forloop macro
 
@@ -503,15 +566,26 @@ declare i1 @__is_compile_time_constant_varying_int32(<$1 x i32>)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; vector ops
 
-define internal float @__extract(<$1 x float>, i32) nounwind readnone alwaysinline {
-  %extract = extractelement <$1 x float> %0, i32 %1
-  ret float %extract
+define internal i32 @__extract_int32(<$1 x i32>, i32) nounwind readnone alwaysinline {
+  %extract = extractelement <$1 x i32> %0, i32 %1
+  ret i32 %extract
 }
 
-define internal <$1 x float> @__insert(<$1 x float>, i32,
-                                       float) nounwind readnone alwaysinline {
-  %insert = insertelement <$1 x float> %0, float %2, i32 %1
-  ret <$1 x float> %insert
+define internal <$1 x i32> @__insert_int32(<$1 x i32>, i32,
+                                           i32) nounwind readnone alwaysinline {
+  %insert = insertelement <$1 x i32> %0, i32 %2, i32 %1
+
+define internal i64 @__extract_int64(<$1 x i64>, i32) nounwind readnone alwaysinline {
+  %extract = extractelement <$1 x i64> %0, i32 %1
+  ret i64 %extract
+}
+
+define internal <$1 x i64> @__insert_int64(<$1 x i64>, i32,
+                                           i64) nounwind readnone alwaysinline {
+  %insert = insertelement <$1 x i64> %0, i64 %2, i32 %1
+  ret <$1 x i64> %insert
 }
 
 shuffles($1, float, float, 4)
@@ -588,51 +662,106 @@
 declare float @expf(float) nounwind readnone
 declare float @logf(float) nounwind readnone
 declare float @powf(float, float) nounwind readnone
 
-define internal float @__stdlib_sin(float) nounwind readnone alwaysinline {
+define internal float @__stdlib_sinf(float) nounwind readnone alwaysinline {
   %r = call float @sinf(float %0)
   ret float %r
 }
 
-define internal float @__stdlib_cos(float) nounwind readnone alwaysinline {
+define internal float @__stdlib_cosf(float) nounwind readnone alwaysinline {
   %r = call float @cosf(float %0)
   ret float %r
 }
 
-define internal void @__stdlib_sincos(float, float *, float *) nounwind readnone alwaysinline {
+define internal void @__stdlib_sincosf(float, float *, float *) nounwind alwaysinline {
   call void @sincosf(float %0, float *%1, float *%2)
   ret void
 }
 
-define internal float @__stdlib_tan(float) nounwind readnone alwaysinline {
+define internal float @__stdlib_tanf(float) nounwind readnone alwaysinline {
   %r = call float @tanf(float %0)
   ret float %r
 }
 
-define internal float @__stdlib_atan(float) nounwind readnone alwaysinline {
+define internal float @__stdlib_atanf(float) nounwind readnone alwaysinline {
   %r = call float @atanf(float %0)
   ret float %r
 }
 
-define internal float @__stdlib_atan2(float, float) nounwind readnone alwaysinline {
+define internal float @__stdlib_atan2f(float, float) nounwind readnone alwaysinline {
   %r = call float @atan2f(float %0, float %1)
   ret float %r
 }
 
-define internal float @__stdlib_log(float) nounwind readnone alwaysinline {
+define internal float @__stdlib_logf(float) nounwind readnone alwaysinline {
   %r = call float @logf(float %0)
   ret float %r
 }
 
-define internal float @__stdlib_exp(float) nounwind readnone alwaysinline {
+define internal float @__stdlib_expf(float) nounwind readnone alwaysinline {
   %r = call float @expf(float %0)
   ret float %r
 }
 
-define internal float @__stdlib_pow(float, float) nounwind readnone alwaysinline {
+define internal float @__stdlib_powf(float, float) nounwind readnone alwaysinline {
   %r = call float @powf(float %0, float %1)
   ret float %r
 }
 
+declare double @sin(double) nounwind readnone
+declare double @cos(double) nounwind readnone
+declare void @sincos(double, double *, double *) nounwind
+declare double @tan(double) nounwind readnone
+declare double @atan(double) nounwind readnone
+declare double @atan2(double, double) nounwind readnone
+declare double @exp(double) nounwind readnone
+declare double @log(double) nounwind readnone
+declare double @pow(double, double) nounwind readnone
+
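+;; Double-precision versions of the libm wrappers; these mirror the float
+;; wrappers above (now suffixed with "f"), each forwarding directly to the
+;; corresponding libm entry point.
+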
+define internal double @__stdlib_sin(double) nounwind readnone alwaysinline {
+  %r = call double @sin(double %0)
+  ret double %r
+}
+
+define internal double @__stdlib_cos(double) nounwind readnone alwaysinline {
+  %r = call double @cos(double %0)
+  ret double %r
+}
+
+define internal void @__stdlib_sincos(double, double *, double *) nounwind alwaysinline {
+  call void @sincos(double %0, double *%1, double *%2)
+  ret void
+}
+
+define internal double @__stdlib_tan(double) nounwind readnone alwaysinline {
+  %r = call double @tan(double %0)
+  ret double %r
+}
+
+define internal double @__stdlib_atan(double) nounwind readnone alwaysinline {
+  %r = call double @atan(double %0)
+  ret double %r
+}
+
+define internal double @__stdlib_atan2(double, double) nounwind readnone alwaysinline {
+  %r = call double @atan2(double %0, double %1)
+  ret double %r
+}
+
+define internal double @__stdlib_log(double) nounwind readnone alwaysinline {
+  %r = call double @log(double %0)
+  ret double %r
+}
+
+define internal double @__stdlib_exp(double) nounwind readnone alwaysinline {
+  %r = call double @exp(double %0)
+  ret double %r
+}
+
+define internal double @__stdlib_pow(double, double) nounwind readnone alwaysinline {
+  %r = call double @pow(double %0, double %1)
+  ret double %r
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; atomics and memory barriers
 
@@ -676,6 +805,52 @@
 global_atomic_exchange($1, i64, int64)
 ')
 
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; 64-bit integer min and max functions
+
+;; utility macro used by int64minmax below.  This shouldn't be called by
+;; target .ll files directly.
+;; $1: target vector width
+;; $2: {min,max} (used in constructing function names)
+;; $3: {int64,uint64} (used in constructing function names)
+;; $4: {slt,sgt,ult,ugt} comparison operator to use
+
+define(`i64minmax', `
+define internal i64 @__$2_uniform_$3(i64, i64) nounwind alwaysinline readnone {
+  %c = icmp $4 i64 %0, %1
+  %r = select i1 %c, i64 %0, i64 %1
+  ret i64 %r
+}
+
+define internal <$1 x i64> @__$2_varying_$3(<$1 x i64>, <$1 x i64>) nounwind alwaysinline readnone {
+  %rptr = alloca <$1 x i64>
+  %r64ptr = bitcast <$1 x i64> * %rptr to i64 *
+
+  forloop(i, 0, eval($1-1), `
+  %v0_`'i = extractelement <$1 x i64> %0, i32 i
+  %v1_`'i = extractelement <$1 x i64> %1, i32 i
+  %c_`'i = icmp $4 i64 %v0_`'i, %v1_`'i
+  %v_`'i = select i1 %c_`'i, i64 %v0_`'i, i64 %v1_`'i
+  %ptr_`'i = getelementptr i64 * %r64ptr, i32 i
+  store i64 %v_`'i, i64 * %ptr_`'i
+')
+
+  %ret = load <$1 x i64> * %rptr
+  ret <$1 x i64> %ret
+}
+')
+
+;; this is the macro that target .ll files should call; it just takes the
+;; target vector width as a parameter
+
+define(`int64minmax', `
+i64minmax($1,min,int64,slt)
+i64minmax($1,max,int64,sgt)
+i64minmax($1,min,uint64,ult)
+i64minmax($1,max,uint64,ugt)
+')
+
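+;; For example, an 8-wide target invokes int64minmax(8), which expands into
+;; definitions of __{min,max}_{uniform,varying}_{int64,uint64} at that
+;; vector width.
+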
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Definitions of 8 and 16-bit load and store functions
 ;;
diff --git a/tests/double-abs-1.ispc b/tests/double-abs-1.ispc
new file mode 100644
index 00000000..f7a2ff28
--- /dev/null
+++ b/tests/double-abs-1.ispc
@@ -0,0 +1,11 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    double a = aFOO[programIndex];
+    RET[programIndex] = abs(-a);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/double-abs.ispc b/tests/double-abs.ispc
new file mode 100644
index 00000000..1ea03678
--- /dev/null
+++ b/tests/double-abs.ispc
@@ -0,0 +1,11 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    double a = aFOO[programIndex];
+    RET[programIndex] = abs(a);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/double-max-1.ispc b/tests/double-max-1.ispc
new file mode 100644
index 00000000..74ebce2c
--- /dev/null
+++ b/tests/double-max-1.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    double a = aFOO[programIndex];
+    double b = -2. * a;
+    RET[programIndex] = max(a,b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 * (1 + programIndex);
+}
diff --git a/tests/double-max.ispc b/tests/double-max.ispc
new file mode 100644
index 00000000..9f083fa7
--- /dev/null
+++ b/tests/double-max.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    double a = aFOO[programIndex];
+    double b = 2. * a;
+    RET[programIndex] = max(a,b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 2 * (1 + programIndex);
+}
diff --git a/tests/double-min-1.ispc b/tests/double-min-1.ispc
new file mode 100644
index 00000000..ffe373cb
--- /dev/null
+++ b/tests/double-min-1.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    double a = aFOO[programIndex];
+    double b = -2. * a;
+    RET[programIndex] = min(a,b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = -2 * (1 + programIndex);
+}
diff --git a/tests/double-min.ispc b/tests/double-min.ispc
new file mode 100644
index 00000000..664d410d
--- /dev/null
+++ b/tests/double-min.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    double a = aFOO[programIndex];
+    double b = 2. * a;
+    RET[programIndex] = min(a,b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/double-sqrt.ispc b/tests/double-sqrt.ispc
new file mode 100644
index 00000000..41cfa8b6
--- /dev/null
+++ b/tests/double-sqrt.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    double a = aFOO[programIndex];
+    if (programIndex & 1) {
+        a *= a;
+        RET[programIndex] = sqrt(a);
+    }
+    else
+        RET[programIndex] = a;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/extract-1.ispc b/tests/extract-1.ispc
new file mode 100644
index 00000000..220107cb
--- /dev/null
+++ b/tests/extract-1.ispc
@@ -0,0 +1,11 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    double a = programIndex;
+    RET[programIndex] = extract(a, 3);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 3;
+}
diff --git a/tests/frexp-double-1.ispc b/tests/frexp-double-1.ispc
new file mode 100644
index 00000000..96890dd9
--- /dev/null
+++ b/tests/frexp-double-1.ispc
@@ -0,0 +1,16 @@
+
+export uniform int width() { return programCount; }
+
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    double a = (1<
diff --git a/tests/reduce-max-double.ispc b/tests/reduce-max-double.ispc
new file mode 100644
--- /dev/null
+++ b/tests/reduce-max-double.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    double v = aFOO[programIndex];
+    uniform float m;
+    if (v >= 3)
+        m = reduce_max(-v);
+    RET[programIndex] = m;
+}
+
+export void result(uniform float RET[]) { RET[programIndex] = -3; }
diff --git a/tests/reduce-max-int64.ispc b/tests/reduce-max-int64.ispc
new file mode 100644
index 00000000..08641a9a
--- /dev/null
+++ b/tests/reduce-max-int64.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float v = aFOO[programIndex];
+    uniform float m;
+    if (v >= 3)
+        m = reduce_max(-(int64)v);
+    RET[programIndex] = m;
+}
+
+export void result(uniform float RET[]) { RET[programIndex] = -3; }
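+// (the other tests show aFOO[] holding 1 + programIndex, so the lanes with
+// v >= 3 see -(int64)v values of -3, -4, ..., whose maximum is -3)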
diff --git a/tests/reduce-max-uint64.ispc b/tests/reduce-max-uint64.ispc
new file mode 100644
index 00000000..ce5e52d4
--- /dev/null
+++ b/tests/reduce-max-uint64.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float v = aFOO[programIndex];
+    uniform float m;
+    if (v < 3)
+        m = reduce_max((unsigned int64)v);
+    RET[programIndex] = m;
+}
+
+export void result(uniform float RET[]) { RET[programIndex] = 2; }
diff --git a/tests/reduce-min-double.ispc b/tests/reduce-min-double.ispc
new file mode 100644
index 00000000..866aa57e
--- /dev/null
+++ b/tests/reduce-min-double.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    double v = aFOO[programIndex];
+    uniform float m;
+    if (v > 0 && v < 3)
+        m = reduce_min(-v);
+    RET[programIndex] = m;
+}
+
+export void result(uniform float RET[]) { RET[programIndex] = -2; }
diff --git a/tests/reduce-min-int.ispc b/tests/reduce-min-int.ispc
index 4f7bf0b8..b943f323 100644
--- a/tests/reduce-min-int.ispc
+++ b/tests/reduce-min-int.ispc
@@ -7,7 +7,7 @@ export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
     float v = aFOO[programIndex];
     uniform float m;
     if (v >= 0 && v < 2)
-        m = reduce_min(-v);
+        m = reduce_min(-(int)v);
     RET[programIndex] = m;
 }
diff --git a/tests/reduce-min-int64.ispc b/tests/reduce-min-int64.ispc
new file mode 100644
index 00000000..0fa74e61
--- /dev/null
+++ b/tests/reduce-min-int64.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float v = aFOO[programIndex];
+    uniform float m;
+    if (v >= 0 && v < 2)
+        m = reduce_min(-(int64)v);
+    RET[programIndex] = m;
+}
+
+export void result(uniform float RET[]) { RET[programIndex] = -1; }
diff --git a/tests/reduce-min-uint64.ispc b/tests/reduce-min-uint64.ispc
new file mode 100644
index 00000000..2290dfd6
--- /dev/null
+++ b/tests/reduce-min-uint64.ispc
@@ -0,0 +1,14 @@
+
+export uniform int width() { return programCount; }
+
+
+
+export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
+    float v = aFOO[programIndex];
+    uniform float m;
+    if (v >= 3)
+        m = reduce_min((unsigned int64)v);
+    RET[programIndex] = m;
+}
+
+export void result(uniform float RET[]) { RET[programIndex] = 3; }
diff --git a/tests/uint64-max-1.ispc b/tests/uint64-max-1.ispc
new file mode 100644
index 00000000..086fc5ff
--- /dev/null
+++ b/tests/uint64-max-1.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    unsigned int64 a = aFOO[programIndex];
+    unsigned int64 b = 2;
+    RET[programIndex] = max(a,b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = max(2, 1 + programIndex);
+}
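+// (aFOO[] holds small positive integers, so the float-to-unsigned-int64
+// conversion of a above is exact)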
diff --git a/tests/uint64-max.ispc b/tests/uint64-max.ispc
new file mode 100644
index 00000000..c765ef7f
--- /dev/null
+++ b/tests/uint64-max.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    unsigned int64 a = aFOO[programIndex];
+    unsigned int64 b = 2. * a;
+    RET[programIndex] = max(a,b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 2 * (1 + programIndex);
+}
diff --git a/tests/uint64-min-1.ispc b/tests/uint64-min-1.ispc
new file mode 100644
index 00000000..750098d2
--- /dev/null
+++ b/tests/uint64-min-1.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    unsigned int64 a = aFOO[programIndex];
+    unsigned int64 b = 2 * a;
+    RET[programIndex] = min(a,b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = (1 + programIndex);
+}
diff --git a/tests/uint64-min.ispc b/tests/uint64-min.ispc
new file mode 100644
index 00000000..254846eb
--- /dev/null
+++ b/tests/uint64-min.ispc
@@ -0,0 +1,12 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    unsigned int64 a = aFOO[programIndex];
+    unsigned int64 b = 2. * a;
+    RET[programIndex] = min(a,b);
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
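+// (b = 2. * a is computed in double precision and truncated back to
+// unsigned int64; since a >= 1 here, min(a, 2*a) is exactly a)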