target-[sse4|avx]_common.ll are twin brothers, which diffes only cosmetically. This commit makes them diffable. No real changes, except adding alwaysinline to sse version iof __max_uniform_int32/__max_uniform_uint32

This commit is contained in:
Dmitry Babokin
2013-11-12 10:00:42 +04:00
parent ffc9a33933
commit af58955140
2 changed files with 80 additions and 75 deletions

View File

@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation ;; Copyright (c) 2010-2013, Intel Corporation
;; All rights reserved. ;; All rights reserved.
;; ;;
;; Redistribution and use in source and binary forms, with or without ;; Redistribution and use in source and binary forms, with or without
@@ -37,24 +37,6 @@ define_prefetches()
define_shuffles() define_shuffles()
aossoa() aossoa()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
; uniform float iv = extract(__rcp_u(v), 0);
; return iv * (2. - v * iv);
%vecval = insertelement <4 x float> undef, float %0, i32 0
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
%scall = extractelement <4 x float> %call, i32 0
; do one N-R iteration
%v_iv = fmul float %0, %scall
%two_minus = fsub float 2., %v_iv
%iv_mul = fmul float %scall, %two_minus
ret float %iv_mul
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats ;; rounding floats
@@ -77,7 +59,8 @@ define float @__round_uniform_float(float) nounwind readonly alwaysinline {
; r3 = a3 ; r3 = a3
; ;
; It doesn't matter what we pass as a, since we only need the r0 value ; It doesn't matter what we pass as a, since we only need the r0 value
; here. So we pass the same register for both. ; here. So we pass the same register for both. Further, only the 0th
; element of the b parameter matters
%xi = insertelement <4 x float> undef, float %0, i32 0 %xi = insertelement <4 x float> undef, float %0, i32 0
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8) %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 8)
%rs = extractelement <4 x float> %xr, i32 0 %rs = extractelement <4 x float> %xr, i32 0
@@ -117,7 +100,7 @@ define double @__round_uniform_double(double) nounwind readonly alwaysinline {
define double @__floor_uniform_double(double) nounwind readonly alwaysinline { define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion... ; see above for round_ss instrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0 %xi = insertelement <2 x double> undef, double %0, i32 0
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 ; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9) %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
%rs = extractelement <2 x double> %xr, i32 0 %rs = extractelement <2 x double> %xr, i32 0
ret double %rs ret double %rs
@@ -126,12 +109,31 @@ define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
define double @__ceil_uniform_double(double) nounwind readonly alwaysinline { define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion... ; see above for round_ss instrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0 %xi = insertelement <2 x double> undef, double %0, i32 0
; roundpd, round up 0b10 | don't signal precision exceptions 0b1010 = 10 ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10) %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
%rs = extractelement <2 x double> %xr, i32 0 %rs = extractelement <2 x double> %xr, i32 0
ret double %rs ret double %rs
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp
declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
; do the rcpss call
; uniform float iv = extract(__rcp_u(v), 0);
; return iv * (2. - v * iv);
%vecval = insertelement <4 x float> undef, float %0, i32 0
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
%scall = extractelement <4 x float> %call, i32 0
; do one N-R iteration to improve precision, as above
%v_iv = fmul float %0, %scall
%two_minus = fsub float 2., %v_iv
%iv_mul = fmul float %scall, %two_minus
ret float %iv_mul
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rsqrt ;; rsqrt
@@ -144,6 +146,7 @@ define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
%vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v) %vis = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %v)
%is = extractelement <4 x float> %vis, i32 0 %is = extractelement <4 x float> %vis, i32 0
; Newton-Raphson iteration to improve precision
; return 0.5 * is * (3. - (v * is) * is); ; return 0.5 * is * (3. - (v * is) * is);
%v_is = fmul float %0, %is %v_is = fmul float %0, %is
%v_is_is = fmul float %v_is, %is %v_is_is = fmul float %v_is, %is
@@ -164,9 +167,18 @@ define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
ret float %ret ret float %ret
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fastmath ;; fast math mode
declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind declare void @llvm.x86.sse.stmxcsr(i8 *) nounwind
declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind declare void @llvm.x86.sse.ldmxcsr(i8 *) nounwind
@@ -200,6 +212,22 @@ define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
ret float %ret ret float %ret
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
define double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
ret double %ret
}
define double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max ;; int min/max
@@ -235,7 +263,7 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops ;; horizontal ops / reductions
declare i32 @llvm.ctpop.i32(i32) nounwind readnone declare i32 @llvm.ctpop.i32(i32) nounwind readnone
@@ -251,32 +279,6 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
ret i64 %call ret i64 %call
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
define double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
ret double %ret
}
define double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int8/int16 builtins ;; int8/int16 builtins

View File

@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation ;; Copyright (c) 2010-2013, Intel Corporation
;; All rights reserved. ;; All rights reserved.
;; ;;
;; Redistribution and use in source and binary forms, with or without ;; Redistribution and use in source and binary forms, with or without
@@ -29,6 +29,9 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; SSE4 target implementation.
ctlztz() ctlztz()
define_prefetches() define_prefetches()
define_shuffles() define_shuffles()
@@ -67,7 +70,7 @@ define float @__round_uniform_float(float) nounwind readonly alwaysinline {
define float @__floor_uniform_float(float) nounwind readonly alwaysinline { define float @__floor_uniform_float(float) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion... ; see above for round_ss instrinsic discussion...
%xi = insertelement <4 x float> undef, float %0, i32 0 %xi = insertelement <4 x float> undef, float %0, i32 0
; roundps, round down 0b01 | don't signal precision exceptions 0b1010 = 9 ; roundps, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9) %xr = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %xi, <4 x float> %xi, i32 9)
%rs = extractelement <4 x float> %xr, i32 0 %rs = extractelement <4 x float> %xr, i32 0
ret float %rs ret float %rs
@@ -97,7 +100,7 @@ define double @__round_uniform_double(double) nounwind readonly alwaysinline {
define double @__floor_uniform_double(double) nounwind readonly alwaysinline { define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion... ; see above for round_ss instrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0 %xi = insertelement <2 x double> undef, double %0, i32 0
; roundpd, round down 0b01 | don't signal precision exceptions 0b1001 = 9 ; roundsd, round down 0b01 | don't signal precision exceptions 0b1001 = 9
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9) %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 9)
%rs = extractelement <2 x double> %xr, i32 0 %rs = extractelement <2 x double> %xr, i32 0
ret double %rs ret double %rs
@@ -106,7 +109,7 @@ define double @__floor_uniform_double(double) nounwind readonly alwaysinline {
define double @__ceil_uniform_double(double) nounwind readonly alwaysinline { define double @__ceil_uniform_double(double) nounwind readonly alwaysinline {
; see above for round_ss instrinsic discussion... ; see above for round_ss instrinsic discussion...
%xi = insertelement <2 x double> undef, double %0, i32 0 %xi = insertelement <2 x double> undef, double %0, i32 0
; roundps, round up 0b10 | don't signal precision exceptions 0b1010 = 10 ; roundsd, round up 0b10 | don't signal precision exceptions 0b1010 = 10
%xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10) %xr = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %xi, <2 x double> %xi, i32 10)
%rs = extractelement <2 x double> %xr, i32 0 %rs = extractelement <2 x double> %xr, i32 0
ret double %rs ret double %rs
@@ -119,6 +122,8 @@ declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
define float @__rcp_uniform_float(float) nounwind readonly alwaysinline { define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
; do the rcpss call ; do the rcpss call
; uniform float iv = extract(__rcp_u(v), 0);
; return iv * (2. - v * iv);
%vecval = insertelement <4 x float> undef, float %0, i32 0 %vecval = insertelement <4 x float> undef, float %0, i32 0
%call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval) %call = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %vecval)
%scall = extractelement <4 x float> %call, i32 0 %scall = extractelement <4 x float> %call, i32 0
@@ -130,9 +135,8 @@ define float @__rcp_uniform_float(float) nounwind readonly alwaysinline {
ret float %iv_mul ret float %iv_mul
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; rsqrt ;; rsqrt
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
@@ -154,7 +158,7 @@ define float @__rsqrt_uniform_float(float) nounwind readonly alwaysinline {
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; sqrt ;; sqrt
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
@@ -163,6 +167,16 @@ define float @__sqrt_uniform_float(float) nounwind readonly alwaysinline {
ret float %ret ret float %ret
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fast math mode ;; fast math mode
@@ -198,36 +212,25 @@ define float @__min_uniform_float(float, float) nounwind readonly alwaysinline {
ret float %ret ret float %ret
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision sqrt
declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
define double @__sqrt_uniform_double(double) nounwind alwaysinline {
sse_unary_scalar(ret, 2, double, @llvm.x86.sse2.sqrt.sd, %0)
ret double %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double precision min/max ;; double precision min/max
declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
define double @__min_uniform_double(double, double) nounwind readnone { define double @__min_uniform_double(double, double) nounwind readnone alwaysinline {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1) sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.min.sd, %0, %1)
ret double %ret ret double %ret
} }
define double @__max_uniform_double(double, double) nounwind readnone alwaysinline {
define double @__max_uniform_double(double, double) nounwind readnone {
sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1) sse_binary_scalar(ret, 2, double, @llvm.x86.sse2.max.sd, %0, %1)
ret double %ret ret double %ret
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 min/max ;; int min/max
declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
@@ -242,8 +245,9 @@ define i32 @__max_uniform_int32(i32, i32) nounwind readonly alwaysinline {
ret i32 %ret ret i32 %ret
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; unsigned int min/max ;; unsigned int min/max
declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
@@ -258,9 +262,8 @@ define i32 @__max_uniform_uint32(i32, i32) nounwind readonly alwaysinline {
ret i32 %ret ret i32 %ret
} }
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; horizontal ops / reductions ;; horizontal ops / reductions
declare i32 @llvm.ctpop.i32(i32) nounwind readnone declare i32 @llvm.ctpop.i32(i32) nounwind readnone