Add reduce_add() for int8 and int16 types.

This maps to specialized instructions (e.g. PSADBW) when available.
This commit is contained in:
Matt Pharr
2013-07-25 09:11:39 -07:00
parent 2d063925a1
commit b6df447b55
24 changed files with 464 additions and 44 deletions

View File

@@ -271,6 +271,33 @@ reduce_equal(16)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int32 ops
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline {
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <16 x i16> @__add_varying_i16(<16 x i16>,
<16 x i16>) nounwind readnone alwaysinline {
%r = add <16 x i16> %0, %1
ret <16 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline {
reduce16(i16, @__add_varying_i16, @__add_uniform_i16)
}
define <16 x i32> @__add_varying_int32(<16 x i32>,
<16 x i32>) nounwind readnone alwaysinline {
%s = add <16 x i32> %0, %1

View File

@@ -217,7 +217,6 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
ret float %sum
}
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
reduce8(float, @__min_varying_float, @__min_uniform_float)
}
@@ -229,6 +228,42 @@ define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
reduce_equal(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int8 ops
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int16 ops
define internal <8 x i16> @__add_varying_i16(<8 x i16>,
<8 x i16>) nounwind readnone alwaysinline {
%r = add <8 x i16> %0, %1
ret <8 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int32 ops
@@ -257,20 +292,14 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint32 ops
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal double ops
@@ -329,9 +358,6 @@ define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint64 ops
define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
}

View File

@@ -471,6 +471,15 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
ret i64 %call
}
define i8 @__reduce_add_int8(<1 x i8> %v) nounwind readonly alwaysinline {
%r = extractelement <1 x i8> %v, i32 0
ret i8 %r
}
define i16 @__reduce_add_int16(<1 x i16> %v) nounwind readonly alwaysinline {
%r = extractelement <1 x i16> %v, i32 0
ret i16 %r
}
define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline {
%r = extractelement <1 x float> %v, i32 0

View File

@@ -226,14 +226,16 @@ declare i1 @__any(<WIDTH x i1>) nounwind readnone
declare i1 @__all(<WIDTH x i1>) nounwind readnone
declare i1 @__none(<WIDTH x i1>) nounwind readnone
declare i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone
declare i32 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone
declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone
declare float @__reduce_max_float(<WIDTH x float>) nounwind readnone
declare i32 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone
declare i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone
@@ -244,7 +246,6 @@ declare double @__reduce_max_double(<WIDTH x double>) nounwind readnone
declare i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone

View File

@@ -509,15 +509,38 @@ define float @__reduce_max_float(<4 x float>) nounwind readnone {
neon_reduce(float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32)
}
define internal i32 @add_i32(i32, i32) {
%r = add i32 %0, %1
declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone {
%v8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
%a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %v8)
%a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16)
%a0 = extractelement <2 x i32> %a32, i32 0
%a1 = extractelement <2 x i32> %a32, i32 1
%r = add i32 %a0, %a1
%r16 = trunc i32 %r to i16
ret i16 %r16
}
declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone
define i32 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone {
%a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %0)
%a0 = extractelement <2 x i32> %a32, i32 0
%a1 = extractelement <2 x i32> %a32, i32 1
%r = add i32 %a0, %a1
ret i32 %r
}
declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone
define i32 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone {
neon_reduce(i32, @llvm.arm.neon.vpadd.v2i32, @add_i32)
define i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone {
%a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %0)
%a0 = extractelement <2 x i64> %a64, i32 0
%a1 = extractelement <2 x i64> %a64, i32 1
%r = add i64 %a0, %a1
ret i64 %r
}
declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone

View File

@@ -367,6 +367,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
ret i1 %cmp
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <8 x i16> @__add_varying_i16(<8 x i16>,
<8 x i16>) nounwind readnone alwaysinline {
%r = add <8 x i16> %0, %1
ret <8 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
}
define <4 x float> @__vec4_add_float(<4 x float> %v0,
<4 x float> %v1) nounwind readnone alwaysinline {
%v = fadd <4 x float> %v0, %v1

View File

@@ -267,6 +267,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
ret i1 %cmp
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4,
i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <4 x i16> @__add_varying_i16(<4 x i16>,
<4 x i16>) nounwind readnone alwaysinline {
%r = add <4 x i16> %0, %1
ret <4 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
}
define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
%v1 = shufflevector <4 x float> %v, <4 x float> undef,
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>

View File

@@ -253,6 +253,36 @@ define i1 @__none(<8 x MASK>) nounwind readnone alwaysinline {
ret i1 %meq
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <8 x i16> @__add_varying_i16(<8 x i16>,
<8 x i16>) nounwind readnone alwaysinline {
%r = add <8 x i16> %0, %1
ret <8 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
}
define internal <8 x float> @__add_varying_float(<8 x float>, <8 x float>) {
%r = fadd <8 x float> %0, %1
ret <8 x float> %r

View File

@@ -261,6 +261,33 @@ define i1 @__none(<16 x i8>) nounwind readnone alwaysinline {
ret i1 %meq
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline {
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <16 x i16> @__add_varying_i16(<16 x i16>,
<16 x i16>) nounwind readnone alwaysinline {
%r = add <16 x i16> %0, %1
ret <16 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline {
reduce16(i16, @__add_varying_i16, @__add_uniform_i16)
}
define internal <16 x float> @__add_varying_float(<16 x float>, <16 x float>) {
%r = fadd <16 x float> %0, %1
ret <16 x float> %r

View File

@@ -309,6 +309,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
ret i1 %cmp
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <8 x i16> @__add_varying_i16(<8 x i16>,
<8 x i16>) nounwind readnone alwaysinline {
%r = add <8 x i16> %0, %1
ret <8 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
}
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float)
}

View File

@@ -299,6 +299,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
ret i1 %cmp
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4,
i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <4 x i16> @__add_varying_i16(<4 x i16>,
<4 x i16>) nounwind readnone alwaysinline {
%r = add <4 x i16> %0, %1
ret <4 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
}
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {