Add reduce_add() for int8 and int16 types.

These map to specialized instructions (e.g. PSADBW for the int8 reduction on x86 targets) when available.
This commit is contained in:
Matt Pharr
2013-07-25 09:11:39 -07:00
parent 2d063925a1
commit b6df447b55
24 changed files with 464 additions and 44 deletions
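A note on the PSADBW trick used throughout the x86 builtins below: psadbw computes the sum of absolute differences of unsigned bytes within each 8-byte half of its operands, so running it against an all-zero operand degenerates to a horizontal byte sum. A standalone C++ sketch of the same sequence the IR emits (illustrative only, not part of the commit):

#include <emmintrin.h>  // SSE2
#include <cstdint>

// Horizontal sum of 16 bytes, mirroring __reduce_add_int8: psadbw
// against zero yields two 64-bit partial sums (one per 8-byte half),
// which are added and truncated to 16 bits. Note that psadbw sums
// the bytes as *unsigned*, which is also what the builtin computes.
static int16_t reduce_add_int8_sse2(__m128i v) {
    __m128i sad = _mm_sad_epu8(v, _mm_setzero_si128());
    int64_t lo = _mm_cvtsi128_si64(sad);                     // lanes 0..7
    int64_t hi = _mm_cvtsi128_si64(_mm_srli_si128(sad, 8));  // lanes 8..15
    return (int16_t)(lo + hi);
}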

View File

@@ -501,6 +501,8 @@ lSetInternalFunctions(llvm::Module *module) {
"__rdrand_i64",
"__reduce_add_double",
"__reduce_add_float",
"__reduce_add_int8",
"__reduce_add_int16",
"__reduce_add_int32",
"__reduce_add_int64",
"__reduce_equal_double",

View File

@@ -271,6 +271,33 @@ reduce_equal(16)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int32 ops
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline {
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <16 x i16> @__add_varying_i16(<16 x i16>,
<16 x i16>) nounwind readnone alwaysinline {
%r = add <16 x i16> %0, %1
ret <16 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline {
reduce16(i16, @__add_varying_i16, @__add_uniform_i16)
}
define <16 x i32> @__add_varying_int32(<16 x i32>,
<16 x i32>) nounwind readnone alwaysinline {
%s = add <16 x i32> %0, %1
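The int16 path has no single-instruction reduction, so it goes through the reduce16() m4 macro with the two helpers above; presumably that macro emits a log2-depth pairwise tree over the varying add, finishing with the uniform add. A scalar C++ model of that reduction pattern (the macro itself is defined elsewhere in the builtins and is not shown in this diff):

#include <cstdint>

// Tree reduction over 16 lanes: fold the upper half into the lower
// half (the __add_varying_i16 analog), halving the width each step.
static int16_t reduce16_add_i16(const int16_t v[16]) {
    int16_t tmp[16];
    for (int i = 0; i < 16; ++i)
        tmp[i] = v[i];
    for (int width = 8; width >= 1; width /= 2)   // 16 -> 8 -> 4 -> 2 -> 1
        for (int i = 0; i < width; ++i)
            tmp[i] = (int16_t)(tmp[i] + tmp[i + width]);
    return tmp[0];
}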

View File

@@ -217,7 +217,6 @@ define float @__reduce_add_float(<8 x float>) nounwind readonly alwaysinline {
ret float %sum
}
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
reduce8(float, @__min_varying_float, @__min_uniform_float)
}
@@ -229,6 +228,42 @@ define float @__reduce_max_float(<8 x float>) nounwind readnone alwaysinline {
reduce_equal(8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int8 ops
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int16 ops
define internal <8 x i16> @__add_varying_i16(<8 x i16>,
<8 x i16>) nounwind readnone alwaysinline {
%r = add <8 x i16> %0, %1
ret <8 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal int32 ops
@@ -257,20 +292,14 @@ define i32 @__reduce_max_int32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__max_varying_int32, @__max_uniform_int32)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint32 ops
define i32 @__reduce_min_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__min_varying_uint32, @__min_uniform_uint32)
}
define i32 @__reduce_max_uint32(<8 x i32>) nounwind readnone alwaysinline {
reduce8(i32, @__max_varying_uint32, @__max_uniform_uint32)
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; horizontal double ops
@@ -329,9 +358,6 @@ define i64 @__reduce_max_int64(<8 x i64>) nounwind readnone alwaysinline {
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; horizontal uint64 ops
define i64 @__reduce_min_uint64(<8 x i64>) nounwind readnone alwaysinline {
reduce8(i64, @__min_varying_uint64, @__min_uniform_uint64)
}
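For gangs narrower than 16, the input is first widened to <16 x i8> with a shufflevector whose second operand is zeroinitializer; shuffle indices at or past the input width select lanes of that zero vector, so the padding bytes contribute nothing to the sum and the 16-byte PSADBW path applies unchanged. A C++ analog of the 8-wide case (illustrative only):

#include <emmintrin.h>
#include <cstdint>

// 8-lane variant: place the 8 input bytes in the low half of an xmm
// register with the high half zeroed (the IR does this with the
// shufflevector above), then reuse the 16-byte psadbw path.
static int16_t reduce_add_int8_w8(const uint8_t bytes[8]) {
    __m128i v = _mm_loadl_epi64((const __m128i *)bytes);  // high 8 bytes zero
    __m128i sad = _mm_sad_epu8(v, _mm_setzero_si128());
    int64_t lo = _mm_cvtsi128_si64(sad);
    int64_t hi = _mm_cvtsi128_si64(_mm_srli_si128(sad, 8));  // zeroed half
    return (int16_t)(lo + hi);
}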

View File

@@ -471,6 +471,15 @@ define i64 @__popcnt_int64(i64) nounwind readonly alwaysinline {
ret i64 %call
}
;; zero-extend the single lane so the return types match the declared
;; widened interface (i16 for int8 sums, i32 for int16 sums)
define i16 @__reduce_add_int8(<1 x i8> %v) nounwind readonly alwaysinline {
%r = extractelement <1 x i8> %v, i32 0
%r16 = zext i8 %r to i16
ret i16 %r16
}
define i32 @__reduce_add_int16(<1 x i16> %v) nounwind readonly alwaysinline {
%r = extractelement <1 x i16> %v, i32 0
%r32 = zext i16 %r to i32
ret i32 %r32
}
define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline {
%r = extractelement <1 x float> %v, i32 0

View File

@@ -226,14 +226,16 @@ declare i1 @__any(<WIDTH x i1>) nounwind readnone
declare i1 @__all(<WIDTH x i1>) nounwind readnone
declare i1 @__none(<WIDTH x i1>) nounwind readnone
declare i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone
declare i32 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone
declare float @__reduce_add_float(<WIDTH x float>) nounwind readnone
declare float @__reduce_min_float(<WIDTH x float>) nounwind readnone
declare float @__reduce_max_float(<WIDTH x float>) nounwind readnone
declare i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_min_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_max_int32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_min_uint32(<WIDTH x i32>) nounwind readnone
declare i32 @__reduce_max_uint32(<WIDTH x i32>) nounwind readnone
@@ -244,7 +246,6 @@ declare double @__reduce_max_double(<WIDTH x double>) nounwind readnone
declare i64 @__reduce_add_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_min_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_max_int64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_min_uint64(<WIDTH x i64>) nounwind readnone
declare i64 @__reduce_max_uint64(<WIDTH x i64>) nounwind readnone
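The declared return types are deliberately wider than the element types so that no gang size shipped here can overflow the horizontal sum. A quick check of the worst cases, assuming at most 64 program instances (the widest generic target touched by this commit):

#include <cstdint>
#include <limits>

// 64 lanes of int8 sum to at most 64 * 127 = 8128, within int16;
// 64 lanes of int16 sum to at most 64 * 32767 = 2097088, within int32.
static_assert(64 * 127 <= std::numeric_limits<int16_t>::max(),
              "int8 sums fit in the declared i16 result");
static_assert(64L * 32767 <= std::numeric_limits<int32_t>::max(),
              "int16 sums fit in the declared i32 result");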

View File

@@ -509,15 +509,38 @@ define float @__reduce_max_float(<4 x float>) nounwind readnone {
neon_reduce(float, @llvm.arm.neon.vpmaxs.v2f32, @max_f32)
}
declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone {
%v8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
%a16 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %v8)
%a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %a16)
%a0 = extractelement <2 x i32> %a32, i32 0
%a1 = extractelement <2 x i32> %a32, i32 1
%r = add i32 %a0, %a1
%r16 = trunc i32 %r to i16
ret i16 %r16
}
declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone
define i32 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone {
%a32 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %0)
%a0 = extractelement <2 x i32> %a32, i32 0
%a1 = extractelement <2 x i32> %a32, i32 1
%r = add i32 %a0, %a1
ret i32 %r
}
declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone
define i64 @__reduce_add_int32(<WIDTH x i32>) nounwind readnone {
%a64 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %0)
%a0 = extractelement <2 x i64> %a64, i32 0
%a1 = extractelement <2 x i64> %a64, i32 1
%r = add i64 %a0, %a1
ret i64 %r
}
declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
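On NEON the reductions lean on vpaddl ("pairwise add long"), which sums adjacent lane pairs into lanes of twice the width, so each step both narrows the vector and widens the accumulator without overflow. A scalar C++ model of the int16 case, assuming the 4-wide gang used by this target (and mirroring the unsigned widening the builtin performs via vpaddlu):

#include <cstdint>

// Scalar model of one vpaddl step: 4 x u16 -> 2 x u32.
static void vpaddl_u16_to_u32(const uint16_t in[4], uint32_t out[2]) {
    out[0] = (uint32_t)in[0] + in[1];
    out[1] = (uint32_t)in[2] + in[3];
}

// __reduce_add_int16 analog: one widening pairwise add, then a
// scalar add of the two 32-bit halves.
static int32_t reduce_add_int16_w4(const uint16_t v[4]) {
    uint32_t a[2];
    vpaddl_u16_to_u32(v, a);
    return (int32_t)(a[0] + a[1]);
}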

View File

@@ -367,6 +367,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
ret i1 %cmp
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <8 x i16> @__add_varying_i16(<8 x i16>,
<8 x i16>) nounwind readnone alwaysinline {
%r = add <8 x i16> %0, %1
ret <8 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
}
define <4 x float> @__vec4_add_float(<4 x float> %v0,
<4 x float> %v1) nounwind readnone alwaysinline {
%v = fadd <4 x float> %v0, %v1

View File

@@ -267,6 +267,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
ret i1 %cmp
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4,
i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <4 x i16> @__add_varying_i16(<4 x i16>,
<4 x i16>) nounwind readnone alwaysinline {
%r = add <4 x i16> %0, %1
ret <4 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
}
define float @__reduce_add_float(<4 x float> %v) nounwind readonly alwaysinline {
%v1 = shufflevector <4 x float> %v, <4 x float> undef,
<4 x i32> <i32 2, i32 3, i32 undef, i32 undef>

View File

@@ -253,6 +253,36 @@ define i1 @__none(<8 x MASK>) nounwind readnone alwaysinline {
ret i1 %meq
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <8 x i16> @__add_varying_i16(<8 x i16>,
<8 x i16>) nounwind readnone alwaysinline {
%r = add <8 x i16> %0, %1
ret <8 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
}
define internal <8 x float> @__add_varying_float(<8 x float>, <8 x float>) {
%r = fadd <8 x float> %0, %1
ret <8 x float> %r

View File

@@ -261,6 +261,33 @@ define i1 @__none(<16 x i8>) nounwind readnone alwaysinline {
ret i1 %meq
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<16 x i8>) nounwind readnone alwaysinline {
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %0,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <16 x i16> @__add_varying_i16(<16 x i16>,
<16 x i16>) nounwind readnone alwaysinline {
%r = add <16 x i16> %0, %1
ret <16 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<16 x i16>) nounwind readnone alwaysinline {
reduce16(i16, @__add_varying_i16, @__add_uniform_i16)
}
define internal <16 x float> @__add_varying_float(<16 x float>, <16 x float>) {
%r = fadd <16 x float> %0, %1
ret <16 x float> %r

View File

@@ -309,6 +309,36 @@ define i1 @__none(<8 x i32>) nounwind readnone alwaysinline {
ret i1 %cmp
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<8 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <8 x i8> %0, <8 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <8 x i16> @__add_varying_i16(<8 x i16>,
<8 x i16>) nounwind readnone alwaysinline {
%r = add <8 x i16> %0, %1
ret <8 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<8 x i16>) nounwind readnone alwaysinline {
reduce8(i16, @__add_varying_i16, @__add_uniform_i16)
}
define float @__reduce_min_float(<8 x float>) nounwind readnone alwaysinline {
reduce8by4(float, @llvm.x86.sse.min.ps, @__min_uniform_float)
}

View File

@@ -299,6 +299,36 @@ define i1 @__none(<4 x i32>) nounwind readnone alwaysinline {
ret i1 %cmp
}
declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
define i16 @__reduce_add_int8(<4 x i8>) nounwind readnone alwaysinline {
%wide8 = shufflevector <4 x i8> %0, <4 x i8> zeroinitializer,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4,
i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
%rv = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %wide8,
<16 x i8> zeroinitializer)
%r0 = extractelement <2 x i64> %rv, i32 0
%r1 = extractelement <2 x i64> %rv, i32 1
%r = add i64 %r0, %r1
%r16 = trunc i64 %r to i16
ret i16 %r16
}
define internal <4 x i16> @__add_varying_i16(<4 x i16>,
<4 x i16>) nounwind readnone alwaysinline {
%r = add <4 x i16> %0, %1
ret <4 x i16> %r
}
define internal i16 @__add_uniform_i16(i16, i16) nounwind readnone alwaysinline {
%r = add i16 %0, %1
ret i16 %r
}
define i16 @__reduce_add_int16(<4 x i16>) nounwind readnone alwaysinline {
reduce4(i16, @__add_varying_i16, @__add_uniform_i16)
}
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
define float @__reduce_add_float(<4 x float>) nounwind readonly alwaysinline {

View File

@@ -3711,29 +3711,44 @@ instances are added together by the ``reduce_add()`` function.
::
uniform int16 reduce_add(int8 x)
uniform unsigned int16 reduce_add(unsigned int8 x)
uniform int32 reduce_add(int16 x)
uniform unsigned int32 reduce_add(unsigned int16 x)
uniform int64 reduce_add(int32 x)
uniform unsigned int64 reduce_add(unsigned int32 x)
uniform int64 reduce_add(int64 x)
uniform unsigned int64 reduce_add(unsigned int64 x)
uniform float reduce_add(float x)
uniform double reduce_add(double x)
You can also use functions to compute the minimum value of the given value
across all of the currently-executing program instances.
::
uniform int32 reduce_min(int32 a)
uniform unsigned int32 reduce_min(unsigned int32 a)
uniform int64 reduce_min(int64 a)
uniform unsigned int64 reduce_min(unsigned int64 a)
uniform float reduce_min(float a)
uniform double reduce_min(double a)
Equivalent functions are available to compute the maximum of the given
varying variable over the active program instances.
::
uniform int32 reduce_max(int32 a)
uniform unsigned int32 reduce_max(unsigned int32 a)
uniform int64 reduce_max(int64 a)
uniform unsigned int64 reduce_max(unsigned int64 a)
uniform float reduce_max(float a)
uniform double reduce_max(double a)
Finally, you can check to see if a particular value has the same value in
all of the currently-running program instances:
@@ -3741,9 +3756,10 @@ all of the currently-running program instances:
uniform bool reduce_equal(int32 v)
uniform bool reduce_equal(unsigned int32 v)
uniform bool reduce_equal(int64 v)
uniform bool reduce_equal(unsigned int64 v)
uniform bool reduce_equal(float v)
uniform bool reduce_equal(double v)
There are also variants of these functions that return the value as a
@@ -3758,10 +3774,11 @@ performance in the `Performance Guide`_.
uniform bool reduce_equal(int32 v, uniform int32 * uniform sameval)
uniform bool reduce_equal(unsigned int32 v,
                          uniform unsigned int32 * uniform sameval)
uniform bool reduce_equal(int64 v, uniform int64 * uniform sameval)
uniform bool reduce_equal(unsigned int64 v,
                          uniform unsigned int64 * uniform sameval)
uniform bool reduce_equal(float v, uniform float * uniform sameval)
uniform bool reduce_equal(double v, uniform double * uniform sameval)
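For reference, a scalar C++ model of the contract these signatures describe (the lane count and the explicit mask array are illustrative assumptions, not ispc's internal representation):

#include <cstdint>

// reduce_equal(v, &sameval): true iff every *active* lane holds the
// same value; on success the common value is written to sameval.
static bool reduce_equal_i32(const int32_t v[16], const bool active[16],
                             int32_t *sameval) {
    bool seen = false;
    int32_t first = 0;
    for (int i = 0; i < 16; ++i) {
        if (!active[i])
            continue;
        if (!seen) {
            first = v[i];
            seen = true;
        } else if (v[i] != first) {
            return false;
        }
    }
    if (seen && sameval)
        *sameval = first;
    return seen;  // the all-inactive case is discussed below
}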
If called when none of the program instances are running,

View File

@@ -1162,19 +1162,20 @@ REDUCE_ADD(double, __vec16_d, __reduce_add_double)
REDUCE_MINMAX(double, __vec16_d, __reduce_min_double, <)
REDUCE_MINMAX(double, __vec16_d, __reduce_max_double, >)
REDUCE_ADD(int16_t, __vec16_i8, __reduce_add_int8)
REDUCE_ADD(int32_t, __vec16_i16, __reduce_add_int16)
REDUCE_ADD(int64_t, __vec16_i32, __reduce_add_int32)
REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_min_int32, <)
REDUCE_MINMAX(int32_t, __vec16_i32, __reduce_max_int32, >)
REDUCE_ADD(uint32_t, __vec16_i32, __reduce_add_uint32)
REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_min_uint32, <)
REDUCE_MINMAX(uint32_t, __vec16_i32, __reduce_max_uint32, >)
REDUCE_ADD(int64_t, __vec16_i64, __reduce_add_int64)
REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_min_int64, <)
REDUCE_MINMAX(int64_t, __vec16_i64, __reduce_max_int64, >)
REDUCE_ADD(uint64_t, __vec16_i64, __reduce_add_uint64)
REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_min_uint64, <)
REDUCE_MINMAX(uint64_t, __vec16_i64, __reduce_max_uint64, >)
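REDUCE_ADD here is a helper macro defined earlier in these generic C++ headers (outside this hunk); presumably its shape is roughly the following, which is why each use only supplies the accumulator type, the vector type, and the function name:

// Hypothetical sketch of the REDUCE_ADD helper, not the verbatim macro:
// a scalar loop accumulating every lane into a (possibly wider) type.
// The lane count shown matches the 16-wide header.
#define REDUCE_ADD(TYPE, VTYPE, NAME)       \
static FORCEINLINE TYPE NAME(VTYPE v) {     \
    TYPE ret = 0;                           \
    for (int i = 0; i < 16; ++i)            \
        ret += (TYPE)v.v[i];                \
    return ret;                             \
}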

View File

@@ -1231,19 +1231,20 @@ REDUCE_ADD(double, __vec32_d, __reduce_add_double)
REDUCE_MINMAX(double, __vec32_d, __reduce_min_double, <)
REDUCE_MINMAX(double, __vec32_d, __reduce_max_double, >)
REDUCE_ADD(int16_t, __vec32_i8, __reduce_add_int8)
REDUCE_ADD(int32_t, __vec32_i16, __reduce_add_int16)
REDUCE_ADD(int64_t, __vec32_i32, __reduce_add_int32)
REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_min_int32, <)
REDUCE_MINMAX(int32_t, __vec32_i32, __reduce_max_int32, >)
REDUCE_ADD(uint32_t, __vec32_i32, __reduce_add_uint32)
REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_min_uint32, <)
REDUCE_MINMAX(uint32_t, __vec32_i32, __reduce_max_uint32, >)
REDUCE_ADD(int64_t, __vec32_i64, __reduce_add_int64)
REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_min_int64, <)
REDUCE_MINMAX(int64_t, __vec32_i64, __reduce_max_int64, >)
REDUCE_ADD(uint64_t, __vec32_i64, __reduce_add_uint64)
REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_min_uint64, <)
REDUCE_MINMAX(uint64_t, __vec32_i64, __reduce_max_uint64, >)

View File

@@ -1364,19 +1364,20 @@ REDUCE_ADD(double, __vec64_d, __reduce_add_double)
REDUCE_MINMAX(double, __vec64_d, __reduce_min_double, <)
REDUCE_MINMAX(double, __vec64_d, __reduce_max_double, >)
REDUCE_ADD(int16_t, __vec64_i8, __reduce_add_int8)
REDUCE_ADD(int32_t, __vec64_i16, __reduce_add_int16)
REDUCE_ADD(int64_t, __vec64_i32, __reduce_add_int32)
REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_min_int32, <)
REDUCE_MINMAX(int32_t, __vec64_i32, __reduce_max_int32, >)
REDUCE_ADD(uint32_t, __vec64_i32, __reduce_add_uint32)
REDUCE_MINMAX(uint32_t, __vec64_i32, __reduce_min_uint32, <)
REDUCE_MINMAX(uint32_t, __vec64_i32, __reduce_max_uint32, >)
REDUCE_ADD(int64_t, __vec64_i64, __reduce_add_int64)
REDUCE_MINMAX(int64_t, __vec64_i64, __reduce_min_int64, <)
REDUCE_MINMAX(int64_t, __vec64_i64, __reduce_max_int64, >)
REDUCE_ADD(uint64_t, __vec64_i64, __reduce_add_uint64)
REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_min_uint64, <)
REDUCE_MINMAX(uint64_t, __vec64_i64, __reduce_max_uint64, >)

View File

@@ -1511,6 +1511,22 @@ static FORCEINLINE int64_t __count_trailing_zeros_i64(const __vec1_i64 mask) {
// reductions
///////////////////////////////////////////////////////////////////////////
static FORCEINLINE int16_t __reduce_add_i8(__vec16_i8 v) {
// TODO: improve this!
int16_t ret = 0;
for (int i = 0; i < 16; ++i)
ret += v.v[i];
return ret;
}
static FORCEINLINE int32_t __reduce_add_i16(__vec16_i16 v) {
// TODO: improve this!
int32_t ret = 0;
for (int i = 0; i < 16; ++i)
ret += v.v[i];
return ret;
}
static FORCEINLINE uint32_t __reduce_add_i32(__vec16_i32 v) {
return _mm512_reduce_add_epi32(v);
}

View File

@@ -1607,6 +1607,9 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) {
///////////////////////////////////////////////////////////////////////////
// reductions
REDUCE_ADD(int16_t, __vec32_i8, __reduce_add_int8)
REDUCE_ADD(int32_t, __vec32_i16, __reduce_add_int16)
static FORCEINLINE float __reduce_add_float(__vec32_f v) {
return _mm512_reduce_add_ps(v.v1) + _mm512_reduce_add_ps(v.v2);
}

View File

@@ -2528,6 +2528,22 @@ static FORCEINLINE int64_t __count_leading_zeros_i64(uint64_t v) {
///////////////////////////////////////////////////////////////////////////
// reductions
static FORCEINLINE int16_t __reduce_add_int8(__vec4_i8 v) {
// TODO: improve
int16_t ret = 0;
for (int i = 0; i < 4; ++i)
ret += v.v[i];
return ret;
}
static FORCEINLINE int32_t __reduce_add_int16(__vec4_i16 v) {
// TODO: improve
int32_t ret = 0;
for (int i = 0; i < 4; ++i)
ret += v.v[i];
return ret;
}
static FORCEINLINE float __reduce_add_float(__vec4_f v) {
float r = bits_as_float(_mm_extract_ps(v.v, 0));
r += bits_as_float(_mm_extract_ps(v.v, 1));

View File

@@ -887,13 +887,32 @@ static inline uniform double select(uniform bool c, uniform double a,
///////////////////////////////////////////////////////////////////////////
// Horizontal ops / reductions
__declspec(safe)
static inline uniform int16 reduce_add(int8 x) {
return __reduce_add_int8(__mask ? x : (int8)0);
}
__declspec(safe)
static inline uniform unsigned int16 reduce_add(unsigned int8 x) {
return __reduce_add_int8(__mask ? x : (int8)0);
}
__declspec(safe)
static inline uniform int32 reduce_add(int16 x) {
return __reduce_add_int16(__mask ? x : (int16)0);
}
__declspec(safe)
static inline uniform unsigned int32 reduce_add(unsigned int16 x) {
return __reduce_add_int16(__mask ? x : (int16)0);
}
__declspec(safe)
static inline uniform float reduce_add(float x) {
// zero the lanes where the mask is off
return __reduce_add_float(__mask ? x : 0.);
}
__declspec(safe)
static inline uniform float reduce_min(float v) {
// For the lanes where the mask is off, replace the given value with
@@ -915,7 +934,7 @@ static inline uniform float reduce_max(float v) {
}
__declspec(safe)
static inline uniform int64 reduce_add(int32 x) {
// Zero out the values for lanes that aren't running
return __reduce_add_int32(__mask ? x : 0);
}
@@ -937,7 +956,7 @@ static inline uniform int reduce_max(int v) {
}
__declspec(safe)
static inline uniform unsigned int64 reduce_add(unsigned int32 x) {
// Set values for non-running lanes to zero so they don't affect the
// result.
return __reduce_add_int32(__mask ? x : 0);
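The pattern in all of these wrappers is the same: substitute the additive identity for lanes where the execution mask is off, then hand the whole vector to the target builtin, so the reduction is well defined under varying control flow. A scalar C++ sketch of the resulting semantics (16 lanes and an explicit mask array are assumptions for illustration):

#include <cstdint>

// Model of reduce_add(int8): inactive lanes contribute zero, and the
// accumulator is widened to int16 so a full gang of +/-127 values
// cannot overflow.
static int16_t masked_reduce_add_int8(const int8_t x[16], const bool mask[16]) {
    int16_t sum = 0;
    for (int i = 0; i < 16; ++i)
        sum = (int16_t)(sum + (mask[i] ? x[i] : 0));
    return sum;
}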

View File

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float v = aFOO[programIndex];
uniform float m;
int16 iv = (int)v;
if (iv & 1)
m = reduce_add(iv);
RET[programIndex] = m;
}
export void result(uniform float RET[]) {
uniform int x = 0;
for (uniform int i = 1; i <= programCount; i += 2)
x += i;
RET[programIndex] = x;
}

View File

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float v = aFOO[programIndex];
uniform float m;
int16 iv = (int)v;
/*CO if (iv & 1)*/
m = reduce_add(iv);
RET[programIndex] = m;
}
export void result(uniform float RET[]) {
uniform int x = 0;
for (uniform int i = 1; i <= programCount; ++i)
x += i;
RET[programIndex] = x;
}

View File

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
float v = aFOO[programIndex];
uniform float m;
int8 iv = (int)v;
if (iv & 1)
m = reduce_add(iv);
RET[programIndex] = m;
}
export void result(uniform float RET[]) {
uniform int x = 0;
for (uniform int i = 1; i <= programCount; i += 2)
x += i;
RET[programIndex] = x;
}

View File

@@ -0,0 +1,18 @@
export uniform int width() { return programCount; }
export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
int8 db = b-4;
int8 iv = programIndex + db;
int m = reduce_add(iv);
RET[programIndex] = m;
}
export void result(uniform float RET[]) {
uniform int x = 0;
for (uniform int i = 1; i <= programCount; ++i)
x += i;
RET[programIndex] = x;
}