added reduce_add/min/max
This commit is contained in:
@@ -79,22 +79,33 @@ define i64* @__cvt_const2gen(i64 addrspace(4)*) nounwind readnone alwaysinline
|
|||||||
%ptr = tail call i64* @llvm.nvvm.ptr.shared.to.gen.p0i64.p4i64(i64 addrspace(4)* %0)
|
%ptr = tail call i64* @llvm.nvvm.ptr.shared.to.gen.p0i64.p4i64(i64 addrspace(4)* %0)
|
||||||
ret i64* %ptr
|
ret i64* %ptr
|
||||||
}
|
}
|
||||||
|
|
||||||
;;;;;;;;
|
;;;;;;;;
|
||||||
|
;; i32
|
||||||
define i32 @__shfl_i32_nvptx(i32, i32) nounwind readnone alwaysinline
|
define i32 @__shfl_i32_nvptx(i32, i32) nounwind readnone alwaysinline
|
||||||
{
|
{
|
||||||
%shfl = tail call i32 asm sideeffect "shfl.idx.b32 $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1) nounwind readnone alwaysinline
|
%shfl = tail call i32 asm sideeffect "shfl.idx.b32 $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1) nounwind readnone alwaysinline
|
||||||
ret i32 %shfl
|
ret i32 %shfl
|
||||||
}
|
}
|
||||||
define float @__shfl_xor_float_nvptx(float, i32) nounwind readnone alwaysinline
|
|
||||||
{
|
|
||||||
%shfl = tail call float asm sideeffect "shfl.bfly.b32 $0, $1, $2, 0x1f;", "=f,f,r"(float %0, i32 %1) nounwind readnone alwaysinline
|
|
||||||
ret float %shfl
|
|
||||||
}
|
|
||||||
define i32 @__shfl_xor_i32_nvptx(i32, i32) nounwind readnone alwaysinline
|
define i32 @__shfl_xor_i32_nvptx(i32, i32) nounwind readnone alwaysinline
|
||||||
{
|
{
|
||||||
%shfl = tail call i32 asm sideeffect "shfl.bfly.b32 $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1) nounwind readnone alwaysinline
|
%shfl = tail call i32 asm sideeffect "shfl.bfly.b32 $0, $1, $2, 0x1f;", "=r,r,r"(i32 %0, i32 %1) nounwind readnone alwaysinline
|
||||||
ret i32 %shfl
|
ret i32 %shfl
|
||||||
}
|
}
|
||||||
|
;; float
|
||||||
|
;; __shfl_float_nvptx: thin wrapper that issues the PTX instruction
;; shfl.idx.b32 through inline asm and returns its float result.
;; The float argument is bound to a float register (constraint f) and the
;; i32 argument to an integer register (constraint r); the final PTX operand
;; is hard-coded to 0x1f.
;; NOTE(review): presumably the i32 argument is the source lane index and
;; 0x1f the clamp value for a 32-lane warp -- confirm against the PTX ISA.
define float @__shfl_float_nvptx(float, i32) nounwind readnone alwaysinline
|
||||||
|
{
|
||||||
|
;; asm operands: $0 = result, $1 = float argument, $2 = i32 argument.
%shfl = tail call float asm sideeffect "shfl.idx.b32 $0, $1, $2, 0x1f;", "=f,f,r"(float %0, i32 %1) nounwind readnone alwaysinline
|
||||||
|
ret float %shfl
|
||||||
|
}
|
||||||
|
;; __shfl_xor_float_nvptx: thin wrapper that issues the PTX instruction
;; shfl.bfly.b32 through inline asm and returns its float result.
;; The float argument is bound to a float register (constraint f) and the
;; i32 argument to an integer register (constraint r); the final PTX operand
;; is hard-coded to 0x1f.
;; NOTE(review): presumably the i32 argument is the lane xor mask of the
;; butterfly shuffle and 0x1f the clamp value for a 32-lane warp -- confirm
;; against the PTX ISA.
define float @__shfl_xor_float_nvptx(float, i32) nounwind readnone alwaysinline
|
||||||
|
{
|
||||||
|
;; asm operands: $0 = result, $1 = float argument, $2 = i32 argument.
%shfl = tail call float asm sideeffect "shfl.bfly.b32 $0, $1, $2, 0x1f;", "=f,f,r"(float %0, i32 %1) nounwind readnone alwaysinline
|
||||||
|
ret float %shfl
|
||||||
|
}
|
||||||
|
|
||||||
|
;;;;;;;;;;; min/max
|
||||||
|
;; float/double
|
||||||
define float @__fminf_nvptx(float,float) nounwind readnone alwaysinline
|
define float @__fminf_nvptx(float,float) nounwind readnone alwaysinline
|
||||||
{
|
{
|
||||||
%min = tail call float asm sideeffect "min.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1) nounwind readnone alwaysinline
|
%min = tail call float asm sideeffect "min.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1) nounwind readnone alwaysinline
|
||||||
@@ -105,6 +116,88 @@ define float @__fmaxf_nvptx(float,float) nounwind readnone alwaysinline
|
|||||||
%max = tail call float asm sideeffect "max.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1) nounwind readnone alwaysinline
|
%max = tail call float asm sideeffect "max.f32 $0, $1, $2;", "=f,f,f"(float %0, float %1) nounwind readnone alwaysinline
|
||||||
ret float %max
|
ret float %max
|
||||||
}
|
}
|
||||||
|
|
||||||
|
;; int
|
||||||
|
;; m4 macro generating integer min/max helpers for one LLVM integer type
;; (the single macro argument). For a type T it emits four functions:
;;   __min_T_signed   and __max_T_signed    -- compare with icmp slt / sgt
;;   __min_T_unsigned and __max_T_unsigned  -- compare with icmp ult / ugt
;; Each function returns its first operand when the comparison is true and
;; its second operand otherwise.
define(`int_minmax',`
|
||||||
|
;; signed minimum
define $1 @__min_$1_signed($1,$1) nounwind readnone alwaysinline {
|
||||||
|
%c = icmp slt $1 %0, %1
|
||||||
|
%r = select i1 %c, $1 %0, $1 %1
|
||||||
|
ret $1 %r
|
||||||
|
}
|
||||||
|
;; signed maximum
define $1 @__max_$1_signed($1,$1) nounwind readnone alwaysinline {
|
||||||
|
%c = icmp sgt $1 %0, %1
|
||||||
|
%r = select i1 %c, $1 %0, $1 %1
|
||||||
|
ret $1 %r
|
||||||
|
}
|
||||||
|
;; unsigned minimum
define $1 @__min_$1_unsigned($1,$1) nounwind readnone alwaysinline {
|
||||||
|
%c = icmp ult $1 %0, %1
|
||||||
|
%r = select i1 %c, $1 %0, $1 %1
|
||||||
|
ret $1 %r
|
||||||
|
}
|
||||||
|
;; unsigned maximum
define $1 @__max_$1_unsigned($1,$1) nounwind readnone alwaysinline {
|
||||||
|
%c = icmp ugt $1 %0, %1
|
||||||
|
%r = select i1 %c, $1 %0, $1 %1
|
||||||
|
ret $1 %r
|
||||||
|
}
|
||||||
|
')
|
||||||
|
;; Instantiate the helpers for i8, i16, i32 and i64.
;; NOTE(review): the semicolon after each call is not part of the macro call;
;; presumably it is left in the expanded output where it starts an LLVM
;; comment -- confirm it is harmless.
int_minmax(i8);
|
||||||
|
int_minmax(i16);
|
||||||
|
int_minmax(i32);
|
||||||
|
int_minmax(i64);
|
||||||
|
|
||||||
|
;; float/double
|
||||||
|
;; m4 macro generating floating-point min/max helpers for one LLVM
;; floating-point type (the single macro argument). For a type T it emits
;; __min_T (fcmp olt) and __max_T (fcmp ogt).
;; Each function returns its first operand when the comparison is true and
;; its second operand otherwise. Both predicates are ordered comparisons,
;; which are false when either operand is NaN, so in that case the second
;; operand is returned.
define(`fp_minmax',`
|
||||||
|
;; minimum
define $1 @__min_$1($1,$1) nounwind readnone alwaysinline {
|
||||||
|
%c = fcmp olt $1 %0, %1
|
||||||
|
%r = select i1 %c, $1 %0, $1 %1
|
||||||
|
ret $1 %r
|
||||||
|
}
|
||||||
|
;; maximum
define $1 @__max_$1($1,$1) nounwind readnone alwaysinline {
|
||||||
|
%c = fcmp ogt $1 %0, %1
|
||||||
|
%r = select i1 %c, $1 %0, $1 %1
|
||||||
|
ret $1 %r
|
||||||
|
}
|
||||||
|
')
|
||||||
|
;; Instantiate the helpers for float and double.
fp_minmax(float)
|
||||||
|
fp_minmax(double)
|
||||||
|
|
||||||
|
;;;;;;;;; __shfl/__shfl_xor intrinsics
|
||||||
|
;; i8/i16/i64
|
||||||
|
;; m4 macro that wraps a 32-bit shuffle helper for a narrower integer type.
;;   first argument:  shuffle name prefix (used below with __shfl, __shfl_xor)
;;   second argument: value type (used below with i8 and i16)
;; The generated function is named prefix_type_nvptx. It zero-extends the
;; value to i32, forwards it together with the unchanged i32 second argument
;; to the helper prefix_i32_nvptx, and truncates the i32 result back to the
;; value type.
define(`shfl32',`
|
||||||
|
define $2 @$1_$2_nvptx($2, i32) nounwind readnone alwaysinline
|
||||||
|
{
|
||||||
|
%ext = zext $2 %0 to i32
|
||||||
|
%res = tail call i32 @$1_i32_nvptx(i32 %ext, i32 %1)
|
||||||
|
%ret = trunc i32 %res to $2
|
||||||
|
ret $2 %ret
|
||||||
|
}
|
||||||
|
')
|
||||||
|
;; Instantiate the i8 and i16 variants of both shuffle flavours.
;; NOTE(review): the semicolon after each call is not part of the macro call;
;; presumably it is left in the expanded output where it starts an LLVM
;; comment -- confirm it is harmless.
shfl32(__shfl, i8);
|
||||||
|
shfl32(__shfl_xor, i8);
|
||||||
|
shfl32(__shfl, i16);
|
||||||
|
shfl32(__shfl_xor, i16);
|
||||||
|
|
||||||
|
|
||||||
|
;; m4 macro that wraps a 32-bit shuffle helper for a 64-bit value type.
;;   first argument:  shuffle name prefix (used below with __shfl, __shfl_xor)
;;   second argument: value type (used below with i64 and double)
;; The generated function is named prefix_type_nvptx. It bitcasts the value
;; to a vector of two i32, passes each element separately to the helper
;; prefix_i32_nvptx with the same i32 second argument, rebuilds a two-element
;; vector from the two results in the same element order, and bitcasts that
;; vector back to the value type.
define(`shfl64',`
|
||||||
|
define $2 @$1_$2_nvptx($2, i32) nounwind readnone alwaysinline
|
||||||
|
{
|
||||||
|
%in = bitcast $2 %0 to <2 x i32>
|
||||||
|
%in0 = extractelement <2 x i32> %in, i32 0
|
||||||
|
%in1 = extractelement <2 x i32> %in, i32 1
|
||||||
|
%out0 = tail call i32 @$1_i32_nvptx(i32 %in0, i32 %1)
|
||||||
|
%out1 = tail call i32 @$1_i32_nvptx(i32 %in1, i32 %1)
|
||||||
|
%out2 = insertelement <2 x i32> undef, i32 %out0, i32 0
|
||||||
|
%out = insertelement <2 x i32> %out2, i32 %out1, i32 1
|
||||||
|
%ret = bitcast <2 x i32> %out to $2
|
||||||
|
ret $2 %ret
|
||||||
|
}
|
||||||
|
')
|
||||||
|
;; Instantiate the i64 and double variants of both shuffle flavours.
shfl64(__shfl, i64)
|
||||||
|
shfl64(__shfl_xor, i64)
|
||||||
|
shfl64(__shfl, double)
|
||||||
|
shfl64(__shfl_xor, double)
|
||||||
|
|
||||||
|
;;;;;;;;;;;;;
|
||||||
define i32 @__ballot_nvptx(i1) nounwind readnone alwaysinline
|
define i32 @__ballot_nvptx(i1) nounwind readnone alwaysinline
|
||||||
{
|
{
|
||||||
%conv = zext i1 %0 to i32
|
%conv = zext i1 %0 to i32
|
||||||
@@ -650,9 +743,40 @@ define i1 @__none(<1 x i1>) nounwind readnone alwaysinline {
|
|||||||
ret i1 %cmp
|
ret i1 %cmp
|
||||||
}
|
}
|
||||||
|
|
||||||
declare i16 @__reduce_add_int8(<WIDTH x i8>) nounwind readnone
|
;;;;;;;;; reductions i8
|
||||||
declare i32 @__reduce_add_int16(<WIDTH x i16>) nounwind readnone
|
;; __reduce_add_int8: takes the single i8 element of %v, zero-extends it to
;; i16 (so the byte contributes as an unsigned value in 0..255) and then runs
;; five shuffle+add rounds with xor masks 16, 8, 4, 2 and 1. In every round
;; the running sum is passed to __shfl_xor_i16_nvptx and the returned value
;; is added to the running sum. The final i16 sum is returned.
;; NOTE(review): presumably this yields the sum over all 32 lanes of a warp;
;; that relies on __shfl_xor_i16_nvptx exchanging values between lanes whose
;; ids differ by the xor mask -- confirm against the PTX shfl.bfly semantics.
define i16 @__reduce_add_int8(<1 x i8> %v) nounwind readnone alwaysinline {
|
||||||
|
%value8 = extractelement <1 x i8> %v, i32 0
|
||||||
|
%value = zext i8 %value8 to i16
|
||||||
|
%call = tail call i16 @__shfl_xor_i16_nvptx(i16 %value, i32 16)
|
||||||
|
%call1 = add i16 %call, %value
|
||||||
|
%call.1 = tail call i16 @__shfl_xor_i16_nvptx(i16 %call1, i32 8)
|
||||||
|
%call1.1 = add i16 %call1, %call.1
|
||||||
|
%call.2 = tail call i16 @__shfl_xor_i16_nvptx(i16 %call1.1, i32 4)
|
||||||
|
%call1.2 = add i16 %call1.1, %call.2
|
||||||
|
%call.3 = tail call i16 @__shfl_xor_i16_nvptx(i16 %call1.2, i32 2)
|
||||||
|
%call1.3 = add i16 %call1.2, %call.3
|
||||||
|
%call.4 = tail call i16 @__shfl_xor_i16_nvptx(i16 %call1.3, i32 1)
|
||||||
|
%call1.4 = add i16 %call1.3, %call.4
|
||||||
|
ret i16 %call1.4
|
||||||
|
}
|
||||||
|
;;;;;;;;; reductions i16
|
||||||
|
;; __reduce_add_int16: takes the single i16 element of %v, zero-extends it to
;; i32 (so it contributes as an unsigned value) and then runs five
;; shuffle+add rounds with xor masks 16, 8, 4, 2 and 1. In every round the
;; running sum is passed to __shfl_xor_i32_nvptx and the returned value is
;; added to the running sum. The final i32 sum is returned.
;; NOTE(review): presumably this yields the sum over all 32 lanes of a warp;
;; that relies on the shfl.bfly exchange pattern -- confirm against the PTX
;; ISA.
define i32 @__reduce_add_int16(<1 x i16> %v) nounwind readnone alwaysinline {
|
||||||
|
%value16 = extractelement <1 x i16> %v, i32 0
|
||||||
|
%value = zext i16 %value16 to i32
|
||||||
|
%call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16)
|
||||||
|
%call1 = add i32 %call, %value
|
||||||
|
%call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8)
|
||||||
|
%call1.1 = add i32 %call1, %call.1
|
||||||
|
%call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4)
|
||||||
|
%call1.2 = add i32 %call1.1, %call.2
|
||||||
|
%call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2)
|
||||||
|
%call1.3 = add i32 %call1.2, %call.3
|
||||||
|
%call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.3, i32 1)
|
||||||
|
%call1.4 = add i32 %call1.3, %call.4
|
||||||
|
ret i32 %call1.4
|
||||||
|
}
|
||||||
|
|
||||||
|
;;;;;;;;; reductions float
|
||||||
define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline {
|
define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline {
|
||||||
%value = extractelement <1 x float> %v, i32 0
|
%value = extractelement <1 x float> %v, i32 0
|
||||||
%call = tail call float @__shfl_xor_float_nvptx(float %value, i32 16)
|
%call = tail call float @__shfl_xor_float_nvptx(float %value, i32 16)
|
||||||
@@ -667,8 +791,7 @@ define float @__reduce_add_float(<1 x float> %v) nounwind readonly alwaysinline
|
|||||||
%call1.4 = fadd float %call1.3, %call.4
|
%call1.4 = fadd float %call1.3, %call.4
|
||||||
ret float %call1.4
|
ret float %call1.4
|
||||||
}
|
}
|
||||||
|
define float @__reduce_min_float(<1 x float>) nounwind readnone alwaysinline {
|
||||||
define float @__reduce_min_float(<1 x float>) nounwind readnone {
|
|
||||||
%value = extractelement <1 x float> %0, i32 0
|
%value = extractelement <1 x float> %0, i32 0
|
||||||
%call = tail call float @__shfl_xor_float_nvptx(float %value, i32 16)
|
%call = tail call float @__shfl_xor_float_nvptx(float %value, i32 16)
|
||||||
%call1 = tail call float @__fminf_nvptx(float %value, float %call)
|
%call1 = tail call float @__fminf_nvptx(float %value, float %call)
|
||||||
@@ -682,9 +805,7 @@ define float @__reduce_min_float(<1 x float>) nounwind readnone {
|
|||||||
%call1.4 = tail call float @__fminf_nvptx(float %call1.3, float %call.4)
|
%call1.4 = tail call float @__fminf_nvptx(float %call1.3, float %call.4)
|
||||||
ret float %call1.4
|
ret float %call1.4
|
||||||
}
|
}
|
||||||
|
define float @__reduce_max_float(<1 x float>) nounwind readnone alwaysinline {
|
||||||
define float @__reduce_max_float(<1 x float>) nounwind readnone
|
|
||||||
{
|
|
||||||
%value = extractelement <1 x float> %0, i32 0
|
%value = extractelement <1 x float> %0, i32 0
|
||||||
%call = tail call float @__shfl_xor_float_nvptx(float %value, i32 16)
|
%call = tail call float @__shfl_xor_float_nvptx(float %value, i32 16)
|
||||||
%call1 = tail call float @__fmaxf_nvptx(float %value, float %call)
|
%call1 = tail call float @__fmaxf_nvptx(float %value, float %call)
|
||||||
@@ -699,8 +820,8 @@ define float @__reduce_max_float(<1 x float>) nounwind readnone
|
|||||||
ret float %call1.4
|
ret float %call1.4
|
||||||
}
|
}
|
||||||
|
|
||||||
define i32 @__reduce_add_int32(<1 x i32>) nounwind readnone
|
;;;;;;;;; reductions int32
|
||||||
{
|
define i32 @__reduce_add_int32(<1 x i32>) nounwind readnone alwaysinline {
|
||||||
%value = extractelement <1 x i32> %0, i32 0
|
%value = extractelement <1 x i32> %0, i32 0
|
||||||
%call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16)
|
%call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16)
|
||||||
%call1 = add i32 %call, %value
|
%call1 = add i32 %call, %value
|
||||||
@@ -714,68 +835,183 @@ define i32 @__reduce_add_int32(<1 x i32>) nounwind readnone
|
|||||||
%call1.4 = add i32 %call1.3, %call.4
|
%call1.4 = add i32 %call1.3, %call.4
|
||||||
ret i32 %call1.4
|
ret i32 %call1.4
|
||||||
}
|
}
|
||||||
|
define i32 @__reduce_min_int32(<1 x i32>) nounwind readnone alwaysinline {
|
||||||
define i32 @__reduce_min_int32(<1 x i32>) nounwind readnone {
|
%value = extractelement <1 x i32> %0, i32 0
|
||||||
%r = extractelement <1 x i32> %0, i32 0
|
%call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16)
|
||||||
ret i32 %r
|
%call1 = tail call i32 @__min_i32_signed(i32 %value, i32 %call)
|
||||||
|
%call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8)
|
||||||
|
%call1.1 = tail call i32 @__min_i32_signed(i32 %call1, i32 %call.1)
|
||||||
|
%call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4)
|
||||||
|
%call1.2 = tail call i32 @__min_i32_signed(i32 %call1.1, i32 %call.2)
|
||||||
|
%call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2)
|
||||||
|
%call1.3 = tail call i32 @__min_i32_signed(i32 %call1.2, i32 %call.3)
|
||||||
|
%call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.3, i32 1)
|
||||||
|
%call1.4 = tail call i32 @__min_i32_signed(i32 %call1.3, i32 %call.4)
|
||||||
|
ret i32 %call1.4
|
||||||
|
}
|
||||||
|
;; __reduce_max_int32: takes the single i32 element of the argument and runs
;; five shuffle+max rounds with xor masks 16, 8, 4, 2 and 1. In every round
;; the running maximum is passed to __shfl_xor_i32_nvptx and then combined
;; with the returned value through __max_i32_signed. The final value is
;; returned.
;; NOTE(review): presumably this yields the signed maximum over all 32 lanes
;; of a warp; that relies on the shfl.bfly exchange pattern -- confirm
;; against the PTX ISA.
define i32 @__reduce_max_int32(<1 x i32>) nounwind readnone alwaysinline {
|
||||||
|
%value = extractelement <1 x i32> %0, i32 0
|
||||||
|
%call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16)
|
||||||
|
%call1 = tail call i32 @__max_i32_signed(i32 %value, i32 %call)
|
||||||
|
%call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8)
|
||||||
|
%call1.1 = tail call i32 @__max_i32_signed(i32 %call1, i32 %call.1)
|
||||||
|
%call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4)
|
||||||
|
%call1.2 = tail call i32 @__max_i32_signed(i32 %call1.1, i32 %call.2)
|
||||||
|
%call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2)
|
||||||
|
%call1.3 = tail call i32 @__max_i32_signed(i32 %call1.2, i32 %call.3)
|
||||||
|
%call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.3, i32 1)
|
||||||
|
%call1.4 = tail call i32 @__max_i32_signed(i32 %call1.3, i32 %call.4)
|
||||||
|
ret i32 %call1.4
|
||||||
}
|
}
|
||||||
|
|
||||||
define i32 @__reduce_max_int32(<1 x i32>) nounwind readnone {
|
;;;;;;;;; reductions uint32
|
||||||
%r = extractelement <1 x i32> %0, i32 0
|
define i32 @__reduce_min_uint32(<1 x i32>) nounwind readnone alwaysinline {
|
||||||
ret i32 %r
|
%value = extractelement <1 x i32> %0, i32 0
|
||||||
|
%call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16)
|
||||||
|
%call1 = tail call i32 @__min_i32_unsigned(i32 %value, i32 %call)
|
||||||
|
%call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8)
|
||||||
|
%call1.1 = tail call i32 @__min_i32_unsigned(i32 %call1, i32 %call.1)
|
||||||
|
%call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4)
|
||||||
|
%call1.2 = tail call i32 @__min_i32_unsigned(i32 %call1.1, i32 %call.2)
|
||||||
|
%call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2)
|
||||||
|
%call1.3 = tail call i32 @__min_i32_unsigned(i32 %call1.2, i32 %call.3)
|
||||||
|
%call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.3, i32 1)
|
||||||
|
%call1.4 = tail call i32 @__min_i32_unsigned(i32 %call1.3, i32 %call.4)
|
||||||
|
ret i32 %call1.4
|
||||||
|
}
|
||||||
|
;; __reduce_max_uint32: takes the single i32 element of the argument and runs
;; five shuffle+max rounds with xor masks 16, 8, 4, 2 and 1. In every round
;; the running maximum is passed to __shfl_xor_i32_nvptx and then combined
;; with the returned value through __max_i32_unsigned. The final value is
;; returned.
;; NOTE(review): presumably this yields the unsigned maximum over all 32
;; lanes of a warp; that relies on the shfl.bfly exchange pattern -- confirm
;; against the PTX ISA.
define i32 @__reduce_max_uint32(<1 x i32>) nounwind readnone alwaysinline {
|
||||||
|
%value = extractelement <1 x i32> %0, i32 0
|
||||||
|
%call = tail call i32 @__shfl_xor_i32_nvptx(i32 %value, i32 16)
|
||||||
|
%call1 = tail call i32 @__max_i32_unsigned(i32 %value, i32 %call)
|
||||||
|
%call.1 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1, i32 8)
|
||||||
|
%call1.1 = tail call i32 @__max_i32_unsigned(i32 %call1, i32 %call.1)
|
||||||
|
%call.2 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.1, i32 4)
|
||||||
|
%call1.2 = tail call i32 @__max_i32_unsigned(i32 %call1.1, i32 %call.2)
|
||||||
|
%call.3 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.2, i32 2)
|
||||||
|
%call1.3 = tail call i32 @__max_i32_unsigned(i32 %call1.2, i32 %call.3)
|
||||||
|
%call.4 = tail call i32 @__shfl_xor_i32_nvptx(i32 %call1.3, i32 1)
|
||||||
|
%call1.4 = tail call i32 @__max_i32_unsigned(i32 %call1.3, i32 %call.4)
|
||||||
|
ret i32 %call1.4
|
||||||
}
|
}
|
||||||
|
|
||||||
define i32 @__reduce_min_uint32(<1 x i32>) nounwind readnone {
|
;;;;;;;;; reductions double
|
||||||
%r = extractelement <1 x i32> %0, i32 0
|
define double @__reduce_add_double(<1 x double>) nounwind readnone alwaysinline {
|
||||||
ret i32 %r
|
%value = extractelement <1 x double> %0, i32 0
|
||||||
|
%call = tail call double @__shfl_xor_double_nvptx(double %value, i32 16)
|
||||||
|
%call1 = fadd double %call, %value
|
||||||
|
%call.1 = tail call double @__shfl_xor_double_nvptx(double %call1, i32 8)
|
||||||
|
%call1.1 = fadd double %call1, %call.1
|
||||||
|
%call.2 = tail call double @__shfl_xor_double_nvptx(double %call1.1, i32 4)
|
||||||
|
%call1.2 = fadd double %call1.1, %call.2
|
||||||
|
%call.3 = tail call double @__shfl_xor_double_nvptx(double %call1.2, i32 2)
|
||||||
|
%call1.3 = fadd double %call1.2, %call.3
|
||||||
|
%call.4 = tail call double @__shfl_xor_double_nvptx(double %call1.3, i32 1)
|
||||||
|
%call1.4 = fadd double %call1.3, %call.4
|
||||||
|
ret double %call1.4
|
||||||
}
|
}
|
||||||
|
define double @__reduce_min_double(<1 x double>) nounwind readnone alwaysinline {
|
||||||
define i32 @__reduce_max_uint32(<1 x i32>) nounwind readnone {
|
%value = extractelement <1 x double> %0, i32 0
|
||||||
%r = extractelement <1 x i32> %0, i32 0
|
%call = tail call double @__shfl_xor_double_nvptx(double %value, i32 16)
|
||||||
ret i32 %r
|
%call1 = tail call double @__min_double(double %value, double %call)
|
||||||
|
%call.1 = tail call double @__shfl_xor_double_nvptx(double %call1, i32 8)
|
||||||
|
%call1.1 = tail call double @__min_double(double %call1, double %call.1)
|
||||||
|
%call.2 = tail call double @__shfl_xor_double_nvptx(double %call1.1, i32 4)
|
||||||
|
%call1.2 = tail call double @__min_double(double %call1.1, double %call.2)
|
||||||
|
%call.3 = tail call double @__shfl_xor_double_nvptx(double %call1.2, i32 2)
|
||||||
|
%call1.3 = tail call double @__min_double(double %call1.2, double %call.3)
|
||||||
|
%call.4 = tail call double @__shfl_xor_double_nvptx(double %call1.3, i32 1)
|
||||||
|
%call1.4 = tail call double @__min_double(double %call1.3, double %call.4)
|
||||||
|
ret double %call1.4
|
||||||
|
}
|
||||||
|
;; __reduce_max_double: takes the single double element of the argument and
;; runs five shuffle+max rounds with xor masks 16, 8, 4, 2 and 1. In every
;; round the running maximum is passed to __shfl_xor_double_nvptx and then
;; combined with the returned value through __max_double. The final value is
;; returned.
;; NOTE(review): presumably this yields the maximum over all 32 lanes of a
;; warp; that relies on the shfl.bfly exchange pattern -- confirm against the
;; PTX ISA. NaN behaviour follows whatever __max_double does.
define double @__reduce_max_double(<1 x double>) nounwind readnone alwaysinline {
|
||||||
|
%value = extractelement <1 x double> %0, i32 0
|
||||||
|
%call = tail call double @__shfl_xor_double_nvptx(double %value, i32 16)
|
||||||
|
%call1 = tail call double @__max_double(double %value, double %call)
|
||||||
|
%call.1 = tail call double @__shfl_xor_double_nvptx(double %call1, i32 8)
|
||||||
|
%call1.1 = tail call double @__max_double(double %call1, double %call.1)
|
||||||
|
%call.2 = tail call double @__shfl_xor_double_nvptx(double %call1.1, i32 4)
|
||||||
|
%call1.2 = tail call double @__max_double(double %call1.1, double %call.2)
|
||||||
|
%call.3 = tail call double @__shfl_xor_double_nvptx(double %call1.2, i32 2)
|
||||||
|
%call1.3 = tail call double @__max_double(double %call1.2, double %call.3)
|
||||||
|
%call.4 = tail call double @__shfl_xor_double_nvptx(double %call1.3, i32 1)
|
||||||
|
%call1.4 = tail call double @__max_double(double %call1.3, double %call.4)
|
||||||
|
ret double %call1.4
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
define double @__reduce_add_double(<1 x double>) nounwind readnone {
|
;;;;;;;;; reductions int64
|
||||||
%m = extractelement <1 x double> %0, i32 0
|
define i64 @__reduce_add_int64(<1 x i64>) nounwind readnone alwaysinline {
|
||||||
ret double %m
|
%value = extractelement <1 x i64> %0, i32 0
|
||||||
}
|
%call = tail call i64 @__shfl_xor_i64_nvptx(i64 %value, i32 16)
|
||||||
|
%call1 = add i64 %call, %value
|
||||||
define double @__reduce_min_double(<1 x double>) nounwind readnone {
|
%call.1 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1, i32 8)
|
||||||
%m = extractelement <1 x double> %0, i32 0
|
%call1.1 =add i64 %call1, %call.1
|
||||||
ret double %m
|
%call.2 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.1, i32 4)
|
||||||
}
|
%call1.2 = add i64 %call1.1, %call.2
|
||||||
|
%call.3 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.2, i32 2)
|
||||||
define double @__reduce_max_double(<1 x double>) nounwind readnone {
|
%call1.3 = add i64 %call1.2, %call.3
|
||||||
%m = extractelement <1 x double> %0, i32 0
|
%call.4 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.3, i32 1)
|
||||||
ret double %m
|
%call1.4 = add i64 %call1.3, %call.4
|
||||||
}
|
ret i64 %call1.4
|
||||||
|
}
|
||||||
define i64 @__reduce_add_int64(<1 x i64>) nounwind readnone {
|
define i64 @__reduce_min_int64(<1 x i64>) nounwind readnone alwaysinline {
|
||||||
%m = extractelement <1 x i64> %0, i32 0
|
%value = extractelement <1 x i64> %0, i32 0
|
||||||
ret i64 %m
|
%call = tail call i64 @__shfl_xor_i64_nvptx(i64 %value, i32 16)
|
||||||
}
|
%call1 = tail call i64 @__min_i64_signed(i64 %value, i64 %call)
|
||||||
|
%call.1 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1, i32 8)
|
||||||
define i64 @__reduce_min_int64(<1 x i64>) nounwind readnone {
|
%call1.1 = tail call i64 @__min_i64_signed(i64 %call1, i64 %call.1)
|
||||||
%m = extractelement <1 x i64> %0, i32 0
|
%call.2 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.1, i32 4)
|
||||||
ret i64 %m
|
%call1.2 = tail call i64 @__min_i64_signed(i64 %call1.1, i64 %call.2)
|
||||||
}
|
%call.3 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.2, i32 2)
|
||||||
|
%call1.3 = tail call i64 @__min_i64_signed(i64 %call1.2, i64 %call.3)
|
||||||
define i64 @__reduce_max_int64(<1 x i64>) nounwind readnone {
|
%call.4 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.3, i32 1)
|
||||||
%m = extractelement <1 x i64> %0, i32 0
|
%call1.4 = tail call i64 @__min_i64_signed(i64 %call1.3, i64 %call.4)
|
||||||
ret i64 %m
|
ret i64 %call1.4
|
||||||
}
|
}
|
||||||
|
define i64 @__reduce_max_int64(<1 x i64>) nounwind readnone alwaysinline {
|
||||||
define i64 @__reduce_min_uint64(<1 x i64>) nounwind readnone {
|
%value = extractelement <1 x i64> %0, i32 0
|
||||||
%m = extractelement <1 x i64> %0, i32 0
|
%call = tail call i64 @__shfl_xor_i64_nvptx(i64 %value, i32 16)
|
||||||
ret i64 %m
|
%call1 = tail call i64 @__max_i64_signed(i64 %value, i64 %call)
|
||||||
}
|
%call.1 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1, i32 8)
|
||||||
|
%call1.1 = tail call i64 @__max_i64_signed(i64 %call1, i64 %call.1)
|
||||||
define i64 @__reduce_max_uint64(<1 x i64>) nounwind readnone {
|
%call.2 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.1, i32 4)
|
||||||
%m = extractelement <1 x i64> %0, i32 0
|
%call1.2 = tail call i64 @__max_i64_signed(i64 %call1.1, i64 %call.2)
|
||||||
ret i64 %m
|
%call.3 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.2, i32 2)
|
||||||
|
%call1.3 = tail call i64 @__max_i64_signed(i64 %call1.2, i64 %call.3)
|
||||||
|
%call.4 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.3, i32 1)
|
||||||
|
%call1.4 = tail call i64 @__max_i64_signed(i64 %call1.3, i64 %call.4)
|
||||||
|
ret i64 %call1.4
|
||||||
|
}
|
||||||
|
;; __reduce_min_uint64: takes the single i64 element of the argument and runs
;; five shuffle+min rounds with xor masks 16, 8, 4, 2 and 1. In every round
;; the running minimum is passed to __shfl_xor_i64_nvptx and then combined
;; with the returned value through __min_i64_unsigned. The final value is
;; returned.
;; NOTE(review): presumably this yields the unsigned minimum over all 32
;; lanes of a warp; that relies on the shfl.bfly exchange pattern -- confirm
;; against the PTX ISA.
define i64 @__reduce_min_uint64(<1 x i64>) nounwind readnone alwaysinline {
|
||||||
|
%value = extractelement <1 x i64> %0, i32 0
|
||||||
|
%call = tail call i64 @__shfl_xor_i64_nvptx(i64 %value, i32 16)
|
||||||
|
%call1 = tail call i64 @__min_i64_unsigned(i64 %value, i64 %call)
|
||||||
|
%call.1 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1, i32 8)
|
||||||
|
%call1.1 = tail call i64 @__min_i64_unsigned(i64 %call1, i64 %call.1)
|
||||||
|
%call.2 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.1, i32 4)
|
||||||
|
%call1.2 = tail call i64 @__min_i64_unsigned(i64 %call1.1, i64 %call.2)
|
||||||
|
%call.3 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.2, i32 2)
|
||||||
|
%call1.3 = tail call i64 @__min_i64_unsigned(i64 %call1.2, i64 %call.3)
|
||||||
|
%call.4 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.3, i32 1)
|
||||||
|
%call1.4 = tail call i64 @__min_i64_unsigned(i64 %call1.3, i64 %call.4)
|
||||||
|
ret i64 %call1.4
|
||||||
|
}
|
||||||
|
;; __reduce_max_uint64: takes the single i64 element of the argument and runs
;; five shuffle+max rounds with xor masks 16, 8, 4, 2 and 1. In every round
;; the running maximum is passed to __shfl_xor_i64_nvptx and then combined
;; with the returned value through __max_i64_unsigned. The final value is
;; returned.
;; NOTE(review): presumably this yields the unsigned maximum over all 32
;; lanes of a warp; that relies on the shfl.bfly exchange pattern -- confirm
;; against the PTX ISA.
define i64 @__reduce_max_uint64(<1 x i64>) nounwind readnone alwaysinline {
|
||||||
|
%value = extractelement <1 x i64> %0, i32 0
|
||||||
|
%call = tail call i64 @__shfl_xor_i64_nvptx(i64 %value, i32 16)
|
||||||
|
%call1 = tail call i64 @__max_i64_unsigned(i64 %value, i64 %call)
|
||||||
|
%call.1 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1, i32 8)
|
||||||
|
%call1.1 = tail call i64 @__max_i64_unsigned(i64 %call1, i64 %call.1)
|
||||||
|
%call.2 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.1, i32 4)
|
||||||
|
%call1.2 = tail call i64 @__max_i64_unsigned(i64 %call1.1, i64 %call.2)
|
||||||
|
%call.3 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.2, i32 2)
|
||||||
|
%call1.3 = tail call i64 @__max_i64_unsigned(i64 %call1.2, i64 %call.3)
|
||||||
|
%call.4 = tail call i64 @__shfl_xor_i64_nvptx(i64 %call1.3, i32 1)
|
||||||
|
%call1.4 = tail call i64 @__max_i64_unsigned(i64 %call1.3, i64 %call.4)
|
||||||
|
ret i64 %call1.4
|
||||||
}
|
}
|
||||||
|
|
||||||
|
;;;; reduce equal
|
||||||
define i1 @__reduce_equal_int32(<1 x i32> %vv, i32 * %samevalue,
|
define i1 @__reduce_equal_int32(<1 x i32> %vv, i32 * %samevalue,
|
||||||
<1 x i1> %mask) nounwind alwaysinline {
|
<1 x i1> %mask) nounwind alwaysinline {
|
||||||
%v=extractelement <1 x i32> %vv, i32 0
|
%v=extractelement <1 x i32> %vv, i32 0
|
||||||
|
|||||||
Reference in New Issue
Block a user