Jean-Luc Duprat
2012-07-17 11:46:30 -07:00
82 changed files with 3708 additions and 885 deletions


@@ -476,6 +476,9 @@ lSetInternalFunctions(llvm::Module *module) {
"__prefetch_read_uniform_nt",
"__rcp_uniform_float",
"__rcp_varying_float",
"__rdrand_i16",
"__rdrand_i32",
"__rdrand_i64",
"__reduce_add_double",
"__reduce_add_float",
"__reduce_add_int32",


@@ -76,18 +76,19 @@ declare void @abort() noreturn
;; /* NOTE: the values returned below must be the same as the
;; corresponding enumerant values in Target::ISA. */
;; if ((info[2] & (1 << 28)) != 0) {
;; // AVX1 for sure. Do we have AVX2?
;; // Call cpuid with eax=7, ecx=0
;; __cpuid_count(info, 7, 0);
;; if ((info[1] & (1 << 5)) != 0)
;; return 4; // AVX2
;; else {
;; if ((info[2] & (1 << 29)) != 0 && // F16C
;; (info[2] & (1 << 30)) != 0) // RDRAND
;; return 3; // AVX1 on IVB
;; else
;; return 2; // AVX1
;; }
;; if ((info[2] & (1 << 29)) != 0 && // F16C
;; (info[2] & (1 << 30)) != 0) { // RDRAND
;; // So far, so good. AVX2?
;; // Call cpuid with eax=7, ecx=0
;; int info2[4];
;; __cpuid_count(info2, 7, 0);
;; if ((info2[1] & (1 << 5)) != 0)
;; return 4;
;; else
;; return 3;
;; }
;; // Regular AVX
;; return 2;
;; }
;; else if ((info[2] & (1 << 19)) != 0)
;; return 1; // SSE4
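;;
;; For reference, here is the same dispatch as a self-contained C sketch.
;; Hedged: the __get_cpuid/__cpuid_count used here are GCC/Clang's <cpuid.h>
;; helpers rather than the ones assumed by the pseudocode above, the function
;; name is illustrative, and the SSE4/SSE2 branches follow the IR below
;; (ecx bit 19 and edx bit 26).
;;
;;   #include <cpuid.h>
;;   #include <stdlib.h>
;;
;;   static int get_system_isa(void) {
;;       unsigned int eax, ebx, ecx, edx;
;;       __get_cpuid(1, &eax, &ebx, &ecx, &edx);
;;       if (ecx & (1u << 28)) {                        /* AVX1 for sure */
;;           if ((ecx & (1u << 29)) != 0 &&             /* F16C */
;;               (ecx & (1u << 30)) != 0) {             /* RDRAND */
;;               unsigned int a7, b7, c7, d7;
;;               __cpuid_count(7, 0, a7, b7, c7, d7);   /* eax=7, ecx=0 */
;;               return (b7 & (1u << 5)) ? 4 : 3;       /* AVX2 : AVX1 on IVB */
;;           }
;;           return 2;                                  /* regular AVX */
;;       } else if (ecx & (1u << 19)) {
;;           return 1;                                  /* SSE4 */
;;       } else if (edx & (1u << 26)) {
;;           return 0;                                  /* SSE2 */
;;       }
;;       abort();                                       /* unsupported CPU */
;;   }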
@@ -104,40 +105,37 @@ entry:
%asmresult6.i = extractvalue { i32, i32, i32, i32 } %0, 3
%and = and i32 %asmresult5.i, 268435456
%cmp = icmp eq i32 %and, 0
br i1 %cmp, label %if.else14, label %if.then
br i1 %cmp, label %if.else13, label %if.then
if.then: ; preds = %entry
%1 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
%asmresult4.i29 = extractvalue { i32, i32, i32, i32 } %1, 1
%and3 = and i32 %asmresult4.i29, 32
%cmp4 = icmp eq i32 %and3, 0
br i1 %cmp4, label %if.else, label %return
%1 = and i32 %asmresult5.i, 1610612736
%2 = icmp eq i32 %1, 1610612736
br i1 %2, label %if.then7, label %return
if.else: ; preds = %if.then
%asmresult5.i30 = extractvalue { i32, i32, i32, i32 } %1, 2
%2 = and i32 %asmresult5.i30, 1610612736
%3 = icmp eq i32 %2, 1610612736
br i1 %3, label %return, label %if.else13
if.else13: ; preds = %if.else
if.then7: ; preds = %if.then
%3 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
%asmresult4.i28 = extractvalue { i32, i32, i32, i32 } %3, 1
%and10 = lshr i32 %asmresult4.i28, 5
%4 = and i32 %and10, 1
%5 = add i32 %4, 3
br label %return
if.else14: ; preds = %entry
%and16 = and i32 %asmresult5.i, 524288
%cmp17 = icmp eq i32 %and16, 0
br i1 %cmp17, label %if.else19, label %return
if.else13: ; preds = %entry
%and15 = and i32 %asmresult5.i, 524288
%cmp16 = icmp eq i32 %and15, 0
br i1 %cmp16, label %if.else18, label %return
if.else19: ; preds = %if.else14
%and21 = and i32 %asmresult6.i, 67108864
%cmp22 = icmp eq i32 %and21, 0
br i1 %cmp22, label %if.else24, label %return
if.else18: ; preds = %if.else13
%and20 = and i32 %asmresult6.i, 67108864
%cmp21 = icmp eq i32 %and20, 0
br i1 %cmp21, label %if.else23, label %return
if.else24: ; preds = %if.else19
if.else23: ; preds = %if.else18
tail call void @abort() noreturn nounwind
unreachable
return: ; preds = %if.else19, %if.else14, %if.else13, %if.else, %if.then
%retval.0 = phi i32 [ 2, %if.else13 ], [ 4, %if.then ], [ 3, %if.else ], [ 1, %if.else14 ], [ 0, %if.else19 ]
return: ; preds = %if.else18, %if.else13, %if.then7, %if.then
%retval.0 = phi i32 [ %5, %if.then7 ], [ 2, %if.then ], [ 1, %if.else13 ], [ 0, %if.else18 ]
ret i32 %retval.0
}


@@ -31,6 +31,8 @@
include(`target-avx-x2.ll')
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
@@ -71,9 +73,9 @@ declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind read
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)


@@ -31,6 +31,8 @@
include(`target-avx.ll')
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
@@ -71,9 +73,9 @@ declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind read
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)


@@ -29,13 +29,55 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`NO_HALF_DECLARES', `1')
include(`target-avx-x2.ll')
include(`target-avx1-x2.ll')
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <16 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <16 x i32> %ret
}
define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <16 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
@@ -86,4 +128,5 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
'
)


@@ -29,13 +29,55 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
define(`NO_HALF_DECLARES', `1')
include(`target-avx.ll')
include(`target-avx1.ll')
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max
define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <8 x i32> %ret
}
define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <8 x i32> %ret
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
@@ -70,3 +112,4 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
')


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -29,8 +29,16 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ifelse(LLVM_VERSION, `LLVM_3_0', `',
LLVM_VERSION, `LLVM_3_1', `',
`define(`HAVE_GATHER', `1')')
include(`target-avx-x2.ll')
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
@@ -66,6 +74,9 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
@@ -116,14 +127,435 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
declare void @llvm.trap() noreturn nounwind
; $1: type
; $2: var base name
define(`extract_4s', `
%$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%$2_3 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
%$2_4 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
')
; $1: type
; $2: var base name
define(`extract_8s', `
%$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')
; $1: element type
; $2: ret name
; $3: v1
; $4: v2
define(`assemble_8s', `
%$2 = shufflevector <8 x $1> %$3, <8 x $1> %$4,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')
; $1: element type
; $2: ret name
; $3: v1
; $4: v2
; $5: v3
; $6: v4
define(`assemble_4s', `
%$2_1 = shufflevector <4 x $1> %$3, <4 x $1> %$4,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$2_2 = shufflevector <4 x $1> %$5, <4 x $1> %$6,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
assemble_8s($1, $2, $2_1, $2_2)
')
ifelse(LLVM_VERSION, `LLVM_3_0', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)',
LLVM_VERSION, `LLVM_3_1', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)', `
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 gathers
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr,
<8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind
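;; For intuition, a per-lane C model of these gather intrinsics, assuming the
;; usual vpgather semantics: a lane is loaded only when the sign bit of its
;; mask element is set, and otherwise passes the %target value through (the
;; model's names are ours, not LLVM's):
;;
;;   #include <stdint.h>
;;   static void gather_d_d_model(int32_t dst[], const int32_t target[],
;;                                const uint8_t *ptr, const int32_t idx[],
;;                                const int32_t mask[], int scale, int n) {
;;       for (int i = 0; i < n; ++i)                  /* n = 8 or 4 lanes */
;;           dst[i] = (mask[i] < 0)                   /* sign bit: active */
;;               ? *(const int32_t *)(ptr + (int64_t)scale * idx[i])
;;               : target[i];
;;   }
;;
;; This is why the sign-extended ispc execution mask below can be passed
;; directly as the intrinsics' mask operand.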
define <16 x i32> @__gather_base_offsets32_i32(i8 * %ptr, i32 %scale, <16 x i32> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
extract_8s(i32, offsets)
extract_8s(i32, vecmask)
%v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
<8 x i32> %offsets_1, <8 x i32> %vecmask_1, i8 %scale8)
%v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
<8 x i32> %offsets_2, <8 x i32> %vecmask_2, i8 %scale8)
assemble_8s(i32, v, v1, v2)
ret <16 x i32> %v
}
define <16 x i32> @__gather_base_offsets64_i32(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
extract_4s(i32, vecmask)
extract_4s(i64, offsets)
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8)
%v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x i32> %vecmask_3, i8 %scale8)
%v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x i32> %vecmask_4, i8 %scale8)
assemble_4s(i32, v, v1, v2, v3, v4)
ret <16 x i32> %v
}
define <16 x i32> @__gather32_i32(<16 x i32> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
extract_8s(i32, ptrs)
extract_8s(i32, vecmask)
%v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
<8 x i32> %ptrs_1, <8 x i32> %vecmask_1, i8 1)
%v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
<8 x i32> %ptrs_2, <8 x i32> %vecmask_2, i8 1)
assemble_8s(i32, v, v1, v2)
ret <16 x i32> %v
}
define <16 x i32> @__gather64_i32(<16 x i64> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
extract_4s(i64, ptrs)
extract_4s(i32, vecmask)
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1)
%v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x i32> %vecmask_3, i8 1)
%v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x i32> %vecmask_4, i8 1)
assemble_4s(i32, v, v1, v2, v3, v4)
ret <16 x i32> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float gathers
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr,
<8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr,
<4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind
define <16 x float> @__gather_base_offsets32_float(i8 * %ptr,
i32 %scale, <16 x i32> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_8s(i32, offsets)
extract_8s(float, mask)
%v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
<8 x i32> %offsets_1, <8 x float> %mask_1, i8 %scale8)
%v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
<8 x i32> %offsets_2, <8 x float> %mask_2, i8 %scale8)
assemble_8s(float, v, v1, v2)
ret <16 x float> %v
}
define <16 x float> @__gather_base_offsets64_float(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_4s(i64, offsets)
extract_4s(float, mask)
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x float> %mask_1, i8 %scale8)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x float> %mask_2, i8 %scale8)
%v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x float> %mask_3, i8 %scale8)
%v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x float> %mask_4, i8 %scale8)
assemble_4s(float, v, v1, v2, v3, v4)
ret <16 x float> %v
}
define <16 x float> @__gather32_float(<16 x i32> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_8s(float, mask)
extract_8s(i32, ptrs)
%v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
<8 x i32> %ptrs_1, <8 x float> %mask_1, i8 1)
%v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
<8 x i32> %ptrs_2, <8 x float> %mask_2, i8 1)
assemble_8s(float, v, v1, v2)
ret <16 x float> %v
}
define <16 x float> @__gather64_float(<16 x i64> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_4s(i64, ptrs)
extract_4s(float, mask)
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x float> %mask_1, i8 1)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x float> %mask_2, i8 1)
%v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x float> %mask_3, i8 1)
%v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x float> %mask_4, i8 1)
assemble_4s(float, v, v1, v2, v3, v4)
ret <16 x float> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int64 gathers
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
define <16 x i64> @__gather_base_offsets32_i64(i8 * %ptr,
i32 %scale, <16 x i32> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i32, offsets)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
define <16 x i64> @__gather_base_offsets64_i64(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i64, offsets)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
define <16 x i64> @__gather32_i64(<16 x i32> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i32, ptrs)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_3, <4 x i64> %vecmask_3, i8 1)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_4, <4 x i64> %vecmask_4, i8 1)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
define <16 x i64> @__gather64_i64(<16 x i64> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i64, ptrs)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x i64> %vecmask_3, i8 1)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x i64> %vecmask_4, i8 1)
assemble_4s(i64, v, v1, v2, v3, v4)
ret <16 x i64> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double gathers
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
define <16 x double> @__gather_base_offsets32_double(i8 * %ptr,
i32 %scale, <16 x i32> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i32, offsets)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_3, <4 x double> %vecmask_3, i8 %scale8)
%v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_4, <4 x double> %vecmask_4, i8 %scale8)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}
define <16 x double> @__gather_base_offsets64_double(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i64, offsets)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x double> %vecmask_3, i8 %scale8)
%v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x double> %vecmask_4, i8 %scale8)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}
define <16 x double> @__gather32_double(<16 x i32> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i32, ptrs)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_3, <4 x double> %vecmask_3, i8 1)
%v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_4, <4 x double> %vecmask_4, i8 1)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}
define <16 x double> @__gather64_double(<16 x i64> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i64, ptrs)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x double> %vecmask_3, i8 1)
%v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x double> %vecmask_4, i8 1)
assemble_4s(double, v, v1, v2, v3, v4)
ret <16 x double> %v
}
')


@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without
@@ -29,8 +29,16 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ifelse(LLVM_VERSION, `LLVM_3_0', `',
LLVM_VERSION, `LLVM_3_1', `',
`define(`HAVE_GATHER', `1')')
include(`target-avx.ll')
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max
@@ -66,6 +74,9 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions
ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
@@ -100,13 +111,323 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
')
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather
declare void @llvm.trap() noreturn nounwind
define(`extract_4s', `
%$2_1 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$2_2 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
')
ifelse(LLVM_VERSION, `LLVM_3_0', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)',
LLVM_VERSION, `LLVM_3_1', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)', `
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 gathers
declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr,
<8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind
define <8 x i32> @__gather_base_offsets32_i32(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%v = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
<8 x i32> %offsets, <8 x i32> %vecmask, i8 %scale8)
ret <8 x i32> %v
}
define <8 x i32> @__gather_base_offsets64_i32(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
extract_4s(i32, vecmask)
extract_4s(i64, offsets)
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x i32> %v1, <4 x i32> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %v
}
define <8 x i32> @__gather32_i32(<8 x i32> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%v = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
<8 x i32> %ptrs, <8 x i32> %vecmask, i8 1)
ret <8 x i32> %v
}
define <8 x i32> @__gather64_i32(<8 x i64> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
extract_4s(i64, ptrs)
extract_4s(i32, vecmask)
%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1)
%v = shufflevector <4 x i32> %v1, <4 x i32> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float gathers
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr,
<8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr,
<4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind
define <8 x float> @__gather_base_offsets32_float(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <8 x i32> %vecmask to <8 x float>
%v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
<8 x i32> %offsets, <8 x float> %mask, i8 %scale8)
ret <8 x float> %v
}
define <8 x float> @__gather_base_offsets64_float(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <8 x i32> %vecmask to <8 x float>
extract_4s(i64, offsets)
extract_4s(float, mask)
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x float> %mask_1, i8 %scale8)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x float> %mask_2, i8 %scale8)
%v = shufflevector <4 x float> %v1, <4 x float> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %v
}
define <8 x float> @__gather32_float(<8 x i32> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <8 x i32> %vecmask to <8 x float>
%v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
<8 x i32> %ptrs, <8 x float> %mask, i8 1)
ret <8 x float> %v
}
define <8 x float> @__gather64_float(<8 x i64> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <8 x i32> %vecmask to <8 x float>
extract_4s(i64, ptrs)
extract_4s(float, mask)
%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x float> %mask_1, i8 1)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x float> %mask_2, i8 1)
%v = shufflevector <4 x float> %v1, <4 x float> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int64 gathers
declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
define <8 x i64> @__gather_base_offsets32_i64(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
extract_4s(i32, offsets)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}
define <8 x i64> @__gather_base_offsets64_i64(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
extract_4s(i64, offsets)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}
define <8 x i64> @__gather32_i64(<8 x i32> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
extract_4s(i32, ptrs)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}
define <8 x i64> @__gather64_i64(<8 x i64> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
extract_4s(i64, ptrs)
extract_4s(i64, vecmask)
%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double gathers
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
define <8 x double> @__gather_base_offsets32_double(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i32, offsets)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %v
}
define <8 x double> @__gather_base_offsets64_double(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i64, offsets)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %v
}
define <8 x double> @__gather32_double(<8 x i32> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i32, ptrs)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %v
}
define <8 x double> @__gather64_double(<8 x i64> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i64, ptrs)
extract_4s(double, vecmask)
%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %v
}
')

builtins/target-generic-1.ll Executable file → Normal file

@@ -34,12 +34,12 @@ masked_load(double, 8)
; define these with the macros from stdlib.m4
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(i8)
gen_scatter(i16)


@@ -32,11 +32,15 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32";
define(`MASK',`i1')
define(`HAVE_GATHER',`1')
define(`HAVE_SCATTER',`1')
include(`util.m4')
stdlib_core()
scans()
reduce_equal(WIDTH)
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; broadcast/rotate/shuffle
@@ -334,19 +338,19 @@ define void @__masked_store_blend_double(<WIDTH x double>* nocapture,
;; gather/scatter
define(`gather_scatter', `
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, <WIDTH x i32>,
i32, <WIDTH x i32>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, <WIDTH x i64>,
i32, <WIDTH x i64>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>,
<WIDTH x i1>) nounwind readonly
declare void @__scatter_base_offsets32_$1(i8* nocapture, <WIDTH x i32>,
i32, <WIDTH x i32>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets64_$1(i8* nocapture, <WIDTH x i64>,
i32, <WIDTH x i64>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets32_$1(i8* nocapture, i32, <WIDTH x i32>,
<WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets64_$1(i8* nocapture, i32, <WIDTH x i64>,
<WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>,
<WIDTH x i1>) nounwind
declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,


@@ -33,6 +33,7 @@ ctlztz()
define_prefetches()
define_shuffles()
aossoa()
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp


@@ -444,12 +444,12 @@ masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(i8)
gen_scatter(i16)


@@ -575,12 +575,12 @@ masked_load(double, 8)
; define these with the macros from stdlib.m4
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(i8)
gen_scatter(i16)


@@ -33,6 +33,7 @@ ctlztz()
define_prefetches()
define_shuffles()
aossoa()
rdrand_decls()
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats


@@ -371,12 +371,12 @@ masked_load(double, 8)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(i8)
gen_scatter(i16)


@@ -474,12 +474,12 @@ masked_load(double, 8)
; define these with the macros from stdlib.m4
gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)
gen_scatter(i8)
gen_scatter(i16)


@@ -1579,7 +1579,7 @@ declare void @__pseudo_masked_store_double(<WIDTH x double> * nocapture, <WIDTH
; Declare the pseudo-gather functions. When the ispc front-end needs
; to perform a gather, it generates a call to one of these functions,
; which have signatures:
; which ideally have these signatures:
;
; varying int8 __pseudo_gather_i8(varying int8 *, mask)
; varying int16 __pseudo_gather_i16(varying int16 *, mask)
@@ -1588,24 +1588,9 @@ declare void @__pseudo_masked_store_double(<WIDTH x double> * nocapture, <WIDTH
; varying int64 __pseudo_gather_i64(varying int64 *, mask)
; varying double __pseudo_gather_double(varying double *, mask)
;
; The GatherScatterFlattenOpt optimization pass finds these calls and then
; converts them to make calls to the following functions (when appropriate);
; these represent gathers from a common base pointer with offsets. The
; offset_scale factor scales the offsets before they are added to the base
; pointer--it should have the value 1, 2, 4, or 8. (It can always just be 1.)
; Then, the offset delta_value (guaranteed to be a compile-time constant value),
; is added to the final address. The 2, 4, 8 scales are used to match LLVM patterns
; that use the free 2/4/8 scaling available in x86 addressing calculations, and
; offset_delta feeds into the free offset calculation.
;
; varying int{8,16,32,float,64,double}
; __pseudo_gather_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base,
; int{32,64} offsets, uniform int32 offset_scale,
; int{32,64} offset_delta, mask)
;
; Then, the GSImprovementsPass optimizations finds these and either
; converts them to native gather functions or converts them to vector
; loads, if equivalent.
; However, vectors of pointers were not legal in LLVM until recently, so
; instead, it emits calls to functions that either take vectors of int32s
; or int64s, depending on the compilation target.
declare <WIDTH x i8> @__pseudo_gather32_i8(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather32_i16(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly
@@ -1621,31 +1606,106 @@ declare <WIDTH x float> @__pseudo_gather64_float(<WIDTH x i64>, <WIDTH x MASK>)
declare <WIDTH x i64> @__pseudo_gather64_i64(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x double> @__pseudo_gather64_double(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i8> @__pseudo_gather_base_offsets32_i8(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather_base_offsets32_i16(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32> @__pseudo_gather_base_offsets32_i32(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x float> @__pseudo_gather_base_offsets32_float(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather_base_offsets32_i64(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x double> @__pseudo_gather_base_offsets32_double(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
; The ImproveMemoryOps optimization pass finds these calls and then
; tries to convert them into calls to gather functions that take a uniform
; base pointer and then a varying integer offset, when possible.
;
; For targets without a native gather instruction, it is best to factor the
; integer offsets like "{1/2/4/8} * varying_offset + constant_offset",
; where varying_offset includes non-compile-time-constant values, and
; constant_offset includes compile-time constant values. (The scalar loads
; generated in turn can then take advantage of the free offsetting and scaling
; by 1/2/4/8 that the x86 addressing modes offer.)
;
; varying int{8,16,32,float,64,double}
; __pseudo_gather_factored_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base,
; int{32,64} offsets, uniform int32 offset_scale,
; int{32,64} offset_delta, mask)
;
; For targets with a gather instruction, it is better to just factor them into
; a gather from a uniform base pointer and then "{1/2/4/8} * offsets", where the
; offsets are int32/64 vectors.
;
; varying int{8,16,32,float,64,double}
; __pseudo_gather_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base,
; uniform int32 offset_scale, int{32,64} offsets, mask)
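;
; As a scalar picture of the two factorings above, with illustrative names and
; an i32 element type (not part of the ispc runtime):
;
;   #include <stdint.h>
;   /* factored form: base + offset_scale*varying_offset + offset_delta maps
;      onto x86's free scale-by-1/2/4/8 plus constant displacement */
;   static inline int32_t gather_lane_factored(const uint8_t *base,
;           int32_t offset_scale, int32_t varying_offset, int32_t offset_delta) {
;       return *(const int32_t *)(base + (int64_t)offset_scale * varying_offset
;                                 + offset_delta);
;   }
;   /* unfactored form for native-gather targets: base + offset_scale*offset */
;   static inline int32_t gather_lane(const uint8_t *base, int32_t offset_scale,
;                                     int64_t offset) {
;       return *(const int32_t *)(base + offset_scale * offset);
;   }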
declare <WIDTH x i8> @__pseudo_gather_base_offsets64_i8(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather_base_offsets64_i16(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32> @__pseudo_gather_base_offsets64_i32(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x float> @__pseudo_gather_base_offsets64_float(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather_base_offsets64_i64(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x double> @__pseudo_gather_base_offsets64_double(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i8>
@__pseudo_gather_factored_base_offsets32_i8(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16>
@__pseudo_gather_factored_base_offsets32_i16(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32>
@__pseudo_gather_factored_base_offsets32_i32(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x float>
@__pseudo_gather_factored_base_offsets32_float(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64>
@__pseudo_gather_factored_base_offsets32_i64(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x double>
@__pseudo_gather_factored_base_offsets32_double(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i8>
@__pseudo_gather_factored_base_offsets64_i8(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16>
@__pseudo_gather_factored_base_offsets64_i16(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32>
@__pseudo_gather_factored_base_offsets64_i32(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x float>
@__pseudo_gather_factored_base_offsets64_float(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64>
@__pseudo_gather_factored_base_offsets64_i64(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x double>
@__pseudo_gather_factored_base_offsets64_double(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i8>
@__pseudo_gather_base_offsets32_i8(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16>
@__pseudo_gather_base_offsets32_i16(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32>
@__pseudo_gather_base_offsets32_i32(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x float>
@__pseudo_gather_base_offsets32_float(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64>
@__pseudo_gather_base_offsets32_i64(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x double>
@__pseudo_gather_base_offsets32_double(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i8>
@__pseudo_gather_base_offsets64_i8(i8 *, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16>
@__pseudo_gather_base_offsets64_i16(i8 *, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32>
@__pseudo_gather_base_offsets64_i32(i8 *, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x float>
@__pseudo_gather_base_offsets64_float(i8 *, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64>
@__pseudo_gather_base_offsets64_i64(i8 *, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x double>
@__pseudo_gather_base_offsets64_double(i8 *, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
; Similarly to the pseudo-gathers defined above, we also declare undefined
; pseudo-scatter instructions with signatures:
@@ -1657,16 +1717,6 @@ declare <WIDTH x double> @__pseudo_gather_base_offsets64_double(i8 *, <WIDTH x i
; void __pseudo_scatter_i64(varying int64 *, varying int64 values, mask)
; void __pseudo_scatter_double(varying double *, varying double values, mask)
;
; The GatherScatterFlattenOpt optimization pass also finds these and
; transforms them to scatters like:
;
; void __pseudo_scatter_base_offsets{32,64}_i8(uniform int8 *base,
; varying int32 offsets, uniform int32 offset_scale,
; varying int{32,64} offset_delta, varying int8 values, mask)
; (and similarly for 16/32/64 bit values)
;
; And the GSImprovementsPass in turn converts these to actual native
; scatters or masked stores.
declare void @__pseudo_scatter32_i8(<WIDTH x i32>, <WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter32_i16(<WIDTH x i32>, <WIDTH x i16>, <WIDTH x MASK>) nounwind
@@ -1682,31 +1732,96 @@ declare void @__pseudo_scatter64_float(<WIDTH x i64>, <WIDTH x float>, <WIDTH x
declare void @__pseudo_scatter64_i64(<WIDTH x i64>, <WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter64_double(<WIDTH x i64>, <WIDTH x double>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_i8(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_i16(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_i32(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_float(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x float>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_i64(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_double(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x double>, <WIDTH x MASK>) nounwind
; And the ImproveMemoryOps optimization pass also finds these and
; either transforms them to scatters like:
;
; void __pseudo_scatter_factored_base_offsets{32,64}_i8(uniform int8 *base,
; varying int32 offsets, uniform int32 offset_scale,
; varying int{32,64} offset_delta, varying int8 values, mask)
; (and similarly for 16/32/64 bit values)
;
; Or, if the target has a native scatter instruction:
;
; void __pseudo_scatter_base_offsets{32,64}_i8(uniform int8 *base,
; uniform int32 offset_scale, varying int{32,64} offsets,
; varying int8 values, mask)
; (and similarly for 16/32/64 bit values)
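;
; Mirroring the gather sketch earlier, one lane of the factored scatter form
; looks roughly like this in C (illustrative names; masked-off lanes must not
; touch memory):
;
;   #include <stdint.h>
;   static inline void scatter_lane_factored(uint8_t *base, int32_t offset_scale,
;           int32_t varying_offset, int32_t offset_delta, int32_t value,
;           int lane_active) {
;       if (lane_active)
;           *(int32_t *)(base + (int64_t)offset_scale * varying_offset
;                        + offset_delta) = value;
;   }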
declare void @__pseudo_scatter_base_offsets64_i8(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_i16(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_i32(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_float(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x float>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_i64(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_double(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x double>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets32_i8(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets32_i16(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets32_i32(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets32_float(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x float>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets32_i64(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets32_double(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x double>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets64_i8(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets64_i16(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets64_i32(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets64_float(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x float>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets64_i64(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets64_double(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x double>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets32_i8(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets32_i16(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets32_i32(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets32_float(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x float>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets32_i64(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets32_double(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x double>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets64_i8(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets64_i16(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets64_i32(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets64_float(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x float>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets64_i64(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets64_double(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x double>, <WIDTH x MASK>) nounwind
declare float @__log_uniform_float(float) nounwind readnone
declare <WIDTH x float> @__log_varying_float(<WIDTH x float>) nounwind readnone
@@ -1834,143 +1949,246 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
call void @__usedouble(<WIDTH x double> %pg64_d)
%g32_8 = call <WIDTH x i8> @__gather32_i8(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %g32_8)
%g32_16 = call <WIDTH x i16> @__gather32_i16(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %g32_16)
%g32_32 = call <WIDTH x i32> @__gather32_i32(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %g32_32)
%g32_f = call <WIDTH x float> @__gather32_float(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %g32_f)
%g32_64 = call <WIDTH x i64> @__gather32_i64(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %g32_64)
%g32_d = call <WIDTH x double> @__gather32_double(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %g32_d)
%g64_8 = call <WIDTH x i8> @__gather64_i8(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %g64_8)
%g64_16 = call <WIDTH x i16> @__gather64_i16(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %g64_16)
%g64_32 = call <WIDTH x i32> @__gather64_i32(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %g64_32)
%g64_f = call <WIDTH x float> @__gather64_float(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %g64_f)
%g64_64 = call <WIDTH x i64> @__gather64_i64(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %g64_64)
%g64_d = call <WIDTH x double> @__gather64_double(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %g64_d)
ifelse(HAVE_GATHER, `1',
`
%nfpgbo32_8 = call <WIDTH x i8>
@__pseudo_gather_base_offsets32_i8(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %nfpgbo32_8)
%nfpgbo32_16 = call <WIDTH x i16>
@__pseudo_gather_base_offsets32_i16(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %nfpgbo32_16)
%nfpgbo32_32 = call <WIDTH x i32>
@__pseudo_gather_base_offsets32_i32(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %nfpgbo32_32)
%nfpgbo32_f = call <WIDTH x float>
@__pseudo_gather_base_offsets32_float(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %nfpgbo32_f)
%nfpgbo32_64 = call <WIDTH x i64>
@__pseudo_gather_base_offsets32_i64(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %nfpgbo32_64)
%nfpgbo32_d = call <WIDTH x double>
@__pseudo_gather_base_offsets32_double(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %nfpgbo32_d)
%nfpgbo64_8 = call <WIDTH x i8>
@__pseudo_gather_base_offsets64_i8(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %nfpgbo64_8)
%nfpgbo64_16 = call <WIDTH x i16>
@__pseudo_gather_base_offsets64_i16(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %nfpgbo64_16)
%nfpgbo64_32 = call <WIDTH x i32>
@__pseudo_gather_base_offsets64_i32(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %nfpgbo64_32)
%nfpgbo64_f = call <WIDTH x float>
@__pseudo_gather_base_offsets64_float(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %nfpgbo64_f)
%nfpgbo64_64 = call <WIDTH x i64>
@__pseudo_gather_base_offsets64_i64(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %nfpgbo64_64)
%nfpgbo64_d = call <WIDTH x double>
@__pseudo_gather_base_offsets64_double(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %nfpgbo64_d)
%nfgbo32_8 = call <WIDTH x i8>
@__gather_base_offsets32_i8(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %nfgbo32_8)
%nfgbo32_16 = call <WIDTH x i16>
@__gather_base_offsets32_i16(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %nfgbo32_16)
%nfgbo32_32 = call <WIDTH x i32>
@__gather_base_offsets32_i32(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %nfgbo32_32)
%nfgbo32_f = call <WIDTH x float>
@__gather_base_offsets32_float(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %nfgbo32_f)
%nfgbo32_64 = call <WIDTH x i64>
@__gather_base_offsets32_i64(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %nfgbo32_64)
%nfgbo32_d = call <WIDTH x double>
@__gather_base_offsets32_double(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %nfgbo32_d)
%nfgbo64_8 = call <WIDTH x i8>
@__gather_base_offsets64_i8(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %nfgbo64_8)
%nfgbo64_16 = call <WIDTH x i16>
@__gather_base_offsets64_i16(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %nfgbo64_16)
%nfgbo64_32 = call <WIDTH x i32>
@__gather_base_offsets64_i32(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %nfgbo64_32)
%nfgbo64_f = call <WIDTH x float>
@__gather_base_offsets64_float(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %nfgbo64_f)
%nfgbo64_64 = call <WIDTH x i64>
@__gather_base_offsets64_i64(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %nfgbo64_64)
%nfgbo64_d = call <WIDTH x double>
@__gather_base_offsets64_double(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %nfgbo64_d)
',
`
%pgbo32_8 = call <WIDTH x i8>
@__pseudo_gather_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
@__pseudo_gather_factored_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %pgbo32_8)
%pgbo32_16 = call <WIDTH x i16>
@__pseudo_gather_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
@__pseudo_gather_factored_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %pgbo32_16)
%pgbo32_32 = call <WIDTH x i32>
@__pseudo_gather_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
@__pseudo_gather_factored_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %pgbo32_32)
%pgbo32_f = call <WIDTH x float>
@__pseudo_gather_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
@__pseudo_gather_factored_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %pgbo32_f)
%pgbo32_64 = call <WIDTH x i64>
@__pseudo_gather_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
@__pseudo_gather_factored_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %pgbo32_64)
%pgbo32_d = call <WIDTH x double>
@__pseudo_gather_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
@__pseudo_gather_factored_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %pgbo32_d)
%gbo32_8 = call <WIDTH x i8>
@__gather_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %gbo32_8)
%gbo32_16 = call <WIDTH x i16>
@__gather_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %gbo32_16)
%gbo32_32 = call <WIDTH x i32>
@__gather_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %gbo32_32)
%gbo32_f = call <WIDTH x float>
@__gather_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %gbo32_f)
%gbo32_64 = call <WIDTH x i64>
@__gather_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %gbo32_64)
%gbo32_d = call <WIDTH x double>
@__gather_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %gbo32_d)
%pgbo64_8 = call <WIDTH x i8>
@__pseudo_gather_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
@__pseudo_gather_factored_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %pgbo64_8)
%pgbo64_16 = call <WIDTH x i16>
@__pseudo_gather_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
@__pseudo_gather_factored_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %pgbo64_16)
%pgbo64_32 = call <WIDTH x i32>
@__pseudo_gather_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
@__pseudo_gather_factored_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %pgbo64_32)
%pgbo64_f = call <WIDTH x float>
@__pseudo_gather_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
@__pseudo_gather_factored_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %pgbo64_f)
%pgbo64_64 = call <WIDTH x i64>
@__pseudo_gather_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
@__pseudo_gather_factored_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %pgbo64_64)
%pgbo64_d = call <WIDTH x double>
@__pseudo_gather_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
@__pseudo_gather_factored_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %pgbo64_d)
%gbo32_8 = call <WIDTH x i8>
@__gather_factored_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %gbo32_8)
%gbo32_16 = call <WIDTH x i16>
@__gather_factored_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %gbo32_16)
%gbo32_32 = call <WIDTH x i32>
@__gather_factored_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %gbo32_32)
%gbo32_f = call <WIDTH x float>
@__gather_factored_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %gbo32_f)
%gbo32_64 = call <WIDTH x i64>
@__gather_factored_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %gbo32_64)
%gbo32_d = call <WIDTH x double>
@__gather_factored_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %gbo32_d)
%gbo64_8 = call <WIDTH x i8>
@__gather_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
@__gather_factored_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %gbo64_8)
%gbo64_16 = call <WIDTH x i16>
@__gather_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
@__gather_factored_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %gbo64_16)
%gbo64_32 = call <WIDTH x i32>
@__gather_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
@__gather_factored_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %gbo64_32)
%gbo64_f = call <WIDTH x float>
@__gather_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
@__gather_factored_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %gbo64_f)
%gbo64_64 = call <WIDTH x i64>
@__gather_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
@__gather_factored_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %gbo64_64)
%gbo64_d = call <WIDTH x double>
@__gather_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %gbo64_d)
@__gather_factored_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %gbo64_d)
')
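;; (When HAVE_GATHER is set, the target supplies native gather support, so the
;; non-factored __pseudo_gather_base_offsets* and __gather_base_offsets*
;; functions are the ones referenced above; otherwise the factored variants
;; are kept live instead.)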
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; scatters
@@ -2003,61 +2221,118 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
call void @__scatter64_i64(<WIDTH x i64> %v64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter64_double(<WIDTH x i64> %v64, <WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
ifelse(HAVE_SCATTER, `1',
`
call void @__pseudo_scatter_base_offsets32_i8(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
call void @__pseudo_scatter_base_offsets32_i16(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
call void @__pseudo_scatter_base_offsets32_i32(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
call void @__pseudo_scatter_base_offsets32_float(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
call void @__pseudo_scatter_base_offsets32_i64(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
call void @__pseudo_scatter_base_offsets32_double(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
call void @__pseudo_scatter_base_offsets64_i8(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
call void @__pseudo_scatter_base_offsets64_i16(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
call void @__pseudo_scatter_base_offsets64_i32(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
call void @__pseudo_scatter_base_offsets64_float(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
call void @__pseudo_scatter_base_offsets64_i64(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
call void @__pseudo_scatter_base_offsets64_double(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i8(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i16(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i32(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_float(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i64(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_double(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i8(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i16(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i32(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_float(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i64(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_double(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
',
`
call void @__pseudo_scatter_factored_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
')
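;; (Same pattern as the gathers above: with HAVE_SCATTER set, the native,
;; non-factored scatter functions are the ones kept live; otherwise the
;; factored variants are.)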
ret void
}
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; vector ops
@@ -3196,8 +3471,42 @@ pl_done:
;;
;; $1: scalar type for which to generate functions to do gathers
define(`gen_gather_general', `
; fully general 32-bit gather, takes array of pointers encoded as vector of i32s
define <WIDTH x $1> @__gather32_$1(<WIDTH x i32> %ptrs,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
%ret_ptr = alloca <WIDTH x $1>
per_lane(WIDTH, <WIDTH x i32> %vecmask, `
%iptr_LANE_ID = extractelement <WIDTH x i32> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 *
%val_LANE_ID = load $1 * %ptr_LANE_ID
%store_ptr_LANE_ID = getelementptr <WIDTH x $1> * %ret_ptr, i32 0, i32 LANE
store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID
')
%ret = load <WIDTH x $1> * %ret_ptr
ret <WIDTH x $1> %ret
}
; fully general 64-bit gather, takes array of pointers encoded as vector of i64s
define <WIDTH x $1> @__gather64_$1(<WIDTH x i64> %ptrs,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
%ret_ptr = alloca <WIDTH x $1>
per_lane(WIDTH, <WIDTH x i32> %vecmask, `
%iptr_LANE_ID = extractelement <WIDTH x i64> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 *
%val_LANE_ID = load $1 * %ptr_LANE_ID
%store_ptr_LANE_ID = getelementptr <WIDTH x $1> * %ret_ptr, i32 0, i32 LANE
store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID
')
%ret = load <WIDTH x $1> * %ret_ptr
ret <WIDTH x $1> %ret
}
')
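;; As a sketch, the per_lane expansion above amounts to the following C-style
;; pseudocode (types illustrative):
;;
;;     for (int i = 0; i < WIDTH; ++i)
;;         if (vecmask[i])
;;             result[i] = *(T *)(uintptr_t)ptrs[i];
;;
;; Each active lane treats its integer element as a pointer and loads through
;; it; inactive lanes are left untouched.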
; vec width, type
define(`gen_gather', `
define(`gen_gather_factored', `
;; Define the utility function to do the gather operation for a single element
;; of the type
define <WIDTH x $1> @__gather_elt32_$1(i8 * %ptr, <WIDTH x i32> %offsets, i32 %offset_scale,
@@ -3245,7 +3554,7 @@ define <WIDTH x $1> @__gather_elt64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %o
}
define <WIDTH x $1> @__gather_base_offsets32_$1(i8 * %ptr, <WIDTH x i32> %offsets, i32 %offset_scale,
define <WIDTH x $1> @__gather_factored_base_offsets32_$1(i8 * %ptr, <WIDTH x i32> %offsets, i32 %offset_scale,
<WIDTH x i32> %offset_delta,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
; We can be clever and avoid the per-lane stuff for gathers if we are willing
@@ -3276,7 +3585,7 @@ define <WIDTH x $1> @__gather_base_offsets32_$1(i8 * %ptr, <WIDTH x i32> %offset
ret <WIDTH x $1> %ret`'eval(WIDTH-1)
}
define <WIDTH x $1> @__gather_base_offsets64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %offset_scale,
define <WIDTH x $1> @__gather_factored_base_offsets64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %offset_scale,
<WIDTH x i64> %offset_delta,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
; We can be clever and avoid the per-lane stuff for gathers if we are willing
@@ -3307,37 +3616,42 @@ define <WIDTH x $1> @__gather_base_offsets64_$1(i8 * %ptr, <WIDTH x i64> %offset
ret <WIDTH x $1> %ret`'eval(WIDTH-1)
}
; fully general 32-bit gather, takes array of pointers encoded as vector of i32s
define <WIDTH x $1> @__gather32_$1(<WIDTH x i32> %ptrs,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
%ret_ptr = alloca <WIDTH x $1>
per_lane(WIDTH, <WIDTH x i32> %vecmask, `
%iptr_LANE_ID = extractelement <WIDTH x i32> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 *
%val_LANE_ID = load $1 * %ptr_LANE_ID
%store_ptr_LANE_ID = getelementptr <WIDTH x $1> * %ret_ptr, i32 0, i32 LANE
store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID
')
gen_gather_general($1)
'
)
%ret = load <WIDTH x $1> * %ret_ptr
ret <WIDTH x $1> %ret
; vec width, type
define(`gen_gather', `
gen_gather_factored($1)
define <WIDTH x $1>
@__gather_base_offsets32_$1(i8 * %ptr, i32 %offset_scale,
<WIDTH x i32> %offsets,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
%scale_vec = bitcast i32 %offset_scale to <1 x i32>
%smear_scale = shufflevector <1 x i32> %scale_vec, <1 x i32> undef,
<WIDTH x i32> < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 >
%scaled_offsets = mul <WIDTH x i32> %smear_scale, %offsets
%v = call <WIDTH x $1> @__gather_factored_base_offsets32_$1(i8 * %ptr, <WIDTH x i32> %scaled_offsets, i32 1,
<WIDTH x i32> zeroinitializer, <WIDTH x i32> %vecmask)
ret <WIDTH x $1> %v
}
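;; This relies on the scalar identity base + scale*offset ==
;; base + 1*(scale*offset) + 0: pre-scaling the offsets lets the factored
;; implementation be reused with offset_scale = 1 and a zero offset_delta.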
; fully general 64-bit gather, takes array of pointers encoded as vector of i32s
define <WIDTH x $1> @__gather64_$1(<WIDTH x i64> %ptrs,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
%ret_ptr = alloca <WIDTH x $1>
per_lane(WIDTH, <WIDTH x i32> %vecmask, `
%iptr_LANE_ID = extractelement <WIDTH x i64> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 *
%val_LANE_ID = load $1 * %ptr_LANE_ID
%store_ptr_LANE_ID = getelementptr <WIDTH x $1> * %ret_ptr, i32 0, i32 LANE
store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID
')
%ret = load <WIDTH x $1> * %ret_ptr
ret <WIDTH x $1> %ret
define <WIDTH x $1>
@__gather_base_offsets64_$1(i8 * %ptr, i32 %offset_scale,
<WIDTH x i64> %offsets,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
%scale64 = zext i32 %offset_scale to i64
%scale_vec = bitcast i64 %scale64 to <1 x i64>
%smear_scale = shufflevector <1 x i64> %scale_vec, <1 x i64> undef,
<WIDTH x i32> < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 >
%scaled_offsets = mul <WIDTH x i64> %smear_scale, %offsets
%v = call <WIDTH x $1> @__gather_factored_base_offsets64_$1(i8 * %ptr, <WIDTH x i64> %scaled_offsets,
i32 1, <WIDTH x i64> zeroinitializer, <WIDTH x i32> %vecmask)
ret <WIDTH x $1> %v
}
'
)
@@ -3391,7 +3705,7 @@ define void @__scatter_elt64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %offset_s
ret void
}
define void @__scatter_base_offsets32_$1(i8* %base, <WIDTH x i32> %offsets, i32 %offset_scale,
define void @__scatter_factored_base_offsets32_$1(i8* %base, <WIDTH x i32> %offsets, i32 %offset_scale,
<WIDTH x i32> %offset_delta, <WIDTH x $1> %values,
<WIDTH x i32> %mask) nounwind alwaysinline {
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
@@ -3401,7 +3715,7 @@ define void @__scatter_base_offsets32_$1(i8* %base, <WIDTH x i32> %offsets, i32
ret void
}
define void @__scatter_base_offsets64_$1(i8* %base, <WIDTH x i64> %offsets, i32 %offset_scale,
define void @__scatter_factored_base_offsets64_$1(i8* %base, <WIDTH x i64> %offsets, i32 %offset_scale,
<WIDTH x i64> %offset_delta, <WIDTH x $1> %values,
<WIDTH x i32> %mask) nounwind alwaysinline {
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
@@ -3437,3 +3751,48 @@ define void @__scatter64_$1(<WIDTH x i64> %ptrs, <WIDTH x $1> %values,
'
)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rdrand
define(`rdrand_decls', `
declare i1 @__rdrand_i16(i16 * nocapture)
declare i1 @__rdrand_i32(i32 * nocapture)
declare i1 @__rdrand_i64(i64 * nocapture)
')
define(`rdrand_definition', `
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rdrand
declare {i16, i32} @llvm.x86.rdrand.16()
declare {i32, i32} @llvm.x86.rdrand.32()
declare {i64, i32} @llvm.x86.rdrand.64()
define i1 @__rdrand_i16(i16 * %ptr) {
%v = call {i16, i32} @llvm.x86.rdrand.16()
%v0 = extractvalue {i16, i32} %v, 0
%v1 = extractvalue {i16, i32} %v, 1
store i16 %v0, i16 * %ptr
%good = icmp ne i32 %v1, 0
ret i1 %good
}
define i1 @__rdrand_i32(i32 * %ptr) {
%v = call {i32, i32} @llvm.x86.rdrand.32()
%v0 = extractvalue {i32, i32} %v, 0
%v1 = extractvalue {i32, i32} %v, 1
store i32 %v0, i32 * %ptr
%good = icmp ne i32 %v1, 0
ret i1 %good
}
define i1 @__rdrand_i64(i64 * %ptr) {
%v = call {i64, i32} @llvm.x86.rdrand.64()
%v0 = extractvalue {i64, i32} %v, 0
%v1 = extractvalue {i64, i32} %v, 1
store i64 %v0, i64 * %ptr
%good = icmp ne i32 %v1, 0
ret i1 %good
}
')
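For comparison, the retry idiom these builtins support looks like this in
plain C++ using the corresponding compiler intrinsic (a sketch, not part of
this commit; it assumes an RDRAND-capable CPU and a compiler flag such as
gcc's -mrdrnd):

    #include <immintrin.h>

    // _rdrand32_step() returns 1 on success and 0 if the hardware could not
    // supply enough entropy, mirroring the bool returned by __rdrand_i32.
    static unsigned int random_u32() {
        unsigned int value;
        while (_rdrand32_step(&value) == 0)
            ;  // retry until the hardware delivers a random value
        return value;
    }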


@@ -140,6 +140,7 @@ Contents:
* `Basic Math Functions`_
* `Transcendental Functions`_
* `Pseudo-Random Numbers`_
* `Random Numbers`_
+ `Output Functions`_
+ `Assertions`_
@@ -2084,7 +2085,7 @@ can be declared:
soa<8> Point pts[...];
The in-memory layout of the ``Point``s has had the SOA transformation
The in-memory layout of the ``Point`` instances has had the SOA transformation
applied, such that there are 8 ``x`` values in memory followed by 8 ``y``
values, and so forth. Here is the effective declaration of ``soa<8>
Point``:
@@ -2266,7 +2267,7 @@ based on C++'s ``new`` and ``delete`` operators:
// use ptr...
delete[] ptr;
In the above code, each program instance allocates its own ``count`-sized
In the above code, each program instance allocates its own ``count`` sized
array of ``uniform int`` values, uses that memory, and then deallocates
that memory. Uses of ``new`` and ``delete`` in ``ispc`` programs are
serviced by corresponding calls to the system C library's ``malloc()`` and
@@ -2277,9 +2278,7 @@ analogous to the corresponding rules are for pointers (as described in
`Pointer Types`_.) Specifically, if a specific rate qualifier isn't
provided with the ``new`` expression, then the default is that a "varying"
``new`` is performed, where each program instance performs a unique
allocation. The allocated type, in turn, is by default ``uniform`` for
``varying`` ``new`` expressions, and ``varying`` for ``uniform`` new
expressions.
allocation. The allocated type, in turn, is by default ``uniform``.
After a pointer has been deleted, it is illegal to access the memory it
points to. However, that deletion happens on a per-program-instance basis.
@@ -3457,6 +3456,40 @@ be used to get a pseudo-random ``float`` value.
uniform unsigned int32 random(RNGState * uniform state)
uniform float frandom(uniform RNGState * uniform state)
Random Numbers
--------------
Some recent CPUs (including those based on the Intel(r) Ivy Bridge
micro-architecture) provide hardware support for generating true random
numbers. A few standard library functions make this functionality available:
::
bool rdrand(uniform int32 * uniform ptr)
bool rdrand(varying int32 * uniform ptr)
bool rdrand(uniform int32 * varying ptr)
If the processor doesn't have sufficient entropy to generate a random
number, this function fails and returns ``false``. Otherwise, the random
value is stored at the given pointer and ``true`` is returned. This
function should therefore generally be called repeatedly until it
succeeds, as follows:
::
int r;
while (rdrand(&r) == false)
; // empty loop body
In addition to the ``int32`` variants of ``rdrand()`` listed above, there
are versions that return ``int16``, ``float``, and ``int64`` values as
well.
Note that when compiling to targets other than ``avx1.1`` and ``avx2``, the
``rdrand()`` functions always return ``false``.
Output Functions
----------------
@@ -3491,7 +3524,7 @@ generates the following output on a four-wide compilation target:
::
i = 10, x = [0.000000,1.000000,2.000000,3.000000]
added to x = [1.000000,2.000000,((2.000000)),((3.000000)]
added to x = [1.000000,2.000000,((2.000000)),((3.000000))]
last print of x = [1.000000,2.000000,2.000000,3.000000]
When a varying variable is printed, the values for program instances that
@@ -4010,8 +4043,8 @@ Systems Programming Support
Atomic Operations and Memory Fences
-----------------------------------
The standard range of atomic memory operations are provided by the standard
library``ispc``, including variants to handle both uniform and varying
The standard set of atomic memory operations are provided by the standard
library, including variants to handle both uniform and varying
types as well as "local" and "global" atomics.
Local atomics provide atomic behavior across the program instances in a


@@ -1307,15 +1307,13 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val,
// offsets * offsetScale is in bytes (for all of these)
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
__vec16_i1 mask) { \
static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, __vec16_i1 mask) { \
VTYPE ret; \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
ret.v[i] = *ptr; \
} \
return ret; \
@@ -1362,14 +1360,13 @@ GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double)
// scatter
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
VTYPE val, __vec16_i1 mask) { \
static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, VTYPE val, \
__vec16_i1 mask) { \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
*ptr = val.v[i]; \
} \
}
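Each instantiation of these macros binds a vector type, its scalar element
type, and an offset vector type to one named builtin, along the lines of the
instantiations elsewhere in this file, e.g.:

    SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i32, __scatter_base_offsets32_i32)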


@@ -1375,15 +1375,13 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec32_d val,
// offsets * offsetScale is in bytes (for all of these)
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
__vec32_i1 mask) { \
static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, __vec32_i1 mask) { \
VTYPE ret; \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 32; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
ret.v[i] = *ptr; \
} \
return ret; \
@@ -1430,14 +1428,12 @@ GATHER_GENERAL(__vec32_d, double, __vec32_i64, __gather64_double)
// scatter
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
VTYPE val, __vec32_i1 mask) { \
static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, VTYPE val, __vec32_i1 mask) { \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 32; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
*ptr = val.v[i]; \
} \
}


@@ -1508,15 +1508,13 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec64_d val,
// offsets * offsetScale is in bytes (for all of these)
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
__vec64_i1 mask) { \
static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, __vec64_i1 mask) { \
VTYPE ret; \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 64; ++i) \
if ((mask.v & (1ull << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
if ((mask.v & (1ull << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
ret.v[i] = *ptr; \
} \
return ret; \
@@ -1540,7 +1538,7 @@ GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __gather_base_offsets64_doub
static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec64_i1 mask) { \
VTYPE ret; \
for (int i = 0; i < 64; ++i) \
if ((mask.v & (1ull << i)) != 0) { \
STYPE *ptr = (STYPE *)ptrs.v[i]; \
ret.v[i] = *ptr; \
} \
@@ -1563,14 +1561,12 @@ GATHER_GENERAL(__vec64_d, double, __vec64_i64, __gather64_double)
// scatter
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
VTYPE val, __vec64_i1 mask) { \
static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, VTYPE val, __vec64_i1 mask) { \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 64; ++i) \
if ((mask.v & (1ull << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
if ((mask.v & (1ull << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
*ptr = val.v[i]; \
} \
}


@@ -803,6 +803,13 @@ template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) {
// int64
static FORCEINLINE __vec16_i64 __setzero_i64() {
__vec16_i64 ret;
ret.v_lo = _mm512_setzero_epi32();
ret.v_hi = _mm512_setzero_epi32();
return ret;
}
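// (The 16 x i64 vector is carried as two 512-bit vectors of 32-bit words, so
// zeroing both halves zeroes every 64-bit lane.)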
static FORCEINLINE __vec16_i64 __add(const __vec16_i64 &a, const __vec16_i64 &b)
{
__mmask16 carry = 0;
@@ -878,7 +885,7 @@ static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, int index)
return src[index+16] | (int64_t(src[index]) << 32);
}
static FORCEINLINE __vec16_i64 __smear_i64(__vec16_i64, const int64_t &l) {
static FORCEINLINE __vec16_i64 __smear_i64(const int64_t &l) {
const int *i = (const int*)&l;
return __vec16_i64(_mm512_set_1to16_epi32(i[0]), _mm512_set_1to16_epi32(i[1]));
}
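// (Assumes a little-endian host: i[0] is the low 32 bits of l and i[1] the
// high 32 bits, each broadcast across one of the two 512-bit halves.)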
@@ -1373,6 +1380,11 @@ CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext)
CAST(__vec16_i32, int32_t, __vec16_i8, int8_t, __cast_sext)
CAST(__vec16_i16, int16_t, __vec16_i8, int8_t, __cast_sext)
static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val)
{
return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31));
}
#define CAST_SEXT_I1(TYPE)
/*
static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) { \
@@ -1389,11 +1401,6 @@ CAST_SEXT_I1(__vec16_i8)
CAST_SEXT_I1(__vec16_i16)
CAST_SEXT_I1(__vec16_i32)
static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val)
{
return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31));
}
// zero extension
CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext)
CAST(__vec16_i64, uint64_t, __vec16_i16, uint16_t, __cast_zext)
@@ -1421,6 +1428,14 @@ CAST_ZEXT_I1(__vec16_i16)
CAST_ZEXT_I1(__vec16_i32)
CAST_ZEXT_I1(__vec16_i64)
static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val)
{
__vec16_i32 ret = _mm512_setzero_epi32();
__vec16_i32 one = _mm512_set1_epi32(1);
return _mm512_mask_mov_epi32(ret, val.m, one);
}
// truncations
CAST(__vec16_i32, int32_t, __vec16_i64, int64_t, __cast_trunc)
CAST(__vec16_i16, int16_t, __vec16_i64, int64_t, __cast_trunc)
@@ -1589,11 +1604,6 @@ CAST_BITS_SCALAR(double, int64_t)
///////////////////////////////////////////////////////////////////////////
// various math functions
/*
static FORCEINLINE void __fastmath() {
}
*/
static FORCEINLINE float __round_uniform_float(float v) {
return roundf(v);
}
@@ -1659,14 +1669,25 @@ static FORCEINLINE __vec16_f __min_varying_float(__vec16_f v1, __vec16_f v2) {
return _mm512_gmin_ps(v1, v2);
}
static FORCEINLINE __vec16_i32 __max_varying_int32(__vec16_i32 v1, __vec16_i32 v2) {
return _mm512_max_epi32(v1, v2);
}
static FORCEINLINE __vec16_i32 __min_varying_int32(__vec16_i32 v1, __vec16_i32 v2) {
return _mm512_min_epi32(v1, v2);
}
static FORCEINLINE __vec16_i32 __max_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) {
return _mm512_max_epu32(v1, v2);
}
static FORCEINLINE __vec16_i32 __min_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) {
return _mm512_min_epu32(v1, v2);
}
BINARY_OP_FUNC(__vec16_d, __max_varying_double, __max_uniform_double)
BINARY_OP_FUNC(__vec16_d, __min_varying_double, __min_uniform_double)
BINARY_OP_FUNC(__vec16_i32, __max_varying_int32, __max_uniform_int32)
BINARY_OP_FUNC(__vec16_i32, __min_varying_int32, __min_uniform_int32)
BINARY_OP_FUNC(__vec16_i32, __max_varying_uint32, __max_uniform_uint32)
BINARY_OP_FUNC(__vec16_i32, __min_varying_uint32, __min_uniform_uint32)
BINARY_OP_FUNC(__vec16_i64, __max_varying_int64, __max_uniform_int64)
BINARY_OP_FUNC(__vec16_i64, __min_varying_int64, __min_uniform_int64)
BINARY_OP_FUNC(__vec16_i64, __max_varying_uint64, __max_uniform_uint64)
@@ -1940,60 +1961,33 @@ static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val,
// offsets * offsetScale is in bytes (for all of these)
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)
/*
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
__vec16_i1 mask) { \
VTYPE ret; \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
ret.v[i] = *ptr; \
} \
return ret; \
}
*/
static FORCEINLINE __vec16_i32
__gather_base_offsets32_i32(uint8_t *base, __vec16_i32 varyingOffset,
uint32_t scale, __vec16_i32 constOffset,
__vec16_i1 mask) {
__vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
__vec16_i32 offsets = __add(__mul(vscale, varyingOffset), constOffset);
__vec16_i32 tmp;
// Loop is generated by intrinsic
__gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
__vec16_i1 mask) {
__vec16_i32 tmp = _mm512_undefined_epi32();
__vec16_i32 ret = _mm512_mask_i32extgather_epi32(tmp, mask, offsets, base,
_MM_UPCONV_EPI32_NONE, 1,
_MM_UPCONV_EPI32_NONE, scale,
_MM_HINT_NONE);
return ret;
}
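// (Assumption worth noting: the intrinsic's scale operand encodes the usual
// 1/2/4/8 address scalings, so this relies on 'scale' being one of those
// values rather than an arbitrary multiplier.)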
static FORCEINLINE __vec16_f
__gather_base_offsets32_float(uint8_t *base, __vec16_i32 varyingOffset,
uint32_t scale, __vec16_i32 constOffset,
__gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
__vec16_i1 mask) {
__vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
__vec16_i32 offsets = __add(__mul(vscale, varyingOffset), constOffset);
__vec16_f tmp;
// Loop is generated by intrinsic
__vec16_f ret = _mm512_mask_i32extgather_ps(tmp, mask, offsets, base,
_MM_UPCONV_PS_NONE, 1,
__vec16_f tmp = _mm512_undefined_ps();
__vec16_f ret = _mm512_mask_i32extgather_ps(tmp, mask, offsets, base,
_MM_UPCONV_PS_NONE, scale,
_MM_HINT_NONE);
return ret;
}
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
//GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
//GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
//GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
//GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
//GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
//GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
//GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)
/*
@@ -2039,47 +2033,43 @@ static FORCEINLINE __vec16_i32 __gather64_i32(__vec16_i64 ptrs, __vec16_i1 mask)
*/
// scatter
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)
/*
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
VTYPE val, __vec16_i1 mask) { \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
*ptr = val.v[i]; \
} \
}
*/
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8)
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16)
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)
//SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8)
//SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
//SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16)
//SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
//SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
//SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
//SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)
static FORCEINLINE void
__scatter_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset,
uint32_t scale, __vec16_i32 constOffset,
__scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec16_i32 offsets,
__vec16_i32 val, __vec16_i1 mask)
{
__vec16_i32 offsets = __add(__mul(__vec16_i32(scale), varyingOffset), constOffset);
_mm512_mask_i32extscatter_epi32(b, mask, offsets, val, _MM_DOWNCONV_EPI32_NONE, 1, _MM_HINT_NONE);
_mm512_mask_i32extscatter_epi32(b, mask, offsets, val,
_MM_DOWNCONV_EPI32_NONE, scale,
_MM_HINT_NONE);
}
static FORCEINLINE void
__scatter_base_offsets32_float(void *base, const __vec16_i32 &varyingOffset,
uint32_t scale, const __vec16_i32 &constOffset,
const __vec16_f &val, const __vec16_i1 mask)
__scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets,
__vec16_f val, __vec16_i1 mask)
{
__vec16_i32 offsets = __add(__mul(varyingOffset,__vec16_i32(scale)), constOffset);
_mm512_mask_i32extscatter_ps(base, mask, offsets, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_1, _MM_HINT_NONE);
_mm512_mask_i32extscatter_ps(base, mask, offsets, val,
_MM_DOWNCONV_PS_NONE, scale,
_MM_HINT_NONE);
}
/*
static FORCEINLINE void
__scatter_base_offsets64_float(void *base, const __vec16_i64 &varyingOffset,
uint32_t scale, const __vec16_i64 &constOffset,
const __vec16_f &val, const __vec16_i1 mask)
{
__vec16_i64 offsets = __add(__mul(varyingOffset,__vec16_i64(scale)), constOffset);
_mm512_mask_i64extscatter_ps(base, mask, offsets, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_1, _MM_HINT_NONE);
}
*/
#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)
/*
static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \


@@ -2892,54 +2892,53 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec4_d val,
template<typename RetVec, typename RetScalar>
static FORCEINLINE RetVec
lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets,
uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, uint32_t scale,
__vec4_i32 offsets, __vec4_i1 mask) {
RetScalar r[4];
#if 1
// "Fast gather" trick...
offsets = __select(mask, offsets, __setzero_i32());
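// With inactive lanes forced to a zero offset, every lane loads from a
// valid address (the base pointer itself); the values fetched for
// inactive lanes are never used, so no per-lane branching is needed.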
constOffset = __select(mask, constOffset, __setzero_i32());
int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
int offset = scale * _mm_extract_epi32(offsets.v, 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1);
offset = scale * _mm_extract_epi32(offsets.v, 1);
ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2);
offset = scale * _mm_extract_epi32(offsets.v, 2);
ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3);
offset = scale * _mm_extract_epi32(offsets.v, 3);
ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
#else
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
int offset = scale * _mm_extract_epi32(offsets.v, 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1);
int offset = scale * _mm_extract_epi32(offsets.v, 1);
RetScalar *ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2);
int offset = scale * _mm_extract_epi32(offsets.v, 2);
RetScalar *ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3);
int offset = scale * _mm_extract_epi32(offsets.v, 3);
RetScalar *ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
}
@@ -2950,54 +2949,53 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets,
template<typename RetVec, typename RetScalar>
static FORCEINLINE RetVec
lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, uint32_t scale,
__vec4_i64 offsets, __vec4_i1 mask) {
RetScalar r[4];
#if 1
// "Fast gather" trick...
offsets = __select(mask, offsets, __setzero_i64());
constOffset = __select(mask, constOffset, __setzero_i64());
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1);
offset = scale * _mm_extract_epi64(offsets.v[0], 1);
ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0);
offset = scale * _mm_extract_epi64(offsets.v[1], 0);
ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1);
offset = scale * _mm_extract_epi64(offsets.v[1], 1);
ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
#else
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[0] = *ptr;
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1);
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
RetScalar *ptr = (RetScalar *)(p + offset);
r[1] = *ptr;
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0);
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
RetScalar *ptr = (RetScalar *)(p + offset);
r[2] = *ptr;
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1);
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
RetScalar *ptr = (RetScalar *)(p + offset);
r[3] = *ptr;
}
@@ -3007,87 +3005,75 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
}
static FORCEINLINE __vec4_i8
__gather_base_offsets32_i8(unsigned char *b, uint32_t scale, __vec4_i32 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, scale, offsets, mask);
}

static FORCEINLINE __vec4_i8
__gather_base_offsets64_i8(unsigned char *b, uint32_t scale, __vec4_i64 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, scale, offsets, mask);
}

static FORCEINLINE __vec4_i16
__gather_base_offsets32_i16(unsigned char *b, uint32_t scale, __vec4_i32 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, scale, offsets, mask);
}

static FORCEINLINE __vec4_i16
__gather_base_offsets64_i16(unsigned char *b, uint32_t scale, __vec4_i64 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, scale, offsets, mask);
}

static FORCEINLINE __vec4_i32
__gather_base_offsets32_i32(uint8_t *p, uint32_t scale, __vec4_i32 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i32(), uint32_t(), p, scale, offsets, mask);
}

static FORCEINLINE __vec4_i32
__gather_base_offsets64_i32(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, scale, offsets, mask);
}

static FORCEINLINE __vec4_f
__gather_base_offsets32_float(uint8_t *p, uint32_t scale, __vec4_i32 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_f(), float(), p, scale, offsets, mask);
}

static FORCEINLINE __vec4_f
__gather_base_offsets64_float(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_f(), float(), p, scale, offsets, mask);
}

static FORCEINLINE __vec4_i64
__gather_base_offsets32_i64(unsigned char *p, uint32_t scale, __vec4_i32 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, scale, offsets, mask);
}

static FORCEINLINE __vec4_i64
__gather_base_offsets64_i64(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, scale, offsets, mask);
}

static FORCEINLINE __vec4_d
__gather_base_offsets32_double(unsigned char *p, uint32_t scale, __vec4_i32 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets32(__vec4_d(), double(), p, scale, offsets, mask);
}

static FORCEINLINE __vec4_d
__gather_base_offsets64_double(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
__vec4_i1 mask) {
return lGatherBaseOffsets64(__vec4_d(), double(), p, scale, offsets, mask);
}
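The dummy leading arguments in these wrappers (e.g. __vec4_i8(), uint8_t()) are never read; they exist only so the compiler can deduce RetVec and RetScalar for the shared gather template. A compilable sketch of that idiom (Vec4f and lGather are hypothetical stand-ins, not the header's real types):

#include <cstdio>
#include <cstring>

struct Vec4f { float v[4]; };

// The unused value parameters exist solely to select the instantiation.
template <typename RetVec, typename RetScalar>
static RetVec lGather(RetVec, RetScalar, const unsigned char *p) {
    RetVec r = {};
    RetScalar s;
    std::memcpy(&s, p, sizeof(s));  // element type drives the load width
    r.v[0] = s;
    return r;
}

int main() {
    unsigned char buf[16] = { 0 };
    Vec4f g = lGather(Vec4f(), float(), buf);  // RetVec=Vec4f, RetScalar=float
    printf("%f\n", g.v[0]);
}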
template<typename RetVec, typename RetScalar>
@@ -3252,63 +3238,55 @@ static FORCEINLINE __vec4_d __gather64_double(__vec4_i64 ptrs, __vec4_i1 mask) {
#define SCATTER32_64(SUFFIX, VEC_SUFFIX, TYPE, EXTRACT) \
static FORCEINLINE void \
__scatter_base_offsets32_##SUFFIX (unsigned char *b, uint32_t scale, \
__vec4_i32 offsets, \
__vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \
uint32_t m = _mm_extract_ps(mask.v, 0); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 0)); \
*ptr = EXTRACT(val.v, 0); \
} \
m = _mm_extract_ps(mask.v, 1); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 1)); \
*ptr = EXTRACT(val.v, 1); \
} \
m = _mm_extract_ps(mask.v, 2); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 2)); \
*ptr = EXTRACT(val.v, 2); \
} \
m = _mm_extract_ps(mask.v, 3); \
if (m != 0) { \
TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 3)); \
*ptr = EXTRACT(val.v, 3); \
} \
} \
static FORCEINLINE void \
__scatter_base_offsets64_##SUFFIX(unsigned char *p, uint32_t scale, \
__vec4_i64 offsets, \
__vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \
uint32_t m = _mm_extract_ps(mask.v, 0); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 0); \
} \
m = _mm_extract_ps(mask.v, 1); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 1); \
} \
m = _mm_extract_ps(mask.v, 2); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 2); \
} \
m = _mm_extract_ps(mask.v, 3); \
if (m != 0) { \
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); \
TYPE *ptr = (TYPE *)(p + offset); \
*ptr = EXTRACT(val.v, 3); \
} \
@@ -3322,91 +3300,79 @@ SCATTER32_64(float, f, float, _mm_extract_ps_as_float)
static FORCEINLINE void
__scatter_base_offsets32_i64(unsigned char *p, uint32_t scale, __vec4_i32 offsets,
__vec4_i64 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 0);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 1);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 2);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 0);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int32_t offset = scale * _mm_extract_epi32(offsets.v, 3);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 1);
}
}

static FORCEINLINE void
__scatter_base_offsets64_i64(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
__vec4_i64 val, __vec4_i1 mask) {
uint32_t m = _mm_extract_ps(mask.v, 0);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 0);
}
m = _mm_extract_ps(mask.v, 1);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[0], 1);
}
m = _mm_extract_ps(mask.v, 2);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 0);
}
m = _mm_extract_ps(mask.v, 3);
if (m != 0) {
int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
uint64_t *ptr = (uint64_t *)(p + offset);
*ptr = _mm_extract_epi64(val.v[1], 1);
}
}

static FORCEINLINE void
__scatter_base_offsets32_double(unsigned char *p, uint32_t scale, __vec4_i32 offsets,
__vec4_d val, __vec4_i1 mask) {
__scatter_base_offsets32_i64(p, scale, offsets, val, mask);
}
static FORCEINLINE void
__scatter_base_offsets64_double(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
__vec4_d val, __vec4_i1 mask) {
__scatter_base_offsets64_i64(p, scale, offsets, val, mask);
}
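The double variants can forward directly to the i64 scatters because a scatter only moves bits: reinterpreting a double as a 64-bit integer and back is an exact round trip. A quick C++ check of that assumption (illustrative only):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
    double d = 3.14159;
    uint64_t bits;
    std::memcpy(&bits, &d, sizeof(bits));    // what the i64 scatter stores
    double back;
    std::memcpy(&back, &bits, sizeof(back)); // what a later load sees
    assert(back == d);                       // bit-exact round trip
    return 0;
}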


@@ -94,20 +94,22 @@ lGetSystemISA() {
int info[4];
__cpuid(info, 1);
if ((info[2] & (1 << 28)) != 0) { // AVX
// AVX1 for sure....
// Ivy Bridge?
if ((info[2] & (1 << 29)) != 0 && // F16C
(info[2] & (1 << 30)) != 0) { // RDRAND
// So far, so good. AVX2?
// Call cpuid with eax=7, ecx=0
int info2[4];
__cpuidex(info2, 7, 0);
if ((info2[1] & (1 << 5)) != 0)
return "avx2";
else
return "avx1.1";
}
// Regular AVX
return "avx";
}
else if ((info[2] & (1 << 19)) != 0)
return "sse4";
@@ -212,6 +214,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
// This is the case for most of them
t->hasHalf = t->hasRand = t->hasTranscendentals = false;
t->hasGather = t->hasScatter = false;
if (!strcasecmp(isa, "sse2")) {
t->isa = Target::SSE2;
@@ -253,6 +256,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-8")) {
t->isa = Target::GENERIC;
@@ -262,6 +266,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-16")) {
t->isa = Target::GENERIC;
@@ -271,6 +276,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-32")) {
t->isa = Target::GENERIC;
@@ -280,6 +286,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-64")) {
t->isa = Target::GENERIC;
@@ -289,6 +296,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->maskBitCount = 1;
t->hasHalf = true;
t->hasTranscendentals = true;
t->hasGather = t->hasScatter = true;
}
else if (!strcasecmp(isa, "generic-1")) {
t->isa = Target::GENERIC;
@@ -320,8 +328,14 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand";
t->maskingIsFree = false;
t->maskBitCount = 32;
#if !defined(LLVM_3_0)
// LLVM 3.1+ only
t->hasHalf = true;
#if !defined(LLVM_3_1)
// LLVM 3.2+ only
t->hasRand = true;
#endif
#endif
}
else if (!strcasecmp(isa, "avx1.1-x2")) {
t->isa = Target::AVX11;
@@ -330,8 +344,14 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand";
t->maskingIsFree = false;
t->maskBitCount = 32;
#if !defined(LLVM_3_0)
// LLVM 3.1+ only
t->hasHalf = true;
#if !defined(LLVM_3_1)
// LLVM 3.2+ only
t->hasRand = true;
#endif
#endif
}
#ifndef LLVM_3_0
else if (!strcasecmp(isa, "avx2")) {
@@ -342,7 +362,11 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->maskingIsFree = false;
t->maskBitCount = 32;
t->hasHalf = true;
#if !defined(LLVM_3_1)
// LLVM 3.2+ only
t->hasRand = true;
t->hasGather = true;
#endif
}
else if (!strcasecmp(isa, "avx2-x2")) {
t->isa = Target::AVX2;
@@ -352,7 +376,11 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
t->maskingIsFree = false;
t->maskBitCount = 32;
t->hasHalf = true;
#if !defined(LLVM_3_1)
// LLVM 3.2+ only
t->hasRand = true;
t->hasGather = true;
#endif
}
#endif // !LLVM_3_0
else {

ispc.h

@@ -252,9 +252,15 @@ struct Target {
conversions. */
bool hasHalf;
/** Indicates whether there is an ISA random number instruction. */
bool hasRand;
/** Indicates whether the target has a native gather instruction */
bool hasGather;
/** Indicates whether the target has a native scatter instruction */
bool hasScatter;
/** Indicates whether the target has support for transcendentals (beyond
sqrt, which we assume that all of them handle). */
bool hasTranscendentals;

opt.cpp

File diff suppressed because it is too large

@@ -265,11 +265,9 @@ def run_test(filename):
gcc_arch = '-m64'
gcc_isa=""
if options.target == 'sse2' or options.target == 'sse2-x2':
gcc_isa = '-msse3'
if options.target == 'generic-4':
gcc_isa = '-msse4.2'
if options.target == 'generic-8':
gcc_isa = '-mavx'
if (options.target == 'generic-16' or options.target == 'generic-32' or options.target == 'generic-64') \
and (options.include_file.find("knc.h")!=-1 or options.include_file.find("knc2x.h")!=-1):


@@ -4068,3 +4068,188 @@ static inline void seed_rng(uniform RNGState * uniform state,
static inline void fastmath() {
__fastmath();
}
///////////////////////////////////////////////////////////////////////////
// rdrand
static inline uniform bool rdrand(float * uniform ptr) {
if (__have_native_rand == false)
return false;
else {
uniform int32 irand;
uniform bool success = __rdrand_i32(&irand);
if (success) {
irand &= (1<<23)-1;
*ptr = floatbits(0x3F800000 | irand)-1.0f;
}
return success;
}
}
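The bit manipulation above is the standard trick for turning 23 random bits into a float in [0,1): 0x3F800000 is the encoding of 1.0f, so OR-ing in random mantissa bits yields a value uniformly spaced in [1,2), and subtracting 1.0 shifts it to [0,1). A standalone C++ illustration (assumes IEEE-754 binary32; helper name hypothetical):

#include <cstdint>
#include <cstdio>
#include <cstring>

static float bitsToUnitFloat(uint32_t irand) {
    irand &= (1u << 23) - 1;              // keep 23 mantissa bits
    uint32_t bits = 0x3F800000u | irand;  // exponent of 1.0f -> value in [1,2)
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f - 1.0f;                      // shift down to [0,1)
}

int main() {
    printf("%f %f\n", bitsToUnitFloat(0), bitsToUnitFloat(0x7FFFFF));
}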
static inline bool rdrand(varying float * uniform ptr) {
if (__have_native_rand == false)
return false;
else {
bool success = false;
foreach_active (index) {
uniform int32 irand;
if (__rdrand_i32(&irand)) {
// FIXME: it probably would be preferable, here and in the
// following rdrand() function, to do the int->float stuff
// in vector form. However, we need to be careful to not
// clobber any existing already-set values in *ptr with
// inactive lanes here...
irand &= (1<<23)-1;
*ptr = floatbits(0x3F800000 | irand)-1.0f;
success = true;
}
}
return success;
}
}
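foreach_active runs its body once per executing program instance, with index bound to that lane; that is what lets the scalar (uniform) __rdrand_i32 result be written through the varying pointer one lane at a time without touching inactive lanes. A C++ analogue of that control flow, modeling the execution mask as a plain bitmask (a sketch only; foreach_active itself is an ispc language construct):

#include <cstdint>
#include <cstdio>

int main() {
    uint32_t execMask = 0xB;               // binary 1011: lanes 0, 1, 3 active
    for (int index = 0; index < 4; ++index)
        if (execMask & (1u << index))
            printf("body runs for lane %d\n", index);
}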
static inline bool rdrand(float * ptr) {
if (__have_native_rand == false)
return false;
else {
float * uniform ptrs[programCount];
ptrs[programIndex] = ptr;
bool success = false;
foreach_active (index) {
uniform int32 irand;
if (__rdrand_i32(&irand)) {
irand &= (1<<23)-1;
*ptrs[index] = floatbits(0x3F800000 | irand)-1.0f;
success = true;
}
}
return success;
}
}
static inline uniform bool rdrand(int16 * uniform ptr) {
if (__have_native_rand == false)
return false;
else
return __rdrand_i16(ptr);
}
static inline bool rdrand(varying int16 * uniform ptr) {
if (__have_native_rand == false)
return false;
else {
bool success = false;
foreach_active (index) {
uniform int16 irand;
if (__rdrand_i16(&irand)) {
*ptr = irand;
success = true;
}
}
return success;
}
}
static inline bool rdrand(int16 * ptr) {
if (__have_native_rand == false)
return false;
else {
int16 * uniform ptrs[programCount];
ptrs[programIndex] = ptr;
bool success = false;
foreach_active (index) {
uniform int16 irand;
if (__rdrand_i16(&irand)) {
*ptrs[index] = irand;
success = true;
}
}
return success;
}
}
static inline uniform bool rdrand(int32 * uniform ptr) {
if (__have_native_rand == false)
return false;
else
return __rdrand_i32(ptr);
}
static inline bool rdrand(varying int32 * uniform ptr) {
if (__have_native_rand == false)
return false;
else {
bool success = false;
foreach_active (index) {
uniform int32 irand;
if (__rdrand_i32(&irand)) {
*ptr = irand;
success = true;
}
}
return success;
}
}
static inline bool rdrand(int32 * ptr) {
if (__have_native_rand == false)
return false;
else {
int32 * uniform ptrs[programCount];
ptrs[programIndex] = ptr;
bool success = false;
foreach_active (index) {
uniform int32 irand;
if (__rdrand_i32(&irand)) {
*ptrs[index] = irand;
success = true;
}
}
return success;
}
}
static inline uniform bool rdrand(int64 * uniform ptr) {
if (__have_native_rand == false)
return false;
else
return __rdrand_i64(ptr);
}
static inline bool rdrand(varying int64 * uniform ptr) {
if (__have_native_rand == false)
return false;
else {
bool success = false;
foreach_active (index) {
uniform int64 irand;
if (__rdrand_i64(&irand)) {
*ptr = irand;
success = true;
}
}
return success;
}
}
static inline bool rdrand(int64 * ptr) {
if (__have_native_rand == false)
return false;
else {
int64 * uniform ptrs[programCount];
ptrs[programIndex] = ptr;
bool success = false;
foreach_active (index) {
uniform int64 irand;
if (__rdrand_i64(&irand)) {
*ptrs[index] = irand;
success = true;
}
}
return success;
}
}
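All of these overloads bottom out in the __rdrand_i{16,32,64} built-ins, which expose the hardware instruction's success/failure protocol: RDRAND can transiently fail (signaled via the carry flag), so callers retry. The equivalent pattern in C++ via the standard compiler intrinsic (assumes RDRAND-capable hardware and GCC/Clang built with -mrdrnd, or MSVC):

#include <immintrin.h>
#include <cstdio>

int main() {
    unsigned int r;
    while (!_rdrand32_step(&r))  // returns 0 when no random value is ready
        ;                        // retry; failures are rare and transient
    printf("%u\n", r);
}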


@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform double a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}


@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform double a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}


@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform double a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}


@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform double a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}


@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform double a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
double *ptr = (aFOO[0] == 1234) ? (double * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}


@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform double a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
double *ptr = (aFOO[0] == 1234) ? (double * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}


@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform double a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
double *ptr = (aFOO[0] == 1234) ? (double * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}


@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform double a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
double *ptr = (aFOO[0] == 1234) ? (double * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-float-1.ispc

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform float a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-float-2.ispc

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform float a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-float-3.ispc

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform float a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-float-4.ispc

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform float a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-float-5.ispc

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform float a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
float *ptr = (aFOO[0] == 1234) ? (float * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-float-6.ispc

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform float a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
float *ptr = (aFOO[0] == 1234) ? (float * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-float-7.ispc

@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform float a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
float *ptr = (aFOO[0] == 1234) ? (float * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-float-8.ispc

@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform float a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
float *ptr = (aFOO[0] == 1234) ? (float * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}


@@ -1,19 +1,17 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int16 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int16-2.ispc

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int16 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int16-3.ispc

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int16 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int16-4.ispc

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int16 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int16-5.ispc

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int16 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int16 *ptr = (aFOO[0] == 1234) ? (int16 * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int16-6.ispc

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int16 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int16 *ptr = (aFOO[0] == 1234) ? (int16 * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int16-7.ispc

@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int16 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
int16 *ptr = (aFOO[0] == 1234) ? (int16 * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int16-8.ispc

@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int16 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
int16 *ptr = (aFOO[0] == 1234) ? (int16 * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int32-1.ispc

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int32-2.ispc

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int32-3.ispc

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int32-4.ispc

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int32-5.ispc

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int *ptr = (aFOO[0] == 1234) ? (int * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int32-6.ispc

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int *ptr = (aFOO[0] == 1234) ? (int * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int32-7.ispc

@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
int *ptr = (aFOO[0] == 1234) ? (int * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int32-8.ispc

@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
int *ptr = (aFOO[0] == 1234) ? (int * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int64-1.ispc

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int64 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int64-2.ispc

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int64 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int64-3.ispc

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int64 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int64-4.ispc

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int64 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int64-5.ispc

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int64 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int64 *ptr = (aFOO[0] == 1234) ? (int64 * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int64-6.ispc

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int64 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int64 *ptr = (aFOO[0] == 1234) ? (int64 * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int64-7.ispc

@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int64 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
int64 *ptr = (aFOO[0] == 1234) ? (int64 * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int64-8.ispc

@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int64 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
int64 *ptr = (aFOO[0] == 1234) ? (int64 * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}


@@ -1,19 +1,17 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int8 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int8-2.ispc

@@ -0,0 +1,17 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int8 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int8-3.ispc

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int8 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int8-4.ispc

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
int64 zero = 0;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int8 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
if (programIndex < 2)
g = a[programIndex+zero];
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int8-5.ispc

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int8 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int8 *ptr = (aFOO[0] == 1234) ? (int8 * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int8-6.ispc

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int8 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int8 *ptr = (aFOO[0] == 1234) ? (int8 * varying)gptr : (a + programIndex);
int g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 1 + programIndex;
}

tests/gather-int8-7.ispc

@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int8 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
int8 *ptr = (aFOO[0] == 1234) ? (int8 * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/gather-int8-8.ispc

@@ -0,0 +1,23 @@
export uniform int width() { return programCount; }
int64 zero = 0;
void *gptr;
export void f_f(uniform float RET[], uniform float aFOO[]) {
uniform int8 a[programCount];
for (uniform int i = 0; i < programCount; ++i)
a[i] = aFOO[i];
int g = 0;
int8 *ptr = (aFOO[0] == 1234) ? (int8 * varying)gptr : (a + programIndex);
if (programIndex < 2)
g = *ptr;
RET[programIndex] = g;
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
RET[0] = 1;
RET[1] = 2;
}

tests/rdrand-1.ispc

@@ -0,0 +1,21 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
RET[programIndex] = 1;
#else
uniform float r = -1;
uniform int count = 0;
while (!rdrand(&r)) {
++count;
}
RET[programIndex] = (r >= 0 && r < 1);
#endif
}
export void result(uniform float RET[]) {
RET[programIndex] = 1;
}

tests/rdrand-2.ispc

@@ -0,0 +1,19 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
RET[programIndex] = 1;
#else
float r = -1;
while (!rdrand(&r))
;
RET[programIndex] = (r >= 0 && r < 1);
#endif
}
export void result(uniform float RET[]) {
RET[programIndex] = 1;
}

tests/rdrand-3.ispc

@@ -0,0 +1,25 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
RET[programIndex] = 1;
#else
int lessHalf = 0, moreHalf = 0;
for (uniform int i = 0; i < 1024*1024; ++i) {
float r = -1;
while (!rdrand(&r))
;
if (r < 0.5) ++lessHalf;
else ++moreHalf;
}
float r = (double)lessHalf / (double)(lessHalf + moreHalf);
RET[programIndex] = (r >= .49 && r < .51);
#endif
}
export void result(uniform float RET[]) {
RET[programIndex] = 1;
}

tests/rdrand-4.ispc

@@ -0,0 +1,33 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
RET[programIndex] = 0;
#else
uniform int set[64] = { 0 };
uniform int count = 1024*1024;
for (uniform int i = 0; i < count; ++i) {
uniform int64 r;
while (!rdrand(&r))
;
for (uniform int b = 0; b < 64; ++b)
if (((unsigned int64)r >> b) & 1)
++set[b];
}
RET[programIndex] = 0;
for (uniform int b = 0; b < 64; ++b) {
float r = (double)set[b] / (double)(count);
if (!(r >= .49 && r < .51)) {
print("% % - %\n", b, r, set[b]);
++RET[programIndex];
}
}
#endif
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
}
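The .49/.51 acceptance window in this test is very conservative: each bit is set with probability 0.5 over count = 2^20 draws, so the number of set bits has standard deviation sqrt(2^20)/2 = 512, while the window tolerates a deviation of 0.01 * 2^20, roughly 10486, i.e. about 20 standard deviations. A correctly functioning RDRAND essentially never trips this check, while a stuck or heavily biased bit always does.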

tests/rdrand-5.ispc

@@ -0,0 +1,33 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
RET[programIndex] = 0;
#else
int set[32] = { 0 };
uniform int count = 1024*1024;
for (uniform int i = 0; i < count; ++i) {
int32 r;
while (!rdrand(&r))
;
for (uniform int b = 0; b < 32; ++b)
if (((unsigned int32)r >> b) & 1)
++set[b];
}
RET[programIndex] = 0;
for (uniform int b = 0; b < 32; ++b) {
float r = (double)set[b] / (double)(count);
if (!(r >= .49 && r < .51)) {
print("% % - %\n", b, r, set[b]);
++RET[programIndex];
}
}
#endif
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
}

tests/rdrand-6.ispc

@@ -0,0 +1,35 @@
export uniform int width() { return programCount; }
export void f_f(uniform float RET[], uniform float aFOO[]) {
#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
RET[programIndex] = 0;
#else
int set[32] = { 0 };
uniform int count = 1024*1024;
for (uniform int i = 0; i < count; ++i) {
uniform int32 rr[programCount];
int * ptr = rr + programIndex;
while (!rdrand(ptr))
;
int32 r = rr[programIndex];
for (uniform int b = 0; b < 32; ++b)
if (((unsigned int32)r >> b) & 1)
++set[b];
}
RET[programIndex] = 0;
for (uniform int b = 0; b < 32; ++b) {
float r = (double)set[b] / (double)(count);
if (!(r >= .49 && r < .51)) {
print("% % - %\n", b, r, set[b]);
++RET[programIndex];
}
}
#endif
}
export void result(uniform float RET[]) {
RET[programIndex] = 0;
}