Merge branch 'master' of https://github.com/ispc/ispc
@@ -476,6 +476,9 @@ lSetInternalFunctions(llvm::Module *module) {
"__prefetch_read_uniform_nt",
"__rcp_uniform_float",
"__rcp_varying_float",
"__rdrand_i16",
"__rdrand_i32",
"__rdrand_i64",
"__reduce_add_double",
"__reduce_add_float",
"__reduce_add_int32",

@@ -76,18 +76,19 @@ declare void @abort() noreturn
;; /* NOTE: the values returned below must be the same as the
;; corresponding enumerant values in Target::ISA. */
;; if ((info[2] & (1 << 28)) != 0) {
;; // AVX1 for sure. Do we have AVX2?
;; // Call cpuid with eax=7, ecx=0
;; __cpuid_count(info, 7, 0);
;; if ((info[1] & (1 << 5)) != 0)
;; return 4; // AVX2
;; else {
;; if ((info[2] & (1 << 29)) != 0 && // F16C
;; (info[2] & (1 << 30)) != 0) // RDRAND
;; return 3; // AVX1 on IVB
;; else
;; return 2; // AVX1
;; }
;; if ((info[2] & (1 << 29)) != 0 && // F16C
;; (info[2] & (1 << 30)) != 0) { // RDRAND
;; // So far, so good. AVX2?
;; // Call cpuid with eax=7, ecx=0
;; int info2[4];
;; __cpuid_count(info2, 7, 0);
;; if ((info2[1] & (1 << 5)) != 0)
;; return 4;
;; else
;; return 3;
;; }
;; // Regular AVX
;; return 2;
;; }
;; else if ((info[2] & (1 << 19)) != 0)
;; return 1; // SSE4
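;; (Summary comment, inferred from the IR below rather than stated in this
;; commit: the dispatch code returns 4 for AVX2, 3 for AVX1 with F16C and
;; RDRAND (Ivy Bridge), 2 for plain AVX, 1 for SSE4, and 0 for SSE2;
;; anything older falls through to abort().)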
@@ -104,40 +105,37 @@ entry:
%asmresult6.i = extractvalue { i32, i32, i32, i32 } %0, 3
%and = and i32 %asmresult5.i, 268435456
%cmp = icmp eq i32 %and, 0
br i1 %cmp, label %if.else14, label %if.then
br i1 %cmp, label %if.else13, label %if.then

if.then: ; preds = %entry
%1 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
%asmresult4.i29 = extractvalue { i32, i32, i32, i32 } %1, 1
%and3 = and i32 %asmresult4.i29, 32
%cmp4 = icmp eq i32 %and3, 0
br i1 %cmp4, label %if.else, label %return
%1 = and i32 %asmresult5.i, 1610612736
%2 = icmp eq i32 %1, 1610612736
br i1 %2, label %if.then7, label %return

if.else: ; preds = %if.then
%asmresult5.i30 = extractvalue { i32, i32, i32, i32 } %1, 2
%2 = and i32 %asmresult5.i30, 1610612736
%3 = icmp eq i32 %2, 1610612736
br i1 %3, label %return, label %if.else13

if.else13: ; preds = %if.else
if.then7: ; preds = %if.then
%3 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind
%asmresult4.i28 = extractvalue { i32, i32, i32, i32 } %3, 1
%and10 = lshr i32 %asmresult4.i28, 5
%4 = and i32 %and10, 1
%5 = add i32 %4, 3
br label %return

if.else14: ; preds = %entry
%and16 = and i32 %asmresult5.i, 524288
%cmp17 = icmp eq i32 %and16, 0
br i1 %cmp17, label %if.else19, label %return
if.else13: ; preds = %entry
%and15 = and i32 %asmresult5.i, 524288
%cmp16 = icmp eq i32 %and15, 0
br i1 %cmp16, label %if.else18, label %return

if.else19: ; preds = %if.else14
%and21 = and i32 %asmresult6.i, 67108864
%cmp22 = icmp eq i32 %and21, 0
br i1 %cmp22, label %if.else24, label %return
if.else18: ; preds = %if.else13
%and20 = and i32 %asmresult6.i, 67108864
%cmp21 = icmp eq i32 %and20, 0
br i1 %cmp21, label %if.else23, label %return

if.else24: ; preds = %if.else19
if.else23: ; preds = %if.else18
tail call void @abort() noreturn nounwind
unreachable

return: ; preds = %if.else19, %if.else14, %if.else13, %if.else, %if.then
%retval.0 = phi i32 [ 2, %if.else13 ], [ 4, %if.then ], [ 3, %if.else ], [ 1, %if.else14 ], [ 0, %if.else19 ]
return: ; preds = %if.else18, %if.else13, %if.then7, %if.then
%retval.0 = phi i32 [ %5, %if.then7 ], [ 2, %if.then ], [ 1, %if.else13 ], [ 0, %if.else18 ]
ret i32 %retval.0
}

@@ -31,6 +31,8 @@

include(`target-avx-x2.ll')

rdrand_decls()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

@@ -71,9 +73,9 @@ declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind read
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

@@ -31,6 +31,8 @@

include(`target-avx.ll')

rdrand_decls()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

@@ -71,9 +73,9 @@ declare <WIDTH x i16> @__float_to_half_varying(<WIDTH x float> %v) nounwind read
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

@@ -29,13 +29,55 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

define(`NO_HALF_DECLARES', `1')
include(`target-avx-x2.ll')

include(`target-avx1-x2.ll')
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <16 x i32> %ret
}

define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <16 x i32> %ret
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max

define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <16 x i32> %ret
}

define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline {
binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <16 x i32> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions

ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
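;; Illustrative only, not part of this commit: given the two intrinsics
;; declared above, a round trip through half precision looks like
;;   %h = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %v, i32 0)  ; 0 = round to nearest even
;;   %f = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %h)
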
@@ -86,4 +128,5 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}

'
)

@@ -29,13 +29,55 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

define(`NO_HALF_DECLARES', `1')
include(`target-avx.ll')

include(`target-avx1.ll')
ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1)
ret <8 x i32> %ret
}

define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1)
ret <8 x i32> %ret
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; unsigned int min/max

define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1)
ret <8 x i32> %ret
}

define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline {
binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1)
ret <8 x i32> %ret
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions

ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
@@ -70,3 +112,4 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
')

@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without

@@ -29,8 +29,16 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

ifelse(LLVM_VERSION, `LLVM_3_0', `',
LLVM_VERSION, `LLVM_3_1', `',
`define(`HAVE_GATHER', `1')')

include(`target-avx-x2.ll')

ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

@@ -66,6 +74,9 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions

ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
@@ -116,14 +127,435 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}

')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

declare void @llvm.trap() noreturn nounwind

; $1: type
; $2: var base name
define(`extract_4s', `
%$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%$2_3 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
%$2_4 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
')

; $1: type
; $2: var base name
define(`extract_8s', `
%$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef,
<8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')

; $1: element type
; $2: ret name
; $3: v1
; $4: v2
define(`assemble_8s', `
%$2 = shufflevector <8 x $1> %$3, <8 x $1> %$4,
<16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
')

; $1: element type
; $2: ret name
; $3: v1
; $4: v2
; $5: v3
; $6: v4
define(`assemble_4s', `
%$2_1 = shufflevector <4 x $1> %$3, <4 x $1> %$4,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%$2_2 = shufflevector <4 x $1> %$5, <4 x $1> %$6,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
assemble_8s($1, $2, $2_1, $2_2)
')
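
;; Illustrative only, not part of this commit: these m4 helpers split a
;; 16-wide vector into 8- or 4-wide pieces and glue partial results back
;; together. For example, extract_8s(i32, offsets) expands to
;;   %offsets_1 = shufflevector <16 x i32> %offsets, <16 x i32> undef,
;;       <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
;;   %offsets_2 = shufflevector <16 x i32> %offsets, <16 x i32> undef,
;;       <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
;; and assemble_8s(i32, v, v1, v2) concatenates the two halves again.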

ifelse(LLVM_VERSION, `LLVM_3_0', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)',
LLVM_VERSION, `LLVM_3_1', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)', `

gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 gathers

declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr,
<8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind
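
;; Summary note (added for clarity, not from this commit): for these AVX2
;; gather intrinsics, each lane loads from %ptr + %indices[i] * %scale only
;; where the sign bit of the corresponding %mask element is set; inactive
;; lanes keep the value of the passthrough operand (undef in the calls below).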

define <16 x i32> @__gather_base_offsets32_i32(i8 * %ptr, i32 %scale, <16 x i32> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
extract_8s(i32, offsets)
extract_8s(i32, vecmask)

%v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
<8 x i32> %offsets_1, <8 x i32> %vecmask_1, i8 %scale8)
%v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
<8 x i32> %offsets_2, <8 x i32> %vecmask_2, i8 %scale8)

assemble_8s(i32, v, v1, v2)

ret <16 x i32> %v
}


define <16 x i32> @__gather_base_offsets64_i32(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8

extract_4s(i32, vecmask)
extract_4s(i64, offsets)

%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8)
%v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x i32> %vecmask_3, i8 %scale8)
%v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x i32> %vecmask_4, i8 %scale8)

assemble_4s(i32, v, v1, v2, v3, v4)

ret <16 x i32> %v
}


define <16 x i32> @__gather32_i32(<16 x i32> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
extract_8s(i32, ptrs)
extract_8s(i32, vecmask)

%v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
<8 x i32> %ptrs_1, <8 x i32> %vecmask_1, i8 1)
%v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
<8 x i32> %ptrs_2, <8 x i32> %vecmask_2, i8 1)

assemble_8s(i32, v, v1, v2)

ret <16 x i32> %v
}


define <16 x i32> @__gather64_i32(<16 x i64> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
extract_4s(i64, ptrs)
extract_4s(i32, vecmask)

%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1)
%v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x i32> %vecmask_3, i8 1)
%v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x i32> %vecmask_4, i8 1)

assemble_4s(i32, v, v1, v2, v3, v4)

ret <16 x i32> %v
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float gathers

declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr,
<8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr,
<4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind

define <16 x float> @__gather_base_offsets32_float(i8 * %ptr,
i32 %scale, <16 x i32> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_8s(i32, offsets)
extract_8s(float, mask)

%v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
<8 x i32> %offsets_1, <8 x float> %mask_1, i8 %scale8)
%v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
<8 x i32> %offsets_2, <8 x float> %mask_2, i8 %scale8)

assemble_8s(float, v, v1, v2)

ret <16 x float> %v
}


define <16 x float> @__gather_base_offsets64_float(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_4s(i64, offsets)
extract_4s(float, mask)

%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x float> %mask_1, i8 %scale8)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x float> %mask_2, i8 %scale8)
%v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x float> %mask_3, i8 %scale8)
%v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x float> %mask_4, i8 %scale8)

assemble_4s(float, v, v1, v2, v3, v4)

ret <16 x float> %v
}


define <16 x float> @__gather32_float(<16 x i32> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_8s(float, mask)
extract_8s(i32, ptrs)

%v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
<8 x i32> %ptrs_1, <8 x float> %mask_1, i8 1)
%v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
<8 x i32> %ptrs_2, <8 x float> %mask_2, i8 1)

assemble_8s(float, v, v1, v2)

ret <16 x float> %v
}


define <16 x float> @__gather64_float(<16 x i64> %ptrs,
<16 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <16 x i32> %vecmask to <16 x float>
extract_4s(i64, ptrs)
extract_4s(float, mask)

%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x float> %mask_1, i8 1)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x float> %mask_2, i8 1)
%v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x float> %mask_3, i8 1)
%v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x float> %mask_4, i8 1)

assemble_4s(float, v, v1, v2, v3, v4)

ret <16 x float> %v
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int64 gathers

declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind

define <16 x i64> @__gather_base_offsets32_i64(i8 * %ptr,
i32 %scale, <16 x i32> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i32, offsets)
extract_4s(i64, vecmask)

%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8)

assemble_4s(i64, v, v1, v2, v3, v4)

ret <16 x i64> %v
}


define <16 x i64> @__gather_base_offsets64_i64(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i64, offsets)
extract_4s(i64, vecmask)

%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8)

assemble_4s(i64, v, v1, v2, v3, v4)

ret <16 x i64> %v
}


define <16 x i64> @__gather32_i64(<16 x i32> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i32, ptrs)
extract_4s(i64, vecmask)

%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_3, <4 x i64> %vecmask_3, i8 1)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_4, <4 x i64> %vecmask_4, i8 1)

assemble_4s(i64, v, v1, v2, v3, v4)

ret <16 x i64> %v
}

define <16 x i64> @__gather64_i64(<16 x i64> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <16 x i32> %mask32 to <16 x i64>
extract_4s(i64, ptrs)
extract_4s(i64, vecmask)

%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x i64> %vecmask_3, i8 1)
%v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x i64> %vecmask_4, i8 1)

assemble_4s(i64, v, v1, v2, v3, v4)

ret <16 x i64> %v
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double gathers

declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind

define <16 x double> @__gather_base_offsets32_double(i8 * %ptr,
i32 %scale, <16 x i32> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i32, offsets)
extract_4s(double, vecmask)

%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_3, <4 x double> %vecmask_3, i8 %scale8)
%v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_4, <4 x double> %vecmask_4, i8 %scale8)

assemble_4s(double, v, v1, v2, v3, v4)

ret <16 x double> %v
}


define <16 x double> @__gather_base_offsets64_double(i8 * %ptr,
i32 %scale, <16 x i64> %offsets,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i64, offsets)
extract_4s(double, vecmask)

%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)
%v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_3, <4 x double> %vecmask_3, i8 %scale8)
%v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_4, <4 x double> %vecmask_4, i8 %scale8)

assemble_4s(double, v, v1, v2, v3, v4)

ret <16 x double> %v
}


define <16 x double> @__gather32_double(<16 x i32> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i32, ptrs)
extract_4s(double, vecmask)

%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_3, <4 x double> %vecmask_3, i8 1)
%v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_4, <4 x double> %vecmask_4, i8 1)

assemble_4s(double, v, v1, v2, v3, v4)

ret <16 x double> %v
}


define <16 x double> @__gather64_double(<16 x i64> %ptrs,
<16 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <16 x i32> %mask32 to <16 x i64>
%vecmask = bitcast <16 x i64> %vecmask64 to <16 x double>
extract_4s(i64, ptrs)
extract_4s(double, vecmask)

%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x double> %vecmask_2, i8 1)
%v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_3, <4 x double> %vecmask_3, i8 1)
%v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_4, <4 x double> %vecmask_4, i8 1)

assemble_4s(double, v, v1, v2, v3, v4)

ret <16 x double> %v
}

')

@@ -1,4 +1,4 @@
;; Copyright (c) 2010-2011, Intel Corporation
;; Copyright (c) 2010-2012, Intel Corporation
;; All rights reserved.
;;
;; Redistribution and use in source and binary forms, with or without

@@ -29,8 +29,16 @@
;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

ifelse(LLVM_VERSION, `LLVM_3_0', `',
LLVM_VERSION, `LLVM_3_1', `',
`define(`HAVE_GATHER', `1')')

include(`target-avx.ll')

ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()',
LLVM_VERSION, `LLVM_3_1', `rdrand_decls()',
`rdrand_definition()')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int min/max

@@ -66,6 +74,9 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float/half conversions

ifelse(LLVM_VERSION, `LLVM_3_0', `
;; nothing to define...
', `
declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone
; 0 is round nearest even
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone
@@ -100,13 +111,323 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone {
%r = extractelement <8 x i16> %rv, i32 0
ret i16 %r
}
')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather

declare void @llvm.trap() noreturn nounwind

define(`extract_4s', `
%$2_1 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%$2_2 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
')

ifelse(LLVM_VERSION, `LLVM_3_0', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)',
LLVM_VERSION, `LLVM_3_1', `
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)', `

gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int32 gathers

declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr,
<8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind

define <8 x i32> @__gather_base_offsets32_i32(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8

%v = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr,
<8 x i32> %offsets, <8 x i32> %vecmask, i8 %scale8)

ret <8 x i32> %v
}


define <8 x i32> @__gather_base_offsets64_i32(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
extract_4s(i32, vecmask)
extract_4s(i64, offsets)

%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8)

%v = shufflevector <4 x i32> %v1, <4 x i32> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %v
}


define <8 x i32> @__gather32_i32(<8 x i32> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%v = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null,
<8 x i32> %ptrs, <8 x i32> %vecmask, i8 1)
ret <8 x i32> %v
}


define <8 x i32> @__gather64_i32(<8 x i64> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
extract_4s(i64, ptrs)
extract_4s(i32, vecmask)

%v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1)
%v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1)

%v = shufflevector <4 x i32> %v1, <4 x i32> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %v
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; float gathers

declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr,
<8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr,
<4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind

define <8 x float> @__gather_base_offsets32_float(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <8 x i32> %vecmask to <8 x float>

%v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr,
<8 x i32> %offsets, <8 x float> %mask, i8 %scale8)

ret <8 x float> %v
}


define <8 x float> @__gather_base_offsets64_float(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%mask = bitcast <8 x i32> %vecmask to <8 x float>
extract_4s(i64, offsets)
extract_4s(float, mask)

%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x float> %mask_1, i8 %scale8)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x float> %mask_2, i8 %scale8)

%v = shufflevector <4 x float> %v1, <4 x float> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %v
}


define <8 x float> @__gather32_float(<8 x i32> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <8 x i32> %vecmask to <8 x float>

%v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null,
<8 x i32> %ptrs, <8 x float> %mask, i8 1)

ret <8 x float> %v
}


define <8 x float> @__gather64_float(<8 x i64> %ptrs,
<8 x i32> %vecmask) nounwind readonly alwaysinline {
%mask = bitcast <8 x i32> %vecmask to <8 x float>
extract_4s(i64, ptrs)
extract_4s(float, mask)

%v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x float> %mask_1, i8 1)
%v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x float> %mask_2, i8 1)

%v = shufflevector <4 x float> %v1, <4 x float> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %v
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; int64 gathers

declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr,
<4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind

define <8 x i64> @__gather_base_offsets32_i64(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
extract_4s(i32, offsets)
extract_4s(i64, vecmask)

%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)

%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}


define <8 x i64> @__gather_base_offsets64_i64(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
extract_4s(i64, offsets)
extract_4s(i64, vecmask)

%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8)

%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}


define <8 x i64> @__gather32_i64(<8 x i32> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <8 x i32> %mask32 to <8 x i64>

extract_4s(i32, ptrs)
extract_4s(i64, vecmask)

%v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x i64> %vecmask_2, i8 1)
%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}


define <8 x i64> @__gather64_i64(<8 x i64> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask = sext <8 x i32> %mask32 to <8 x i64>
extract_4s(i64, ptrs)
extract_4s(i64, vecmask)

%v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x i64> %vecmask_1, i8 1)
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x i64> %vecmask_2, i8 1)

%v = shufflevector <4 x i64> %v1, <4 x i64> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x i64> %v
}

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; double gathers

declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind
declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr,
<4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind

define <8 x double> @__gather_base_offsets32_double(i8 * %ptr,
i32 %scale, <8 x i32> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i32, offsets)
extract_4s(double, vecmask)

%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i32> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)

%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %v
}

define <8 x double> @__gather_base_offsets64_double(i8 * %ptr,
i32 %scale, <8 x i64> %offsets,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%scale8 = trunc i32 %scale to i8
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i64, offsets)
extract_4s(double, vecmask)

%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_1, <4 x double> %vecmask_1, i8 %scale8)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr,
<4 x i64> %offsets_2, <4 x double> %vecmask_2, i8 %scale8)

%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %v
}

define <8 x double> @__gather32_double(<8 x i32> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i32, ptrs)
extract_4s(double, vecmask)

%v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null,
<4 x i32> %ptrs_2, <4 x double> %vecmask_2, i8 1)

%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x double> %v
}

define <8 x double> @__gather64_double(<8 x i64> %ptrs,
<8 x i32> %mask32) nounwind readonly alwaysinline {
%vecmask64 = sext <8 x i32> %mask32 to <8 x i64>
%vecmask = bitcast <8 x i64> %vecmask64 to <8 x double>
extract_4s(i64, ptrs)
extract_4s(double, vecmask)

%v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_1, <4 x double> %vecmask_1, i8 1)
%v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null,
<4 x i64> %ptrs_2, <4 x double> %vecmask_2, i8 1)

%v = shufflevector <4 x double> %v1, <4 x double> %v2,
<8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>

ret <8 x double> %v
}

')

builtins/target-generic-1.ll (Executable file → Normal file)
@@ -34,12 +34,12 @@ masked_load(double, 8)

; define these with the macros from stdlib.m4

gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

gen_scatter(i8)
gen_scatter(i16)

@@ -32,11 +32,15 @@

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32";

define(`MASK',`i1')
define(`HAVE_GATHER',`1')
define(`HAVE_SCATTER',`1')

include(`util.m4')

stdlib_core()
scans()
reduce_equal(WIDTH)
rdrand_decls()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; broadcast/rotate/shuffle
@@ -334,19 +338,19 @@ define void @__masked_store_blend_double(<WIDTH x double>* nocapture,
;; gather/scatter

define(`gather_scatter', `
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, <WIDTH x i32>,
i32, <WIDTH x i32>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, <WIDTH x i64>,
i32, <WIDTH x i64>, <WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets32_$1(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather_base_offsets64_$1(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather32_$1(<WIDTH x i32>,
<WIDTH x i1>) nounwind readonly
declare <WIDTH x $1> @__gather64_$1(<WIDTH x i64>,
<WIDTH x i1>) nounwind readonly

declare void @__scatter_base_offsets32_$1(i8* nocapture, <WIDTH x i32>,
i32, <WIDTH x i32>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets64_$1(i8* nocapture, <WIDTH x i64>,
i32, <WIDTH x i64>, <WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets32_$1(i8* nocapture, i32, <WIDTH x i32>,
<WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter_base_offsets64_$1(i8* nocapture, i32, <WIDTH x i64>,
<WIDTH x $1>, <WIDTH x i1>) nounwind
declare void @__scatter32_$1(<WIDTH x i32>, <WIDTH x $1>,
<WIDTH x i1>) nounwind
declare void @__scatter64_$1(<WIDTH x i64>, <WIDTH x $1>,

@@ -33,6 +33,7 @@ ctlztz()

define_prefetches()
define_shuffles()
aossoa()
rdrand_decls()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rcp

@@ -444,12 +444,12 @@ masked_load(double, 8)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter

gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

gen_scatter(i8)
gen_scatter(i16)

@@ -575,12 +575,12 @@ masked_load(double, 8)

; define these with the macros from stdlib.m4

gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

gen_scatter(i8)
gen_scatter(i16)

@@ -33,6 +33,7 @@ ctlztz()

define_prefetches()
define_shuffles()
aossoa()
rdrand_decls()

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rounding floats

@@ -371,12 +371,12 @@ masked_load(double, 8)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; gather/scatter

gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

gen_scatter(i8)
gen_scatter(i16)

@@ -474,12 +474,12 @@ masked_load(double, 8)

; define these with the macros from stdlib.m4

gen_gather(i8)
gen_gather(i16)
gen_gather(i32)
gen_gather(float)
gen_gather(i64)
gen_gather(double)
gen_gather_factored(i8)
gen_gather_factored(i16)
gen_gather_factored(i32)
gen_gather_factored(float)
gen_gather_factored(i64)
gen_gather_factored(double)

gen_scatter(i8)
gen_scatter(i16)

builtins/util.m4
@@ -1579,7 +1579,7 @@ declare void @__pseudo_masked_store_double(<WIDTH x double> * nocapture, <WIDTH

; Declare the pseudo-gather functions. When the ispc front-end needs
; to perform a gather, it generates a call to one of these functions,
; which have signatures:
; which ideally have these signatures:
;
; varying int8 __pseudo_gather_i8(varying int8 *, mask)
; varying int16 __pseudo_gather_i16(varying int16 *, mask)
@@ -1588,24 +1588,9 @@ declare void @__pseudo_masked_store_double(<WIDTH x double> * nocapture, <WIDTH
; varying int64 __pseudo_gather_i64(varying int64 *, mask)
; varying double __pseudo_gather_double(varying double *, mask)
;
; The GatherScatterFlattenOpt optimization pass finds these calls and then
; converts them to make calls to the following functions (when appropriate);
; these represent gathers from a common base pointer with offsets. The
; offset_scale factor scales the offsets before they are added to the base
; pointer--it should have the value 1, 2, 4, or 8. (It can always just be 1.)
; Then, the offset delta_value (guaranteed to be a compile-time constant value)
; is added to the final address. The 2, 4, 8 scales are used to match LLVM patterns
; that use the free 2/4/8 scaling available in x86 addressing calculations, and
; offset_delta feeds into the free offset calculation.
;
; varying int{8,16,32,float,64,double}
; __pseudo_gather_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base,
; int{32,64} offsets, uniform int32 offset_scale,
; int{32,64} offset_delta, mask)
;
; Then, the GSImprovementsPass optimization finds these and either
; converts them to native gather functions or converts them to vector
; loads, if equivalent.
; However, vectors of pointers were not legal in LLVM until recently, so
; instead, it emits calls to functions that either take vectors of int32s
; or int64s, depending on the compilation target.

declare <WIDTH x i8> @__pseudo_gather32_i8(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather32_i16(<WIDTH x i32>, <WIDTH x MASK>) nounwind readonly
@@ -1621,31 +1606,106 @@ declare <WIDTH x float> @__pseudo_gather64_float(<WIDTH x i64>, <WIDTH x MASK>)
declare <WIDTH x i64> @__pseudo_gather64_i64(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly
declare <WIDTH x double> @__pseudo_gather64_double(<WIDTH x i64>, <WIDTH x MASK>) nounwind readonly

declare <WIDTH x i8> @__pseudo_gather_base_offsets32_i8(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather_base_offsets32_i16(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32> @__pseudo_gather_base_offsets32_i32(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x float> @__pseudo_gather_base_offsets32_float(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather_base_offsets32_i64(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x double> @__pseudo_gather_base_offsets32_double(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
; The ImproveMemoryOps optimization pass finds these calls and then
; tries to convert them to be calls to gather functions that take a uniform
; base pointer and then a varying integer offset, when possible.
;
; For targets without a native gather instruction, it is best to factor the
; integer offsets like "{1/2/4/8} * varying_offset + constant_offset",
; where varying_offset includes non-compile time constant values, and
; constant_offset includes compile-time constant values. (The scalar loads
; generated in turn can then take advantage of the free offsetting and scale by
; 1/2/4/8 that is offered by the x86 addressing modes.)
;
; varying int{8,16,32,float,64,double}
; __pseudo_gather_factored_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base,
; int{32,64} offsets, uniform int32 offset_scale,
; int{32,64} offset_delta, mask)
;
; For targets with a gather instruction, it is better to just factor them into
; a gather from a uniform base pointer and then "{1/2/4/8} * offsets", where the
; offsets are int32/64 vectors.
;
; varying int{8,16,32,float,64,double}
; __pseudo_gather_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base,
; uniform int32 offset_scale, int{32,64} offsets, mask)
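;
; As a hedged C-style sketch (the names here are illustrative, not part of
; this file), the per-lane address math the two variants describe is:
;
;    // factored form, for targets without a native gather:
;    lane_addr = base + offset_scale * varying_offset[i] + offset_delta[i];
;    // unfactored form, for targets with a native gather:
;    lane_addr = base + offset_scale * offsets[i];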

declare <WIDTH x i8> @__pseudo_gather_base_offsets64_i8(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16> @__pseudo_gather_base_offsets64_i16(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32> @__pseudo_gather_base_offsets64_i32(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x float> @__pseudo_gather_base_offsets64_float(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64> @__pseudo_gather_base_offsets64_i64(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x double> @__pseudo_gather_base_offsets64_double(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly

declare <WIDTH x i8>
@__pseudo_gather_factored_base_offsets32_i8(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16>
@__pseudo_gather_factored_base_offsets32_i16(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32>
@__pseudo_gather_factored_base_offsets32_i32(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x float>
@__pseudo_gather_factored_base_offsets32_float(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64>
@__pseudo_gather_factored_base_offsets32_i64(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x double>
@__pseudo_gather_factored_base_offsets32_double(i8 *, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly

declare <WIDTH x i8>
@__pseudo_gather_factored_base_offsets64_i8(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16>
@__pseudo_gather_factored_base_offsets64_i16(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32>
@__pseudo_gather_factored_base_offsets64_i32(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x float>
@__pseudo_gather_factored_base_offsets64_float(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64>
@__pseudo_gather_factored_base_offsets64_i64(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x double>
@__pseudo_gather_factored_base_offsets64_double(i8 *, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly

declare <WIDTH x i8>
@__pseudo_gather_base_offsets32_i8(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16>
@__pseudo_gather_base_offsets32_i16(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32>
@__pseudo_gather_base_offsets32_i32(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x float>
@__pseudo_gather_base_offsets32_float(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64>
@__pseudo_gather_base_offsets32_i64(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x double>
@__pseudo_gather_base_offsets32_double(i8 *, i32, <WIDTH x i32>,
<WIDTH x MASK>) nounwind readonly

declare <WIDTH x i8>
@__pseudo_gather_base_offsets64_i8(i8 *, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i16>
@__pseudo_gather_base_offsets64_i16(i8 *, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i32>
@__pseudo_gather_base_offsets64_i32(i8 *, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x float>
@__pseudo_gather_base_offsets64_float(i8 *, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x i64>
@__pseudo_gather_base_offsets64_i64(i8 *, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly
declare <WIDTH x double>
@__pseudo_gather_base_offsets64_double(i8 *, i32, <WIDTH x i64>,
<WIDTH x MASK>) nounwind readonly

; Similarly to the pseudo-gathers defined above, we also declare undefined
; pseudo-scatter instructions with signatures:
@@ -1657,16 +1717,6 @@ declare <WIDTH x double> @__pseudo_gather_base_offsets64_double(i8 *, <WIDTH x i
; void __pseudo_scatter_i64(varying int64 *, varying int64 values, mask)
; void __pseudo_scatter_double(varying double *, varying double values, mask)
;
; The GatherScatterFlattenOpt optimization pass also finds these and
; transforms them to scatters like:
;
; void __pseudo_scatter_base_offsets{32,64}_i8(uniform int8 *base,
; varying int32 offsets, uniform int32 offset_scale,
; varying int{32,64} offset_delta, varying int8 values, mask)
; (and similarly for 16/32/64 bit values)
;
; And the GSImprovementsPass in turn converts these to actual native
; scatters or masked stores.

declare void @__pseudo_scatter32_i8(<WIDTH x i32>, <WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter32_i16(<WIDTH x i32>, <WIDTH x i16>, <WIDTH x MASK>) nounwind
@@ -1682,31 +1732,96 @@ declare void @__pseudo_scatter64_float(<WIDTH x i64>, <WIDTH x float>, <WIDTH x
declare void @__pseudo_scatter64_i64(<WIDTH x i64>, <WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter64_double(<WIDTH x i64>, <WIDTH x double>, <WIDTH x MASK>) nounwind

declare void @__pseudo_scatter_base_offsets32_i8(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_i16(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_i32(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_float(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x float>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_i64(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets32_double(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x double>, <WIDTH x MASK>) nounwind
; And the ImproveMemoryOps optimization pass also finds these and
; either transforms them to scatters like:
;
; void __pseudo_scatter_factored_base_offsets{32,64}_i8(uniform int8 *base,
; varying int32 offsets, uniform int32 offset_scale,
; varying int{32,64} offset_delta, varying int8 values, mask)
; (and similarly for 16/32/64 bit values)
;
; Or, if the target has a native scatter instruction:
;
; void __pseudo_scatter_base_offsets{32,64}_i8(uniform int8 *base,
; uniform int32 offset_scale, varying int{32,64} offsets,
; varying int8 values, mask)
; (and similarly for 16/32/64 bit values)
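;
; As a hedged sketch (illustrative names, not from this file), each active
; lane of the factored form stores
;    *(base + offset_scale * offsets[i] + offset_delta[i]) = values[i];
; while the unfactored form for native-scatter targets stores
;    *(base + offset_scale * offsets[i]) = values[i];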

declare void @__pseudo_scatter_base_offsets64_i8(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_i16(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_i32(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_float(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x float>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_i64(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void @__pseudo_scatter_base_offsets64_double(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x double>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets32_i8(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets32_i16(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets32_i32(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets32_float(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x float>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets32_i64(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets32_double(i8 * nocapture, <WIDTH x i32>, i32, <WIDTH x i32>,
<WIDTH x double>, <WIDTH x MASK>) nounwind

declare void
@__pseudo_scatter_factored_base_offsets64_i8(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets64_i16(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets64_i32(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets64_float(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x float>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets64_i64(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_factored_base_offsets64_double(i8 * nocapture, <WIDTH x i64>, i32, <WIDTH x i64>,
<WIDTH x double>, <WIDTH x MASK>) nounwind

declare void
@__pseudo_scatter_base_offsets32_i8(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets32_i16(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets32_i32(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets32_float(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x float>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets32_i64(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets32_double(i8 * nocapture, i32, <WIDTH x i32>,
<WIDTH x double>, <WIDTH x MASK>) nounwind

declare void
@__pseudo_scatter_base_offsets64_i8(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x i8>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets64_i16(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x i16>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets64_i32(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x i32>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets64_float(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x float>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets64_i64(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x i64>, <WIDTH x MASK>) nounwind
declare void
@__pseudo_scatter_base_offsets64_double(i8 * nocapture, i32, <WIDTH x i64>,
<WIDTH x double>, <WIDTH x MASK>) nounwind

declare float @__log_uniform_float(float) nounwind readnone
declare <WIDTH x float> @__log_varying_float(<WIDTH x float>) nounwind readnone
@@ -1834,143 +1949,246 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
call void @__usedouble(<WIDTH x double> %pg64_d)

%g32_8 = call <WIDTH x i8> @__gather32_i8(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %g32_8)
%g32_16 = call <WIDTH x i16> @__gather32_i16(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %g32_16)
%g32_32 = call <WIDTH x i32> @__gather32_i32(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %g32_32)
%g32_f = call <WIDTH x float> @__gather32_float(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %g32_f)
%g32_64 = call <WIDTH x i64> @__gather32_i64(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %g32_64)
%g32_d = call <WIDTH x double> @__gather32_double(<WIDTH x i32> %v32,
<WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %g32_d)

%g64_8 = call <WIDTH x i8> @__gather64_i8(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %g64_8)
%g64_16 = call <WIDTH x i16> @__gather64_i16(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %g64_16)
%g64_32 = call <WIDTH x i32> @__gather64_i32(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %g64_32)
%g64_f = call <WIDTH x float> @__gather64_float(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %g64_f)
%g64_64 = call <WIDTH x i64> @__gather64_i64(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %g64_64)
%g64_d = call <WIDTH x double> @__gather64_double(<WIDTH x i64> %v64,
<WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %g64_d)

ifelse(HAVE_GATHER, `1',
`
%nfpgbo32_8 = call <WIDTH x i8>
@__pseudo_gather_base_offsets32_i8(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %nfpgbo32_8)
%nfpgbo32_16 = call <WIDTH x i16>
@__pseudo_gather_base_offsets32_i16(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %nfpgbo32_16)
%nfpgbo32_32 = call <WIDTH x i32>
@__pseudo_gather_base_offsets32_i32(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %nfpgbo32_32)
%nfpgbo32_f = call <WIDTH x float>
@__pseudo_gather_base_offsets32_float(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %nfpgbo32_f)
%nfpgbo32_64 = call <WIDTH x i64>
@__pseudo_gather_base_offsets32_i64(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %nfpgbo32_64)
%nfpgbo32_d = call <WIDTH x double>
@__pseudo_gather_base_offsets32_double(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %nfpgbo32_d)

%nfpgbo64_8 = call <WIDTH x i8>
@__pseudo_gather_base_offsets64_i8(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %nfpgbo64_8)
%nfpgbo64_16 = call <WIDTH x i16>
@__pseudo_gather_base_offsets64_i16(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %nfpgbo64_16)
%nfpgbo64_32 = call <WIDTH x i32>
@__pseudo_gather_base_offsets64_i32(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %nfpgbo64_32)
%nfpgbo64_f = call <WIDTH x float>
@__pseudo_gather_base_offsets64_float(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %nfpgbo64_f)
%nfpgbo64_64 = call <WIDTH x i64>
@__pseudo_gather_base_offsets64_i64(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %nfpgbo64_64)
%nfpgbo64_d = call <WIDTH x double>
@__pseudo_gather_base_offsets64_double(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %nfpgbo64_d)

%nfgbo32_8 = call <WIDTH x i8>
@__gather_base_offsets32_i8(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %nfgbo32_8)
%nfgbo32_16 = call <WIDTH x i16>
@__gather_base_offsets32_i16(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %nfgbo32_16)
%nfgbo32_32 = call <WIDTH x i32>
@__gather_base_offsets32_i32(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %nfgbo32_32)
%nfgbo32_f = call <WIDTH x float>
@__gather_base_offsets32_float(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %nfgbo32_f)
%nfgbo32_64 = call <WIDTH x i64>
@__gather_base_offsets32_i64(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %nfgbo32_64)
%nfgbo32_d = call <WIDTH x double>
@__gather_base_offsets32_double(i8 * %ptr, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %nfgbo32_d)

%nfgbo64_8 = call <WIDTH x i8>
@__gather_base_offsets64_i8(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %nfgbo64_8)
%nfgbo64_16 = call <WIDTH x i16>
@__gather_base_offsets64_i16(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %nfgbo64_16)
%nfgbo64_32 = call <WIDTH x i32>
@__gather_base_offsets64_i32(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %nfgbo64_32)
%nfgbo64_f = call <WIDTH x float>
@__gather_base_offsets64_float(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %nfgbo64_f)
%nfgbo64_64 = call <WIDTH x i64>
@__gather_base_offsets64_i64(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %nfgbo64_64)
%nfgbo64_d = call <WIDTH x double>
@__gather_base_offsets64_double(i8 * %ptr, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %nfgbo64_d)
',
`
%pgbo32_8 = call <WIDTH x i8>
@__pseudo_gather_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
@__pseudo_gather_factored_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %pgbo32_8)
%pgbo32_16 = call <WIDTH x i16>
@__pseudo_gather_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
@__pseudo_gather_factored_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %pgbo32_16)
%pgbo32_32 = call <WIDTH x i32>
@__pseudo_gather_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
@__pseudo_gather_factored_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %pgbo32_32)
%pgbo32_f = call <WIDTH x float>
@__pseudo_gather_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
@__pseudo_gather_factored_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %pgbo32_f)
%pgbo32_64 = call <WIDTH x i64>
@__pseudo_gather_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
@__pseudo_gather_factored_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %pgbo32_64)
%pgbo32_d = call <WIDTH x double>
@__pseudo_gather_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
@__pseudo_gather_factored_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %pgbo32_d)

%gbo32_8 = call <WIDTH x i8>
@__gather_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %gbo32_8)
%gbo32_16 = call <WIDTH x i16>
@__gather_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %gbo32_16)
%gbo32_32 = call <WIDTH x i32>
@__gather_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %gbo32_32)
%gbo32_f = call <WIDTH x float>
@__gather_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %gbo32_f)
%gbo32_64 = call <WIDTH x i64>
@__gather_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %gbo32_64)
%gbo32_d = call <WIDTH x double>
@__gather_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %gbo32_d)


%pgbo64_8 = call <WIDTH x i8>
@__pseudo_gather_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
@__pseudo_gather_factored_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %pgbo64_8)
%pgbo64_16 = call <WIDTH x i16>
@__pseudo_gather_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
@__pseudo_gather_factored_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %pgbo64_16)
%pgbo64_32 = call <WIDTH x i32>
@__pseudo_gather_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
@__pseudo_gather_factored_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %pgbo64_32)
%pgbo64_f = call <WIDTH x float>
@__pseudo_gather_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
@__pseudo_gather_factored_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %pgbo64_f)
%pgbo64_64 = call <WIDTH x i64>
@__pseudo_gather_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
@__pseudo_gather_factored_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %pgbo64_64)
%pgbo64_d = call <WIDTH x double>
@__pseudo_gather_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
@__pseudo_gather_factored_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %pgbo64_d)

%gbo32_8 = call <WIDTH x i8>
@__gather_factored_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %gbo32_8)
%gbo32_16 = call <WIDTH x i16>
@__gather_factored_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %gbo32_16)
%gbo32_32 = call <WIDTH x i32>
@__gather_factored_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %gbo32_32)
%gbo32_f = call <WIDTH x float>
@__gather_factored_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %gbo32_f)
%gbo32_64 = call <WIDTH x i64>
@__gather_factored_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %gbo32_64)
%gbo32_d = call <WIDTH x double>
@__gather_factored_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %gbo32_d)

%gbo64_8 = call <WIDTH x i8>
@__gather_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
@__gather_factored_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use8(<WIDTH x i8> %gbo64_8)
%gbo64_16 = call <WIDTH x i16>
@__gather_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
@__gather_factored_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use16(<WIDTH x i16> %gbo64_16)
%gbo64_32 = call <WIDTH x i32>
@__gather_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
@__gather_factored_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use32(<WIDTH x i32> %gbo64_32)
%gbo64_f = call <WIDTH x float>
@__gather_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
@__gather_factored_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usefloat(<WIDTH x float> %gbo64_f)
%gbo64_64 = call <WIDTH x i64>
@__gather_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
@__gather_factored_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__use64(<WIDTH x i64> %gbo64_64)
%gbo64_d = call <WIDTH x double>
@__gather_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %gbo64_d)
@__gather_factored_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__usedouble(<WIDTH x double> %pgbo64_d)
')

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; scatters
@@ -2003,61 +2221,118 @@ define void @__keep_funcs_live(i8 * %ptr, <WIDTH x i8> %v8, <WIDTH x i16> %v16,
call void @__scatter64_i64(<WIDTH x i64> %v64, <WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter64_double(<WIDTH x i64> %v64, <WIDTH x double> %vd, <WIDTH x MASK> %mask)

call void @__pseudo_scatter_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
ifelse(HAVE_SCATTER, `1',
`
call void @__pseudo_scatter_base_offsets32_i8(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
call void @__pseudo_scatter_base_offsets32_i16(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
call void @__pseudo_scatter_base_offsets32_i32(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
call void @__pseudo_scatter_base_offsets32_float(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
call void @__pseudo_scatter_base_offsets32_i64(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
call void @__pseudo_scatter_base_offsets32_double(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)

call void @__pseudo_scatter_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
call void @__pseudo_scatter_base_offsets64_i8(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
call void @__pseudo_scatter_base_offsets64_i16(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
call void @__pseudo_scatter_base_offsets64_i32(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
call void @__pseudo_scatter_base_offsets64_float(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
call void @__pseudo_scatter_base_offsets64_i64(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
call void @__pseudo_scatter_base_offsets64_double(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)

call void @__scatter_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i8(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i16(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i32(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_float(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_i64(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets32_double(i8 * %ptr, i32 0, <WIDTH x i32> %v32,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)

call void @__scatter_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i8(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i16(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i32(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_float(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_i64(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter_base_offsets64_double(i8 * %ptr, i32 0, <WIDTH x i64> %v64,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
',
`
call void @__pseudo_scatter_factored_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)

call void @__pseudo_scatter_factored_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__pseudo_scatter_factored_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)

call void @__scatter_factored_base_offsets32_i8(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets32_i16(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets32_i32(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets32_float(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets32_i64(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets32_double(i8 * %ptr, <WIDTH x i32> %v32, i32 0, <WIDTH x i32> %v32,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)

call void @__scatter_factored_base_offsets64_i8(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i8> %v8, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets64_i16(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i16> %v16, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets64_i32(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i32> %v32, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets64_float(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x float> %vf, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets64_i64(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x i64> %v64, <WIDTH x MASK> %mask)
call void @__scatter_factored_base_offsets64_double(i8 * %ptr, <WIDTH x i64> %v64, i32 0, <WIDTH x i64> %v64,
<WIDTH x double> %vd, <WIDTH x MASK> %mask)
')

ret void
}


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; vector ops

@@ -3196,8 +3471,42 @@ pl_done:
;;
;; $1: scalar type for which to generate functions to do gathers

define(`gen_gather_general', `
; fully general 32-bit gather, takes array of pointers encoded as vector of i32s
define <WIDTH x $1> @__gather32_$1(<WIDTH x i32> %ptrs,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
%ret_ptr = alloca <WIDTH x $1>
per_lane(WIDTH, <WIDTH x i32> %vecmask, `
%iptr_LANE_ID = extractelement <WIDTH x i32> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 *
%val_LANE_ID = load $1 * %ptr_LANE_ID
%store_ptr_LANE_ID = getelementptr <WIDTH x $1> * %ret_ptr, i32 0, i32 LANE
store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID
')

%ret = load <WIDTH x $1> * %ret_ptr
ret <WIDTH x $1> %ret
}

; fully general 64-bit gather, takes array of pointers encoded as vector of i64s
define <WIDTH x $1> @__gather64_$1(<WIDTH x i64> %ptrs,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
%ret_ptr = alloca <WIDTH x $1>
per_lane(WIDTH, <WIDTH x i32> %vecmask, `
%iptr_LANE_ID = extractelement <WIDTH x i64> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 *
%val_LANE_ID = load $1 * %ptr_LANE_ID
%store_ptr_LANE_ID = getelementptr <WIDTH x $1> * %ret_ptr, i32 0, i32 LANE
store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID
')

%ret = load <WIDTH x $1> * %ret_ptr
ret <WIDTH x $1> %ret
}
')

; vec width, type
define(`gen_gather', `
define(`gen_gather_factored', `
;; Define the utility function to do the gather operation for a single element
;; of the type
define <WIDTH x $1> @__gather_elt32_$1(i8 * %ptr, <WIDTH x i32> %offsets, i32 %offset_scale,
@@ -3245,7 +3554,7 @@ define <WIDTH x $1> @__gather_elt64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %o
}


define <WIDTH x $1> @__gather_base_offsets32_$1(i8 * %ptr, <WIDTH x i32> %offsets, i32 %offset_scale,
define <WIDTH x $1> @__gather_factored_base_offsets32_$1(i8 * %ptr, <WIDTH x i32> %offsets, i32 %offset_scale,
<WIDTH x i32> %offset_delta,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
; We can be clever and avoid the per-lane stuff for gathers if we are willing
@@ -3276,7 +3585,7 @@ define <WIDTH x $1> @__gather_base_offsets32_$1(i8 * %ptr, <WIDTH x i32> %offset
ret <WIDTH x $1> %ret`'eval(WIDTH-1)
}

define <WIDTH x $1> @__gather_base_offsets64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %offset_scale,
define <WIDTH x $1> @__gather_factored_base_offsets64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %offset_scale,
<WIDTH x i64> %offset_delta,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
; We can be clever and avoid the per-lane stuff for gathers if we are willing
@@ -3307,37 +3616,42 @@ define <WIDTH x $1> @__gather_base_offsets64_$1(i8 * %ptr, <WIDTH x i64> %offset
ret <WIDTH x $1> %ret`'eval(WIDTH-1)
}

; fully general 32-bit gather, takes array of pointers encoded as vector of i32s
define <WIDTH x $1> @__gather32_$1(<WIDTH x i32> %ptrs,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
%ret_ptr = alloca <WIDTH x $1>
per_lane(WIDTH, <WIDTH x i32> %vecmask, `
%iptr_LANE_ID = extractelement <WIDTH x i32> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 *
%val_LANE_ID = load $1 * %ptr_LANE_ID
%store_ptr_LANE_ID = getelementptr <WIDTH x $1> * %ret_ptr, i32 0, i32 LANE
store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID
')
gen_gather_general($1)
'
)

%ret = load <WIDTH x $1> * %ret_ptr
ret <WIDTH x $1> %ret
; vec width, type
define(`gen_gather', `

gen_gather_factored($1)

define <WIDTH x $1>
@__gather_base_offsets32_$1(i8 * %ptr, i32 %offset_scale,
<WIDTH x i32> %offsets,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
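;; Broadcast the uniform offset_scale across all lanes, pre-multiply the
;; offsets by it, and then delegate to the factored gather with a scale of 1
;; and a zero offset_delta.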
%scale_vec = bitcast i32 %offset_scale to <1 x i32>
%smear_scale = shufflevector <1 x i32> %scale_vec, <1 x i32> undef,
<WIDTH x i32> < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 >
%scaled_offsets = mul <WIDTH x i32> %smear_scale, %offsets
%v = call <WIDTH x $1> @__gather_factored_base_offsets32_$1(i8 * %ptr, <WIDTH x i32> %scaled_offsets, i32 1,
<WIDTH x i32> zeroinitializer, <WIDTH x i32> %vecmask)
ret <WIDTH x $1> %v
}

; fully general 64-bit gather, takes array of pointers encoded as vector of i64s
define <WIDTH x $1> @__gather64_$1(<WIDTH x i64> %ptrs,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
%ret_ptr = alloca <WIDTH x $1>
per_lane(WIDTH, <WIDTH x i32> %vecmask, `
%iptr_LANE_ID = extractelement <WIDTH x i64> %ptrs, i32 LANE
%ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 *
%val_LANE_ID = load $1 * %ptr_LANE_ID
%store_ptr_LANE_ID = getelementptr <WIDTH x $1> * %ret_ptr, i32 0, i32 LANE
store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID
')

%ret = load <WIDTH x $1> * %ret_ptr
ret <WIDTH x $1> %ret
define <WIDTH x $1>
@__gather_base_offsets64_$1(i8 * %ptr, i32 %offset_scale,
<WIDTH x i64> %offsets,
<WIDTH x i32> %vecmask) nounwind readonly alwaysinline {
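;; As above for the 32-bit case: zero-extend and broadcast the scale,
;; pre-multiply the 64-bit offsets, and delegate with scale 1 and zero delta.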
%scale64 = zext i32 %offset_scale to i64
%scale_vec = bitcast i64 %scale64 to <1 x i64>
%smear_scale = shufflevector <1 x i64> %scale_vec, <1 x i64> undef,
<WIDTH x i32> < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 >
%scaled_offsets = mul <WIDTH x i64> %smear_scale, %offsets
%v = call <WIDTH x $1> @__gather_factored_base_offsets64_$1(i8 * %ptr, <WIDTH x i64> %scaled_offsets,
i32 1, <WIDTH x i64> zeroinitializer, <WIDTH x i32> %vecmask)
ret <WIDTH x $1> %v
}

'
)

@@ -3391,7 +3705,7 @@ define void @__scatter_elt64_$1(i8 * %ptr, <WIDTH x i64> %offsets, i32 %offset_s
ret void
}

define void @__scatter_base_offsets32_$1(i8* %base, <WIDTH x i32> %offsets, i32 %offset_scale,
define void @__scatter_factored_base_offsets32_$1(i8* %base, <WIDTH x i32> %offsets, i32 %offset_scale,
<WIDTH x i32> %offset_delta, <WIDTH x $1> %values,
<WIDTH x i32> %mask) nounwind alwaysinline {
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
@@ -3401,7 +3715,7 @@ define void @__scatter_base_offsets32_$1(i8* %base, <WIDTH x i32> %offsets, i32
ret void
}

define void @__scatter_base_offsets64_$1(i8* %base, <WIDTH x i64> %offsets, i32 %offset_scale,
define void @__scatter_factored_base_offsets64_$1(i8* %base, <WIDTH x i64> %offsets, i32 %offset_scale,
<WIDTH x i64> %offset_delta, <WIDTH x $1> %values,
<WIDTH x i32> %mask) nounwind alwaysinline {
;; And use the `per_lane' macro to do all of the per-lane work for scatter...
@@ -3437,3 +3751,48 @@ define void @__scatter64_$1(<WIDTH x i64> %ptrs, <WIDTH x $1> %values,

'
)

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rdrand

define(`rdrand_decls', `
declare i1 @__rdrand_i16(i16 * nocapture)
declare i1 @__rdrand_i32(i32 * nocapture)
declare i1 @__rdrand_i64(i64 * nocapture)
')

define(`rdrand_definition', `
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rdrand

declare {i16, i32} @llvm.x86.rdrand.16()
declare {i32, i32} @llvm.x86.rdrand.32()
declare {i64, i32} @llvm.x86.rdrand.64()

define i1 @__rdrand_i16(i16 * %ptr) {
%v = call {i16, i32} @llvm.x86.rdrand.16()
%v0 = extractvalue {i16, i32} %v, 0
%v1 = extractvalue {i16, i32} %v, 1
store i16 %v0, i16 * %ptr
%good = icmp ne i32 %v1, 0
ret i1 %good
}

define i1 @__rdrand_i32(i32 * %ptr) {
%v = call {i32, i32} @llvm.x86.rdrand.32()
%v0 = extractvalue {i32, i32} %v, 0
%v1 = extractvalue {i32, i32} %v, 1
store i32 %v0, i32 * %ptr
%good = icmp ne i32 %v1, 0
ret i1 %good
}

define i1 @__rdrand_i64(i64 * %ptr) {
%v = call {i64, i32} @llvm.x86.rdrand.64()
%v0 = extractvalue {i64, i32} %v, 0
%v1 = extractvalue {i64, i32} %v, 1
store i64 %v0, i64 * %ptr
%good = icmp ne i32 %v1, 0
ret i1 %good
}
')
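
;; For reference only (a hedged sketch, not part of this file): in C the same
;; probe can be written with the compiler intrinsic from <immintrin.h>, which
;; likewise reports success via the hardware carry flag:
;;
;;     #include <immintrin.h>
;;     int try_rdrand32(unsigned int *out) {
;;         return _rdrand32_step(out);   /* 1 on success, 0 if no entropy */
;;     }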

@@ -140,6 +140,7 @@ Contents:
* `Basic Math Functions`_
* `Transcendental Functions`_
* `Pseudo-Random Numbers`_
* `Random Numbers`_

+ `Output Functions`_
+ `Assertions`_
@@ -2084,7 +2085,7 @@ can be declared:

soa<8> Point pts[...];

The in-memory layout of the ``Point``s has had the SOA transformation
The in-memory layout of the ``Point`` instances has had the SOA transformation
applied, such that there are 8 ``x`` values in memory followed by 8 ``y``
values, and so forth. Here is the effective declaration of ``soa<8>
Point``:
@@ -2266,7 +2267,7 @@ based on C++'s ``new`` and ``delete`` operators:
// use ptr...
delete[] ptr;

In the above code, each program instance allocates its own ``count`-sized
In the above code, each program instance allocates its own ``count`` sized
array of ``uniform int`` values, uses that memory, and then deallocates
that memory. Uses of ``new`` and ``delete`` in ``ispc`` programs are
serviced by corresponding calls to the system C library's ``malloc()`` and
@@ -2277,9 +2278,7 @@ analogous to the corresponding rules for pointers (as described in
`Pointer Types`_.) Specifically, if a specific rate qualifier isn't
provided with the ``new`` expression, then the default is that a "varying"
``new`` is performed, where each program instance performs a unique
allocation. The allocated type, in turn, is by default ``uniform`` for
``varying`` ``new`` expressions, and ``varying`` for ``uniform`` new
expressions.
allocation. The allocated type, in turn, is by default ``uniform``.

After a pointer has been deleted, it is illegal to access the memory it
points to. However, that deletion happens on a per-program-instance basis.
@@ -3457,6 +3456,40 @@ be used to get a pseudo-random ``float`` value.
uniform unsigned int32 random(RNGState * uniform state)
uniform float frandom(uniform RNGState * uniform state)


Random Numbers
--------------

Some recent CPUs (including those based on the Intel(r) Ivy Bridge
micro-architecture) provide support for generating true random numbers. A
few standard library functions make this functionality available:

::

bool rdrand(uniform int32 * uniform ptr)
bool rdrand(varying int32 * uniform ptr)
bool rdrand(uniform int32 * varying ptr)

If the processor doesn't have sufficient entropy to generate a random
number, then this function fails and returns ``false``. Otherwise, if the
processor is successful, the random value is stored in the given pointer
and ``true`` is returned. Therefore, this function should generally be
used as follows, called repeatedly until it is successful:

::

int r;
while (rdrand(&r) == false)
; // empty loop body


In addition to the ``int32`` variants of ``rdrand()`` listed above, there
are versions that return ``int16``, ``float``, and ``int64`` values as
well.

Note that when compiling to targets other than ``avx1.1`` and ``avx2``, the
``rdrand()`` functions always return ``false``.
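
The other variants follow the same pattern; for example (a sketch along the
same lines as the ``int32`` usage above), the ``float`` variant:

::

float f;
while (rdrand(&f) == false)
; // empty loop body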

Output Functions
----------------

@@ -3491,7 +3524,7 @@ generates the following output on a four-wide compilation target:
::

i = 10, x = [0.000000,1.000000,2.000000,3.000000]
added to x = [1.000000,2.000000,((2.000000)),((3.000000)]
added to x = [1.000000,2.000000,((2.000000)),((3.000000))]
last print of x = [1.000000,2.000000,2.000000,3.000000]

When a varying variable is printed, the values for program instances that
@@ -4010,8 +4043,8 @@ Systems Programming Support
Atomic Operations and Memory Fences
-----------------------------------

The standard range of atomic memory operations are provided by the standard
library``ispc``, including variants to handle both uniform and varying
The standard set of atomic memory operations are provided by the standard
library, including variants to handle both uniform and varying
types as well as "local" and "global" atomics.

Local atomics provide atomic behavior across the program instances in a

@@ -1307,15 +1307,13 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val,
// offsets * offsetScale is in bytes (for all of these)

#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
__vec16_i1 mask) { \
static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, __vec16_i1 mask) { \
VTYPE ret; \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
ret.v[i] = *ptr; \
} \
return ret; \
@@ -1362,14 +1360,13 @@ GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double)
// scatter

#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
uint32_t scale, OTYPE constOffset, \
VTYPE val, __vec16_i1 mask) { \
static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \
OTYPE offset, VTYPE val, \
__vec16_i1 mask) { \
int8_t *base = (int8_t *)b; \
for (int i = 0; i < 16; ++i) \
if ((mask.v & (1 << i)) != 0) { \
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
constOffset.v[i]); \
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
*ptr = val.v[i]; \
} \
}
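
// Hedged note: the macro above is instantiated once per vector/scalar type
// pair elsewhere in this file, along the lines of the gather example visible
// in a later hunk: GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i64,
// __gather_base_offsets64_double).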
|
||||
|
||||
@@ -1375,15 +1375,13 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec32_d val,
|
||||
// offsets * offsetScale is in bytes (for all of these)
|
||||
|
||||
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
uint32_t scale, OTYPE constOffset, \
|
||||
__vec32_i1 mask) { \
|
||||
static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \
|
||||
OTYPE offset, __vec32_i1 mask) { \
|
||||
VTYPE ret; \
|
||||
int8_t *base = (int8_t *)b; \
|
||||
for (int i = 0; i < 32; ++i) \
|
||||
if ((mask.v & (1 << i)) != 0) { \
|
||||
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
|
||||
constOffset.v[i]); \
|
||||
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
|
||||
ret.v[i] = *ptr; \
|
||||
} \
|
||||
return ret; \
|
||||
@@ -1430,14 +1428,12 @@ GATHER_GENERAL(__vec32_d, double, __vec32_i64, __gather64_double)
|
||||
// scatter
|
||||
|
||||
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
uint32_t scale, OTYPE constOffset, \
|
||||
VTYPE val, __vec32_i1 mask) { \
|
||||
static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \
|
||||
OTYPE offset, VTYPE val, __vec32_i1 mask) { \
|
||||
int8_t *base = (int8_t *)b; \
|
||||
for (int i = 0; i < 32; ++i) \
|
||||
if ((mask.v & (1 << i)) != 0) { \
|
||||
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
|
||||
constOffset.v[i]); \
|
||||
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
|
||||
*ptr = val.v[i]; \
|
||||
} \
|
||||
}
|
||||
|
||||
@@ -1508,15 +1508,13 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec64_d val,
|
||||
// offsets * offsetScale is in bytes (for all of these)
|
||||
|
||||
#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
uint32_t scale, OTYPE constOffset, \
|
||||
__vec64_i1 mask) { \
|
||||
static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \
|
||||
OTYPE offset, __vec64_i1 mask) { \
|
||||
VTYPE ret; \
|
||||
int8_t *base = (int8_t *)b; \
|
||||
for (int i = 0; i < 64; ++i) \
|
||||
if ((mask.v & (1ull << i)) != 0) { \
|
||||
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
|
||||
constOffset.v[i]); \
|
||||
if ((mask.v & (1ull << i)) != 0) { \
|
||||
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
|
||||
ret.v[i] = *ptr; \
|
||||
} \
|
||||
return ret; \
|
||||
@@ -1540,7 +1538,7 @@ GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __gather_base_offsets64_doub
|
||||
static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec64_i1 mask) { \
|
||||
VTYPE ret; \
|
||||
for (int i = 0; i < 64; ++i) \
|
||||
if ((mask.v & (1ull << i)) != 0) { \
|
||||
if ((mask.v & (1ull << i)) != 0) { \
|
||||
STYPE *ptr = (STYPE *)ptrs.v[i]; \
|
||||
ret.v[i] = *ptr; \
|
||||
} \
|
||||
@@ -1563,14 +1561,12 @@ GATHER_GENERAL(__vec64_d, double, __vec64_i64, __gather64_double)
|
||||
// scatter
|
||||
|
||||
#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \
|
||||
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \
|
||||
uint32_t scale, OTYPE constOffset, \
|
||||
VTYPE val, __vec64_i1 mask) { \
|
||||
static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \
|
||||
OTYPE offset, VTYPE val, __vec64_i1 mask) { \
|
||||
int8_t *base = (int8_t *)b; \
|
||||
for (int i = 0; i < 64; ++i) \
|
||||
if ((mask.v & (1ull << i)) != 0) { \
|
||||
STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \
|
||||
constOffset.v[i]); \
|
||||
if ((mask.v & (1ull << i)) != 0) { \
|
||||
STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \
|
||||
*ptr = val.v[i]; \
|
||||
} \
|
||||
}
|
||||
|
||||
@@ -803,6 +803,13 @@ template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) {
// int64


static FORCEINLINE __vec16_i64 __setzero_i64() {
    __vec16_i64 ret;
    ret.v_lo = _mm512_setzero_epi32();
    ret.v_hi = _mm512_setzero_epi32();
    return ret;
}

static FORCEINLINE __vec16_i64 __add(const __vec16_i64 &a, const __vec16_i64 &b)
{
    __mmask16 carry = 0;
@@ -878,7 +885,7 @@ static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, int index)
    return src[index+16] | (int64_t(src[index]) << 32);
}

static FORCEINLINE __vec16_i64 __smear_i64(__vec16_i64, const int64_t &l) {
static FORCEINLINE __vec16_i64 __smear_i64(const int64_t &l) {
    const int *i = (const int*)&l;
    return __vec16_i64(_mm512_set_1to16_epi32(i[0]), _mm512_set_1to16_epi32(i[1]));
}
@@ -1373,6 +1380,11 @@ CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext)
CAST(__vec16_i32, int32_t, __vec16_i8, int8_t, __cast_sext)
CAST(__vec16_i16, int16_t, __vec16_i8, int8_t, __cast_sext)

static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val)
{
    return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31));
}

#define CAST_SEXT_I1(TYPE)
/*
static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) {  \
@@ -1389,11 +1401,6 @@ CAST_SEXT_I1(__vec16_i8)
CAST_SEXT_I1(__vec16_i16)
CAST_SEXT_I1(__vec16_i32)

static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val)
{
    return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31));
}

// zero extension
CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext)
CAST(__vec16_i64, uint64_t, __vec16_i16, uint16_t, __cast_zext)
@@ -1421,6 +1428,14 @@ CAST_ZEXT_I1(__vec16_i16)
CAST_ZEXT_I1(__vec16_i32)
CAST_ZEXT_I1(__vec16_i64)

static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val)
{
    __vec16_i32 ret = _mm512_setzero_epi32();
    __vec16_i32 one = _mm512_set1_epi32(1);
    return _mm512_mask_mov_epi32(ret, val.m, one);
}


// truncations
CAST(__vec16_i32, int32_t, __vec16_i64, int64_t, __cast_trunc)
CAST(__vec16_i16, int16_t, __vec16_i64, int64_t, __cast_trunc)
@@ -1589,11 +1604,6 @@ CAST_BITS_SCALAR(double, int64_t)
///////////////////////////////////////////////////////////////////////////
// various math functions

/*
static FORCEINLINE void __fastmath() {
}
*/

static FORCEINLINE float __round_uniform_float(float v) {
    return roundf(v);
}
@@ -1659,14 +1669,25 @@ static FORCEINLINE __vec16_f __min_varying_float(__vec16_f v1, __vec16_f v2) {
    return _mm512_gmin_ps(v1, v2);
}

static FORCEINLINE __vec16_i32 __max_varying_int32(__vec16_i32 v1, __vec16_i32 v2) {
    return _mm512_max_epi32(v1, v2);
}

static FORCEINLINE __vec16_i32 __min_varying_int32(__vec16_i32 v1, __vec16_i32 v2) {
    return _mm512_min_epi32(v1, v2);
}

static FORCEINLINE __vec16_i32 __max_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) {
    return _mm512_max_epu32(v1, v2);
}

static FORCEINLINE __vec16_i32 __min_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) {
    return _mm512_min_epu32(v1, v2);
}

BINARY_OP_FUNC(__vec16_d, __max_varying_double, __max_uniform_double)
BINARY_OP_FUNC(__vec16_d, __min_varying_double, __min_uniform_double)

BINARY_OP_FUNC(__vec16_i32, __max_varying_int32, __max_uniform_int32)
BINARY_OP_FUNC(__vec16_i32, __min_varying_int32, __min_uniform_int32)
BINARY_OP_FUNC(__vec16_i32, __max_varying_uint32, __max_uniform_uint32)
BINARY_OP_FUNC(__vec16_i32, __min_varying_uint32, __min_uniform_uint32)

BINARY_OP_FUNC(__vec16_i64, __max_varying_int64, __max_uniform_int64)
BINARY_OP_FUNC(__vec16_i64, __min_varying_int64, __min_uniform_int64)
BINARY_OP_FUNC(__vec16_i64, __max_varying_uint64, __max_uniform_uint64)
@@ -1940,60 +1961,33 @@ static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val,

// offsets * offsetScale is in bytes (for all of these)

#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)
/*
static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset,    \
                              uint32_t scale, OTYPE constOffset,        \
                              __vec16_i1 mask) {                        \
    VTYPE ret;                                                          \
    int8_t *base = (int8_t *)b;                                         \
    for (int i = 0; i < 16; ++i)                                        \
        if ((mask.v & (1 << i)) != 0) {                                 \
            STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] +  \
                                   constOffset.v[i]);                   \
            ret.v[i] = *ptr;                                            \
        }                                                               \
    return ret;                                                         \
}
*/

static FORCEINLINE __vec16_i32
__gather_base_offsets32_i32(uint8_t *base, __vec16_i32 varyingOffset,
                            uint32_t scale, __vec16_i32 constOffset,
                            __vec16_i1 mask) {
    __vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
    __vec16_i32 offsets = __add(__mul(vscale, varyingOffset), constOffset);
    __vec16_i32 tmp;

    // Loop is generated by intrinsic
__gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
                            __vec16_i1 mask) {
    __vec16_i32 tmp = _mm512_undefined_epi32();
    __vec16_i32 ret = _mm512_mask_i32extgather_epi32(tmp, mask, offsets, base,
                                                     _MM_UPCONV_EPI32_NONE, 1,
                                                     _MM_UPCONV_EPI32_NONE, scale,
                                                     _MM_HINT_NONE);
    return ret;
}

static FORCEINLINE __vec16_f
__gather_base_offsets32_float(uint8_t *base, __vec16_i32 varyingOffset,
                              uint32_t scale, __vec16_i32 constOffset,
__gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets,
                              __vec16_i1 mask) {
    __vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
    __vec16_i32 offsets = __add(__mul(vscale, varyingOffset), constOffset);
    __vec16_f tmp;

    // Loop is generated by intrinsic
    __vec16_f ret = _mm512_mask_i32extgather_ps(tmp, mask, offsets, base,
                                                _MM_UPCONV_PS_NONE, 1,
    __vec16_f tmp = _mm512_undefined_ps();
    __vec16_f ret = _mm512_mask_i32extgather_ps(tmp, mask, offsets, base,
                                                _MM_UPCONV_PS_NONE, scale,
                                                _MM_HINT_NONE);
    return ret;
}

GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)
//GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8)
//GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8)
//GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16)
//GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16)
//GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32)
//GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64)
//GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64)

#define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)
/*
@@ -2039,47 +2033,43 @@ static FORCEINLINE __vec16_i32 __gather64_i32(__vec16_i64 ptrs, __vec16_i1 mask)
*/
// scatter

#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC)
/*
static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset,     \
                             uint32_t scale, OTYPE constOffset,         \
                             VTYPE val, __vec16_i1 mask) {              \
    int8_t *base = (int8_t *)b;                                         \
    for (int i = 0; i < 16; ++i)                                        \
        if ((mask.v & (1 << i)) != 0) {                                 \
            STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] +  \
                                   constOffset.v[i]);                   \
            *ptr = val.v[i];                                            \
        }                                                               \
}
*/

SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8)
SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16)
SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)
//SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8)
//SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8)
//SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16)
//SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16)
//SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32)
//SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64)
//SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64)

static FORCEINLINE void
__scatter_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset,
                             uint32_t scale, __vec16_i32 constOffset,
__scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec16_i32 offsets,
                             __vec16_i32 val, __vec16_i1 mask)
{
    __vec16_i32 offsets = __add(__mul(__vec16_i32(scale), varyingOffset), constOffset);
    _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, _MM_DOWNCONV_EPI32_NONE, 1, _MM_HINT_NONE);
    _mm512_mask_i32extscatter_epi32(b, mask, offsets, val,
                                    _MM_DOWNCONV_EPI32_NONE, scale,
                                    _MM_HINT_NONE);
}

static FORCEINLINE void
__scatter_base_offsets32_float(void *base, const __vec16_i32 &varyingOffset,
                               uint32_t scale, const __vec16_i32 &constOffset,
                               const __vec16_f &val, const __vec16_i1 mask)
__scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets,
                               __vec16_f val, __vec16_i1 mask)
{
    __vec16_i32 offsets = __add(__mul(varyingOffset,__vec16_i32(scale)), constOffset);
    _mm512_mask_i32extscatter_ps(base, mask, offsets, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_1, _MM_HINT_NONE);
    _mm512_mask_i32extscatter_ps(base, mask, offsets, val,
                                 _MM_DOWNCONV_PS_NONE, scale,
                                 _MM_HINT_NONE);
}

/*
static FORCEINLINE void
__scatter_base_offsets64_float(void *base, const __vec16_i64 &varyingOffset,
                               uint32_t scale, const __vec16_i64 &constOffset,
                               const __vec16_f &val, const __vec16_i1 mask)
{
    __vec16_i64 offsets = __add(__mul(varyingOffset,__vec16_i64(scale)), constOffset);
    _mm512_mask_i64extscatter_ps(base, mask, offsets, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_1, _MM_HINT_NONE);
}
*/

#define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC)
/*
static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \

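The rewrite above folds the old varyingOffset/constOffset pair into a single precomputed offsets vector and passes scale straight through as the gather intrinsic's scale argument instead of pre-multiplying it into the offsets. As a reading aid only, here is a minimal scalar sketch of the semantics the masked base+offset gather implements (names are illustrative, not from the source):

#include <cstdint>

// Reference semantics for the masked base+offset gather above: lane i
// loads a 32-bit value from base + scale * offsets[i] when its mask bit
// is set; the KNC intrinsic performs this per-lane loop in hardware.
static void gather32_reference(const uint8_t *base, uint32_t scale,
                               const int32_t offsets[16], uint16_t mask,
                               int32_t out[16]) {
    for (int i = 0; i < 16; ++i)
        if (mask & (1u << i))
            out[i] = *(const int32_t *)(base + (int64_t)scale * offsets[i]);
}
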
@@ -2892,54 +2892,53 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec4_d val,

template<typename RetVec, typename RetScalar>
static FORCEINLINE RetVec
lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets,
                     uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, uint32_t scale,
                     __vec4_i32 offsets, __vec4_i1 mask) {
    RetScalar r[4];
#if 1
    // "Fast gather" trick...
    offsets = __select(mask, offsets, __setzero_i32());
    constOffset = __select(mask, constOffset, __setzero_i32());

    int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
    int offset = scale * _mm_extract_epi32(offsets.v, 0);
    RetScalar *ptr = (RetScalar *)(p + offset);
    r[0] = *ptr;

    offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1);
    offset = scale * _mm_extract_epi32(offsets.v, 1);
    ptr = (RetScalar *)(p + offset);
    r[1] = *ptr;

    offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2);
    offset = scale * _mm_extract_epi32(offsets.v, 2);
    ptr = (RetScalar *)(p + offset);
    r[2] = *ptr;

    offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3);
    offset = scale * _mm_extract_epi32(offsets.v, 3);
    ptr = (RetScalar *)(p + offset);
    r[3] = *ptr;
#else
    uint32_t m = _mm_extract_ps(mask.v, 0);
    if (m != 0) {
        int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0);
        int offset = scale * _mm_extract_epi32(offsets.v, 0);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[0] = *ptr;
    }

    m = _mm_extract_ps(mask.v, 1);
    if (m != 0) {
        int offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1);
        int offset = scale * _mm_extract_epi32(offsets.v, 1);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[1] = *ptr;
    }

    m = _mm_extract_ps(mask.v, 2);
    if (m != 0) {
        int offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2);
        int offset = scale * _mm_extract_epi32(offsets.v, 2);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[2] = *ptr;
    }

    m = _mm_extract_ps(mask.v, 3);
    if (m != 0) {
        int offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3);
        int offset = scale * _mm_extract_epi32(offsets.v, 3);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[3] = *ptr;
    }
@@ -2950,54 +2949,53 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets,

template<typename RetVec, typename RetScalar>
static FORCEINLINE RetVec
lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
                     uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, uint32_t scale,
                     __vec4_i64 offsets, __vec4_i1 mask) {
    RetScalar r[4];
#if 1
    // "Fast gather" trick...
    offsets = __select(mask, offsets, __setzero_i64());
    constOffset = __select(mask, constOffset, __setzero_i64());

    int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
    int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
    RetScalar *ptr = (RetScalar *)(p + offset);
    r[0] = *ptr;

    offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1);
    offset = scale * _mm_extract_epi64(offsets.v[0], 1);
    ptr = (RetScalar *)(p + offset);
    r[1] = *ptr;

    offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0);
    offset = scale * _mm_extract_epi64(offsets.v[1], 0);
    ptr = (RetScalar *)(p + offset);
    r[2] = *ptr;

    offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1);
    offset = scale * _mm_extract_epi64(offsets.v[1], 1);
    ptr = (RetScalar *)(p + offset);
    r[3] = *ptr;
#else
    uint32_t m = _mm_extract_ps(mask.v, 0);
    if (m != 0) {
        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0);
        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[0] = *ptr;
    }

    m = _mm_extract_ps(mask.v, 1);
    if (m != 0) {
        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1);
        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[1] = *ptr;
    }

    m = _mm_extract_ps(mask.v, 2);
    if (m != 0) {
        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0);
        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[2] = *ptr;
    }

    m = _mm_extract_ps(mask.v, 3);
    if (m != 0) {
        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1);
        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
        RetScalar *ptr = (RetScalar *)(p + offset);
        r[3] = *ptr;
    }
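The "#if 1" path above is the "fast gather" trick: offsets in inactive lanes are first forced to zero with __select, so every lane can then be loaded unconditionally (inactive lanes harmlessly re-read from the base address) and the per-lane mask tests of the "#else" path are avoided. A minimal scalar sketch of the idea, assuming that reading *base is always legal:

#include <cstdint>

// "Fast gather": zero the offsets of masked-off lanes, then load all
// lanes without branching on the mask. Inactive lanes read from 'base'
// itself; their results are ignored by the caller's masking.
static void fast_gather(const uint8_t *base, uint32_t scale,
                        int32_t offsets[4], uint8_t mask4, float out[4]) {
    for (int i = 0; i < 4; ++i)
        if (!(mask4 & (1 << i)))
            offsets[i] = 0;                      // select(mask, offsets, 0)
    for (int i = 0; i < 4; ++i)                  // unconditional loads
        out[i] = *(const float *)(base + (int64_t)scale * offsets[i]);
}
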
@@ -3007,87 +3005,75 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets,
}

static FORCEINLINE __vec4_i8
__gather_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets,
                           uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
    return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, offsets, scale,
                                constOffset, mask);
__gather_base_offsets32_i8(unsigned char *b, uint32_t scale, __vec4_i32 offsets,
                           __vec4_i1 mask) {
    return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, scale, offsets, mask);
}

static FORCEINLINE __vec4_i8
__gather_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets,
                           uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
    return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, offsets, scale,
                                constOffset, mask);
__gather_base_offsets64_i8(unsigned char *b, uint32_t scale, __vec4_i64 offsets,
                           __vec4_i1 mask) {
    return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, scale, offsets, mask);
}

static FORCEINLINE __vec4_i16
__gather_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets,
                            uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
    return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, offsets, scale,
                                constOffset, mask);
__gather_base_offsets32_i16(unsigned char *b, uint32_t scale, __vec4_i32 offsets,
                            __vec4_i1 mask) {
    return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, scale, offsets, mask);
}

static FORCEINLINE __vec4_i16
__gather_base_offsets64_i16(unsigned char *b, __vec4_i64 offsets,
                            uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
    return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, offsets, scale,
                                constOffset, mask);
__gather_base_offsets64_i16(unsigned char *b, uint32_t scale, __vec4_i64 offsets,
                            __vec4_i1 mask) {
    return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, scale, offsets, mask);
}

static FORCEINLINE __vec4_i32
__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
                            __vec4_i32 constOffset, __vec4_i1 mask) {
    return lGatherBaseOffsets32(__vec4_i32(), uint32_t(), p, offsets, scale,
                                constOffset, mask);
__gather_base_offsets32_i32(uint8_t *p, uint32_t scale, __vec4_i32 offsets,
                            __vec4_i1 mask) {
    return lGatherBaseOffsets32(__vec4_i32(), uint32_t(), p, scale, offsets, mask);
}

static FORCEINLINE __vec4_i32
__gather_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets,
                            uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
    return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, offsets, scale,
                                constOffset, mask);
__gather_base_offsets64_i32(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
                            __vec4_i1 mask) {
    return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, scale, offsets, mask);
}

static FORCEINLINE __vec4_f
__gather_base_offsets32_float(uint8_t *p, __vec4_i32 offsets, uint32_t scale,
                              __vec4_i32 constOffset, __vec4_i1 mask) {
    return lGatherBaseOffsets32(__vec4_f(), float(), p, offsets, scale,
                                constOffset, mask);
__gather_base_offsets32_float(uint8_t *p, uint32_t scale, __vec4_i32 offsets,
                              __vec4_i1 mask) {
    return lGatherBaseOffsets32(__vec4_f(), float(), p, scale, offsets, mask);
}

static FORCEINLINE __vec4_f
__gather_base_offsets64_float(unsigned char *p, __vec4_i64 offsets,
                              uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
    return lGatherBaseOffsets64(__vec4_f(), float(), p, offsets, scale,
                                constOffset, mask);
__gather_base_offsets64_float(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
                              __vec4_i1 mask) {
    return lGatherBaseOffsets64(__vec4_f(), float(), p, scale, offsets, mask);
}

static FORCEINLINE __vec4_i64
__gather_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
                            uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
    return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, offsets, scale,
                                constOffset, mask);
__gather_base_offsets32_i64(unsigned char *p, uint32_t scale, __vec4_i32 offsets,
                            __vec4_i1 mask) {
    return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, scale, offsets, mask);
}

static FORCEINLINE __vec4_i64
__gather_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
                            uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
    return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, offsets, scale,
                                constOffset, mask);
__gather_base_offsets64_i64(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
                            __vec4_i1 mask) {
    return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, scale, offsets, mask);
}

static FORCEINLINE __vec4_d
__gather_base_offsets32_double(unsigned char *p, __vec4_i32 offsets,
                               uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) {
    return lGatherBaseOffsets32(__vec4_d(), double(), p, offsets, scale,
                                constOffset, mask);
__gather_base_offsets32_double(unsigned char *p, uint32_t scale, __vec4_i32 offsets,
                               __vec4_i1 mask) {
    return lGatherBaseOffsets32(__vec4_d(), double(), p, scale, offsets, mask);
}

static FORCEINLINE __vec4_d
__gather_base_offsets64_double(unsigned char *p, __vec4_i64 offsets,
                               uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) {
    return lGatherBaseOffsets64(__vec4_d(), double(), p, offsets, scale,
                                constOffset, mask);
__gather_base_offsets64_double(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
                               __vec4_i1 mask) {
    return lGatherBaseOffsets64(__vec4_d(), double(), p, scale, offsets, mask);
}

template<typename RetVec, typename RetScalar>
@@ -3252,63 +3238,55 @@ static FORCEINLINE __vec4_d __gather64_double(__vec4_i64 ptrs, __vec4_i1 mask) {

#define SCATTER32_64(SUFFIX, VEC_SUFFIX, TYPE, EXTRACT)                  \
static FORCEINLINE void                                                  \
__scatter_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \
                                   uint32_t scale, __vec4_i32 constOffset, \
__scatter_base_offsets32_##SUFFIX (unsigned char *b, uint32_t scale,     \
                                   __vec4_i32 offsets,                   \
                                   __vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \
    uint32_t m = _mm_extract_ps(mask.v, 0);                              \
    if (m != 0) {                                                        \
        TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 0) + \
                             _mm_extract_epi32(constOffset.v, 0));       \
        TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 0)); \
        *ptr = EXTRACT(val.v, 0);                                        \
    }                                                                    \
    m = _mm_extract_ps(mask.v, 1);                                       \
    if (m != 0) {                                                        \
        TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 1) + \
                             _mm_extract_epi32(constOffset.v, 1));       \
        TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 1)); \
        *ptr = EXTRACT(val.v, 1);                                        \
    }                                                                    \
    m = _mm_extract_ps(mask.v, 2);                                       \
    if (m != 0) {                                                        \
        TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 2) + \
                             _mm_extract_epi32(constOffset.v, 2));       \
        TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 2)); \
        *ptr = EXTRACT(val.v, 2);                                        \
    }                                                                    \
    m = _mm_extract_ps(mask.v, 3);                                       \
    if (m != 0) {                                                        \
        TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 3) + \
                             _mm_extract_epi32(constOffset.v, 3));       \
        TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 3)); \
        *ptr = EXTRACT(val.v, 3);                                        \
    }                                                                    \
}                                                                        \
static FORCEINLINE void                                                  \
__scatter_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets,  \
                                  uint32_t scale, __vec4_i64 constOffset, \
static FORCEINLINE void                                                  \
__scatter_base_offsets64_##SUFFIX(unsigned char *p, uint32_t scale,      \
                                  __vec4_i64 offsets,                    \
                                  __vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \
    uint32_t m = _mm_extract_ps(mask.v, 0);                              \
    if (m != 0) {                                                        \
        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) +    \
                         _mm_extract_epi64(constOffset.v[0], 0);         \
        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);     \
        TYPE *ptr = (TYPE *)(p + offset);                                \
        *ptr = EXTRACT(val.v, 0);                                        \
    }                                                                    \
    m = _mm_extract_ps(mask.v, 1);                                       \
    if (m != 0) {                                                        \
        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) +    \
                         _mm_extract_epi64(constOffset.v[0], 1);         \
        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);     \
        TYPE *ptr = (TYPE *)(p + offset);                                \
        *ptr = EXTRACT(val.v, 1);                                        \
    }                                                                    \
    m = _mm_extract_ps(mask.v, 2);                                       \
    if (m != 0) {                                                        \
        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) +    \
                         _mm_extract_epi64(constOffset.v[1], 0);         \
        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);     \
        TYPE *ptr = (TYPE *)(p + offset);                                \
        *ptr = EXTRACT(val.v, 2);                                        \
    }                                                                    \
    m = _mm_extract_ps(mask.v, 3);                                       \
    if (m != 0) {                                                        \
        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) +    \
                         _mm_extract_epi64(constOffset.v[1], 1);         \
        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);     \
        TYPE *ptr = (TYPE *)(p + offset);                                \
        *ptr = EXTRACT(val.v, 3);                                        \
    }                                                                    \
@@ -3322,91 +3300,79 @@ SCATTER32_64(float, f, float, _mm_extract_ps_as_float)


static FORCEINLINE void
__scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets,
                             uint32_t scale, __vec4_i32 constOffset, __vec4_i64 val,
                             __vec4_i1 mask) {
    uint32_t m = _mm_extract_ps(mask.v, 0);
    if (m != 0) {
        int32_t offset = scale * _mm_extract_epi32(offsets.v, 0) +
                         _mm_extract_epi32(constOffset.v, 0);
        uint64_t *ptr = (uint64_t *)(p + offset);
        *ptr = _mm_extract_epi64(val.v[0], 0);
    }

    m = _mm_extract_ps(mask.v, 1);
    if (m != 0) {
        int32_t offset = scale * _mm_extract_epi32(offsets.v, 1) +
                         _mm_extract_epi32(constOffset.v, 1);
        uint64_t *ptr = (uint64_t *)(p + offset);
        *ptr = _mm_extract_epi64(val.v[0], 1);
    }

    m = _mm_extract_ps(mask.v, 2);
    if (m != 0) {
        int32_t offset = scale * _mm_extract_epi32(offsets.v, 2) +
                         _mm_extract_epi32(constOffset.v, 2);
        uint64_t *ptr = (uint64_t *)(p + offset);
        *ptr = _mm_extract_epi64(val.v[1], 0);
    }

    m = _mm_extract_ps(mask.v, 3);
    if (m != 0) {
        int32_t offset = scale * _mm_extract_epi32(offsets.v, 3) +
                         _mm_extract_epi32(constOffset.v, 3);
        uint64_t *ptr = (uint64_t *)(p + offset);
        *ptr = _mm_extract_epi64(val.v[1], 1);
    }
}

static FORCEINLINE void
__scatter_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets,
                             uint32_t scale, __vec4_i64 constOffset,
__scatter_base_offsets32_i64(unsigned char *p, uint32_t scale, __vec4_i32 offsets,
                             __vec4_i64 val, __vec4_i1 mask) {
    uint32_t m = _mm_extract_ps(mask.v, 0);
    if (m != 0) {
        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) +
                         _mm_extract_epi64(constOffset.v[0], 0);
        int32_t offset = scale * _mm_extract_epi32(offsets.v, 0);
        uint64_t *ptr = (uint64_t *)(p + offset);
        *ptr = _mm_extract_epi64(val.v[0], 0);
    }

    m = _mm_extract_ps(mask.v, 1);
    if (m != 0) {
        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) +
                         _mm_extract_epi64(constOffset.v[0], 1);
        int32_t offset = scale * _mm_extract_epi32(offsets.v, 1);
        uint64_t *ptr = (uint64_t *)(p + offset);
        *ptr = _mm_extract_epi64(val.v[0], 1);
    }

    m = _mm_extract_ps(mask.v, 2);
    if (m != 0) {
        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) +
                         _mm_extract_epi64(constOffset.v[1], 0);
        int32_t offset = scale * _mm_extract_epi32(offsets.v, 2);
        uint64_t *ptr = (uint64_t *)(p + offset);
        *ptr = _mm_extract_epi64(val.v[1], 0);
    }

    m = _mm_extract_ps(mask.v, 3);
    if (m != 0) {
        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) +
                         _mm_extract_epi64(constOffset.v[1], 1);
        int32_t offset = scale * _mm_extract_epi32(offsets.v, 3);
        uint64_t *ptr = (uint64_t *)(p + offset);
        *ptr = _mm_extract_epi64(val.v[1], 1);
    }
}

static FORCEINLINE void
__scatter_base_offsets32_double(unsigned char *p, __vec4_i32 offsets,
                                uint32_t scale, __vec4_i32 constOffset, __vec4_d val,
                                __vec4_i1 mask) {
    __scatter_base_offsets32_i64(p, offsets, scale, constOffset, val, mask);
__scatter_base_offsets64_i64(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
                             __vec4_i64 val, __vec4_i1 mask) {
    uint32_t m = _mm_extract_ps(mask.v, 0);
    if (m != 0) {
        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0);
        uint64_t *ptr = (uint64_t *)(p + offset);
        *ptr = _mm_extract_epi64(val.v[0], 0);
    }

    m = _mm_extract_ps(mask.v, 1);
    if (m != 0) {
        int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1);
        uint64_t *ptr = (uint64_t *)(p + offset);
        *ptr = _mm_extract_epi64(val.v[0], 1);
    }

    m = _mm_extract_ps(mask.v, 2);
    if (m != 0) {
        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0);
        uint64_t *ptr = (uint64_t *)(p + offset);
        *ptr = _mm_extract_epi64(val.v[1], 0);
    }

    m = _mm_extract_ps(mask.v, 3);
    if (m != 0) {
        int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1);
        uint64_t *ptr = (uint64_t *)(p + offset);
        *ptr = _mm_extract_epi64(val.v[1], 1);
    }
}

static FORCEINLINE void
__scatter_base_offsets64_double(unsigned char *p, __vec4_i64 offsets,
                                uint32_t scale, __vec4_i64 constOffset, __vec4_d val,
                                __vec4_i1 mask) {
    __scatter_base_offsets64_i64(p, offsets, scale, constOffset, val, mask);
__scatter_base_offsets32_double(unsigned char *p, uint32_t scale, __vec4_i32 offsets,
                                __vec4_d val, __vec4_i1 mask) {
    __scatter_base_offsets32_i64(p, scale, offsets, val, mask);
}

static FORCEINLINE void
__scatter_base_offsets64_double(unsigned char *p, uint32_t scale, __vec4_i64 offsets,
                                __vec4_d val, __vec4_i1 mask) {
    __scatter_base_offsets64_i64(p, scale, offsets, val, mask);
}

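The double scatters above simply forward to the i64 scatters: storing a double's 64 raw bits through a uint64_t pointer writes exactly the same bytes as storing the double itself, so no separate double path is needed. A small sketch of that equivalence, assuming IEEE-754 doubles (the helper name is illustrative):

#include <cstdint>
#include <cstring>

// Storing a double and storing its 64-bit pattern are byte-identical,
// which is why the double scatter can reuse the i64 scatter unchanged.
static void store_double_as_bits(void *dst, double d) {
    uint64_t bits;
    std::memcpy(&bits, &d, sizeof bits);   // reinterpret, no conversion
    std::memcpy(dst, &bits, sizeof bits);  // same bytes as *(double *)dst = d
}
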
52  ispc.cpp
@@ -94,20 +94,22 @@ lGetSystemISA() {
    int info[4];
    __cpuid(info, 1);

    if ((info[2] & (1 << 28)) != 0) {
        // AVX1 for sure. Do we have AVX2?
        // Call cpuid with eax=7, ecx=0
        __cpuidex(info, 7, 0);
        if ((info[1] & (1 << 5)) != 0)
            return "avx2";
        else {
            // ivybridge?
            if ((info[2] & (1 << 29)) != 0 &&  // F16C
                (info[2] & (1 << 30)) != 0)    // RDRAND
                return "avx1.1";
    if ((info[2] & (1 << 28)) != 0) {  // AVX
        // AVX1 for sure....
        // Ivy Bridge?
        if ((info[2] & (1 << 29)) != 0 &&  // F16C
            (info[2] & (1 << 30)) != 0) {  // RDRAND
            // So far, so good.  AVX2?
            // Call cpuid with eax=7, ecx=0
            int info2[4];
            __cpuidex(info2, 7, 0);
            if ((info2[1] & (1 << 5)) != 0)
                return "avx2";
            else
                return "avx";
                return "avx1.1";
        }
        // Regular AVX
        return "avx";
    }
    else if ((info[2] & (1 << 19)) != 0)
        return "sse4";
@@ -212,6 +214,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,

    // This is the case for most of them
    t->hasHalf = t->hasRand = t->hasTranscendentals = false;
    t->hasGather = t->hasScatter = false;

    if (!strcasecmp(isa, "sse2")) {
        t->isa = Target::SSE2;
@@ -253,6 +256,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->maskBitCount = 1;
        t->hasHalf = true;
        t->hasTranscendentals = true;
        t->hasGather = t->hasScatter = true;
    }
    else if (!strcasecmp(isa, "generic-8")) {
        t->isa = Target::GENERIC;
@@ -262,6 +266,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->maskBitCount = 1;
        t->hasHalf = true;
        t->hasTranscendentals = true;
        t->hasGather = t->hasScatter = true;
    }
    else if (!strcasecmp(isa, "generic-16")) {
        t->isa = Target::GENERIC;
@@ -271,6 +276,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->maskBitCount = 1;
        t->hasHalf = true;
        t->hasTranscendentals = true;
        t->hasGather = t->hasScatter = true;
    }
    else if (!strcasecmp(isa, "generic-32")) {
        t->isa = Target::GENERIC;
@@ -280,6 +286,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->maskBitCount = 1;
        t->hasHalf = true;
        t->hasTranscendentals = true;
        t->hasGather = t->hasScatter = true;
    }
    else if (!strcasecmp(isa, "generic-64")) {
        t->isa = Target::GENERIC;
@@ -289,6 +296,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->maskBitCount = 1;
        t->hasHalf = true;
        t->hasTranscendentals = true;
        t->hasGather = t->hasScatter = true;
    }
    else if (!strcasecmp(isa, "generic-1")) {
        t->isa = Target::GENERIC;
@@ -320,8 +328,14 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand";
        t->maskingIsFree = false;
        t->maskBitCount = 32;
#if !defined(LLVM_3_0)
        // LLVM 3.1+ only
        t->hasHalf = true;
#if !defined(LLVM_3_1)
        // LLVM 3.2+ only
        t->hasRand = true;
#endif
#endif
    }
    else if (!strcasecmp(isa, "avx1.1-x2")) {
        t->isa = Target::AVX11;
@@ -330,8 +344,14 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand";
        t->maskingIsFree = false;
        t->maskBitCount = 32;
#if !defined(LLVM_3_0)
        // LLVM 3.1+ only
        t->hasHalf = true;
#if !defined(LLVM_3_1)
        // LLVM 3.2+ only
        t->hasRand = true;
#endif
#endif
    }
#ifndef LLVM_3_0
    else if (!strcasecmp(isa, "avx2")) {
@@ -342,7 +362,11 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->maskingIsFree = false;
        t->maskBitCount = 32;
        t->hasHalf = true;
#if !defined(LLVM_3_1)
        // LLVM 3.2+ only
        t->hasRand = true;
        t->hasGather = true;
#endif
    }
    else if (!strcasecmp(isa, "avx2-x2")) {
        t->isa = Target::AVX2;
@@ -352,7 +376,11 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa,
        t->maskingIsFree = false;
        t->maskBitCount = 32;
        t->hasHalf = true;
#if !defined(LLVM_3_1)
        // LLVM 3.2+ only
        t->hasRand = true;
        t->hasGather = true;
#endif
    }
#endif // !LLVM_3_0
    else {

8  ispc.h
@@ -252,9 +252,15 @@ struct Target {
        conversions. */
    bool hasHalf;

    /** Indicates whether there is an ISA random number instruciton. */
    /** Indicates whether there is an ISA random number instruction. */
    bool hasRand;

    /** Indicates whether the target has a native gather instruction */
    bool hasGather;

    /** Indicates whether the target has a native scatter instruction */
    bool hasScatter;

    /** Indicates whether the target has support for transcendentals (beyond
        sqrt, which we assume that all of them handle). */
    bool hasTranscendentals;

@@ -265,11 +265,9 @@ def run_test(filename):
        gcc_arch = '-m64'

    gcc_isa=""
    if options.target == 'sse2' or options.target == 'sse2-x2':
        gcc_isa = '-msse3'
    if options.target == 'sse4' or options.target == 'sse4-x2' or options.target == 'generic-4':
    if options.target == 'generic-4':
        gcc_isa = '-msse4.2'
    if options.target == 'avx' or options.target == 'avx-x2' or options.target == 'generic-8':
    if options.target == 'generic-8':
        gcc_isa = '-mavx'
    if (options.target == 'generic-16' or options.target == 'generic-32' or options.target == 'generic-64') \
        and (options.include_file.find("knc.h")!=-1 or options.include_file.find("knc2x.h")!=-1):

185  stdlib.ispc
@@ -4068,3 +4068,188 @@ static inline void seed_rng(uniform RNGState * uniform state,
static inline void fastmath() {
    __fastmath();
}

///////////////////////////////////////////////////////////////////////////
// rdrand

static inline uniform bool rdrand(float * uniform ptr) {
    if (__have_native_rand == false)
        return false;
    else {
        uniform int32 irand;
        uniform bool success = __rdrand_i32(&irand);
        if (success) {
            irand &= (1<<23)-1;
            *ptr = floatbits(0x3F800000 | irand)-1.0f;
        }
        return success;
    }
}

static inline bool rdrand(varying float * uniform ptr) {
    if (__have_native_rand == false)
        return false;
    else {
        bool success = false;
        foreach_active (index) {
            uniform int32 irand;
            if (__rdrand_i32(&irand)) {
                // FIXME: it probably would be preferable, here and in the
                // following rdrand() function, to do the int->float stuff
                // in vector form.  However, we need to be careful to not
                // clobber any existing already-set values in *ptr with
                // inactive lanes here...
                irand &= (1<<23)-1;
                *ptr = floatbits(0x3F800000 | irand)-1.0f;
                success = true;
            }
        }
        return success;
    }
}

static inline bool rdrand(float * ptr) {
    if (__have_native_rand == false)
        return false;
    else {
        float * uniform ptrs[programCount];
        ptrs[programIndex] = ptr;

        bool success = false;
        foreach_active (index) {
            uniform int32 irand;
            if (__rdrand_i32(&irand)) {
                irand &= (1<<23)-1;
                *ptrs[index] = floatbits(0x3F800000 | irand)-1.0f;
                success = true;
            }
        }
        return success;
    }
}

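The bit manipulation in the float rdrand() variants above keeps the low 23 random bits as a mantissa and ORs in 0x3F800000, the bit pattern of 1.0f, yielding a float uniformly spaced in [1, 2); subtracting 1.0f then lands in [0, 1). A standalone sketch of the same trick for reference; the memcpy stands in for floatbits(), which reinterprets an integer's bits as a float (helper name is illustrative):

#include <cstdint>
#include <cstring>

// Map 23 random mantissa bits into a uniform float in [0, 1):
// 0x3F800000 | mantissa is a float in [1.0, 2.0), so subtracting
// 1.0f gives [0.0, 1.0) with uniform spacing of 2^-23.
static float rand_bits_to_unit_float(uint32_t irand) {
    irand &= (1u << 23) - 1;               // keep 23 mantissa bits
    uint32_t bits = 0x3F800000u | irand;   // exponent field of 1.0f
    float f;
    std::memcpy(&f, &bits, sizeof f);      // floatbits() equivalent
    return f - 1.0f;
}
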
static inline uniform bool rdrand(int16 * uniform ptr) {
    if (__have_native_rand == false)
        return false;
    else
        return __rdrand_i16(ptr);
}

static inline bool rdrand(varying int16 * uniform ptr) {
    if (__have_native_rand == false)
        return false;
    else {
        bool success = false;
        foreach_active (index) {
            uniform int16 irand;
            if (__rdrand_i16(&irand)) {
                *ptr = irand;
                success = true;
            }
        }
        return success;
    }
}

static inline bool rdrand(int16 * ptr) {
    if (__have_native_rand == false)
        return false;
    else {
        int16 * uniform ptrs[programCount];
        ptrs[programIndex] = ptr;
        bool success = false;

        foreach_active (index) {
            uniform int16 irand;
            if (__rdrand_i16(&irand)) {
                *ptrs[index] = irand;
                success = true;
            }
        }
        return success;
    }
}

static inline uniform bool rdrand(int32 * uniform ptr) {
    if (__have_native_rand == false)
        return false;
    else
        return __rdrand_i32(ptr);
}

static inline bool rdrand(varying int32 * uniform ptr) {
    if (__have_native_rand == false)
        return false;
    else {
        bool success = false;
        foreach_active (index) {
            uniform int32 irand;
            if (__rdrand_i32(&irand)) {
                *ptr = irand;
                success = true;
            }
        }
        return success;
    }
}

static inline bool rdrand(int32 * ptr) {
    if (__have_native_rand == false)
        return false;
    else {
        int32 * uniform ptrs[programCount];
        ptrs[programIndex] = ptr;
        bool success = false;

        foreach_active (index) {
            uniform int32 irand;
            if (__rdrand_i32(&irand)) {
                *ptrs[index] = irand;
                success = true;
            }
        }
        return success;
    }
}

static inline uniform bool rdrand(int64 * uniform ptr) {
    if (__have_native_rand == false)
        return false;
    else
        return __rdrand_i64(ptr);
}

static inline bool rdrand(varying int64 * uniform ptr) {
    if (__have_native_rand == false)
        return false;
    else {
        bool success = false;
        foreach_active (index) {
            uniform int64 irand;
            if (__rdrand_i64(&irand)) {
                *ptr = irand;
                success = true;
            }
        }
        return success;
    }
}

static inline bool rdrand(int64 * ptr) {
    if (__have_native_rand == false)
        return false;
    else {
        int64 * uniform ptrs[programCount];
        ptrs[programIndex] = ptr;
        bool success = false;

        foreach_active (index) {
            uniform int64 irand;
            if (__rdrand_i64(&irand)) {
                *ptrs[index] = irand;
                success = true;
            }
        }
        return success;
    }
}

17  tests/gather-double-1.ispc  Normal file
@@ -0,0 +1,17 @@

export uniform int width() { return programCount; }

int zero = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform double a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = a[programIndex+zero];
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}

17  tests/gather-double-2.ispc  Normal file
@@ -0,0 +1,17 @@

export uniform int width() { return programCount; }

int64 zero = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform double a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = a[programIndex+zero];
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}

21  tests/gather-double-3.ispc  Normal file
@@ -0,0 +1,21 @@

export uniform int width() { return programCount; }

int zero = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform double a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    if (programIndex < 2)
        g = a[programIndex+zero];
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}

21  tests/gather-double-4.ispc  Normal file
@@ -0,0 +1,21 @@

export uniform int width() { return programCount; }

int64 zero = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform double a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    if (programIndex < 2)
        g = a[programIndex+zero];
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}

19  tests/gather-double-5.ispc  Normal file
@@ -0,0 +1,19 @@

export uniform int width() { return programCount; }

int zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform double a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    double *ptr = (aFOO[0] == 1234) ? (double * varying)gptr : (a + programIndex);
    int g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}

19  tests/gather-double-6.ispc  Normal file
@@ -0,0 +1,19 @@

export uniform int width() { return programCount; }

int64 zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform double a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    double *ptr = (aFOO[0] == 1234) ? (double * varying)gptr : (a + programIndex);
    int g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}

23  tests/gather-double-7.ispc  Normal file
@@ -0,0 +1,23 @@

export uniform int width() { return programCount; }

int zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform double a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    double *ptr = (aFOO[0] == 1234) ? (double * varying)gptr : (a + programIndex);
    if (programIndex < 2)
        g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}

23  tests/gather-double-8.ispc  Normal file
@@ -0,0 +1,23 @@

export uniform int width() { return programCount; }

int64 zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform double a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    double *ptr = (aFOO[0] == 1234) ? (double * varying)gptr : (a + programIndex);
    if (programIndex < 2)
        g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}

17  tests/gather-float-1.ispc  Normal file
@@ -0,0 +1,17 @@

export uniform int width() { return programCount; }

int zero = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform float a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = a[programIndex+zero];
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}

17  tests/gather-float-2.ispc  Normal file
@@ -0,0 +1,17 @@

export uniform int width() { return programCount; }

int64 zero = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform float a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = a[programIndex+zero];
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}

21  tests/gather-float-3.ispc  Normal file
@@ -0,0 +1,21 @@

export uniform int width() { return programCount; }

int zero = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform float a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    if (programIndex < 2)
        g = a[programIndex+zero];
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}

21  tests/gather-float-4.ispc  Normal file
@@ -0,0 +1,21 @@

export uniform int width() { return programCount; }

int64 zero = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform float a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    if (programIndex < 2)
        g = a[programIndex+zero];
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}

19  tests/gather-float-5.ispc  Normal file
@@ -0,0 +1,19 @@

export uniform int width() { return programCount; }

int zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform float a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    float *ptr = (aFOO[0] == 1234) ? (float * varying)gptr : (a + programIndex);
    int g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}

19  tests/gather-float-6.ispc  Normal file
@@ -0,0 +1,19 @@

export uniform int width() { return programCount; }

int64 zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform float a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    float *ptr = (aFOO[0] == 1234) ? (float * varying)gptr : (a + programIndex);
    int g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}

23  tests/gather-float-7.ispc  Normal file
@@ -0,0 +1,23 @@

export uniform int width() { return programCount; }

int zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform float a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    float *ptr = (aFOO[0] == 1234) ? (float * varying)gptr : (a + programIndex);
    if (programIndex < 2)
        g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}

23  tests/gather-float-8.ispc  Normal file
@@ -0,0 +1,23 @@

export uniform int width() { return programCount; }

int64 zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform float a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    float *ptr = (aFOO[0] == 1234) ? (float * varying)gptr : (a + programIndex);
    if (programIndex < 2)
        g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}

@@ -1,19 +1,17 @@
 
 export uniform int width() { return programCount; }
 
-export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    uniform int16 x[programCount];
-    x[programIndex] = programIndex;
-    int a = aFOO[programIndex]-1;
-    unsigned int16 v;
-    if (programIndex < 2)
-        v = x[a];
-    else
-        v = 2;
-    RET[programIndex] = v;
+int zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int16 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = a[programIndex+zero];
+    RET[programIndex] = g;
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 2;
-    RET[0] = 0;
-    RET[1] = 1;
+    RET[programIndex] = 1 + programIndex;
 }
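This hunk rewrites the old `f_fu` shuffle-style test into the same indexed form as the new files. Here the index itself is what hides the pattern: `programIndex + zero` is the identity at run time, but because `zero` is a global the compiler cannot assume that, so a gather is still required; the sibling files that declare `int64 zero` push the same index through 64-bit arithmetic. Roughly:

    uniform int16 a[programCount];
    int g1 = a[programIndex];          // linear index: plain vector load
    int g2 = a[programIndex + zero];   // 'zero' opaque to the optimizer: gather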
17  tests/gather-int16-2.ispc  Normal file
@@ -0,0 +1,17 @@

export uniform int width() { return programCount; }

int64 zero = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int16 a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = a[programIndex+zero];
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}
21  tests/gather-int16-3.ispc  Normal file
@@ -0,0 +1,21 @@

export uniform int width() { return programCount; }

int zero = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int16 a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    if (programIndex < 2)
        g = a[programIndex+zero];
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}
21  tests/gather-int16-4.ispc  Normal file
@@ -0,0 +1,21 @@

export uniform int width() { return programCount; }

int64 zero = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int16 a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    if (programIndex < 2)
        g = a[programIndex+zero];
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}
19  tests/gather-int16-5.ispc  Normal file
@@ -0,0 +1,19 @@

export uniform int width() { return programCount; }

int zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int16 a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int16 *ptr = (aFOO[0] == 1234) ? (int16 * varying)gptr : (a + programIndex);
    int g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}
19  tests/gather-int16-6.ispc  Normal file
@@ -0,0 +1,19 @@

export uniform int width() { return programCount; }

int64 zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int16 a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int16 *ptr = (aFOO[0] == 1234) ? (int16 * varying)gptr : (a + programIndex);
    int g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}
23  tests/gather-int16-7.ispc  Normal file
@@ -0,0 +1,23 @@

export uniform int width() { return programCount; }

int zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int16 a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    int16 *ptr = (aFOO[0] == 1234) ? (int16 * varying)gptr : (a + programIndex);
    if (programIndex < 2)
        g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}
23  tests/gather-int16-8.ispc  Normal file
@@ -0,0 +1,23 @@

export uniform int width() { return programCount; }

int64 zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int16 a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    int16 *ptr = (aFOO[0] == 1234) ? (int16 * varying)gptr : (a + programIndex);
    if (programIndex < 2)
        g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}
17  tests/gather-int32-1.ispc  Normal file
@@ -0,0 +1,17 @@

export uniform int width() { return programCount; }

int zero = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = a[programIndex+zero];
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}
17  tests/gather-int32-2.ispc  Normal file
@@ -0,0 +1,17 @@

export uniform int width() { return programCount; }

int64 zero = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = a[programIndex+zero];
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}
21  tests/gather-int32-3.ispc  Normal file
@@ -0,0 +1,21 @@

export uniform int width() { return programCount; }

int zero = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    if (programIndex < 2)
        g = a[programIndex+zero];
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}
21  tests/gather-int32-4.ispc  Normal file
@@ -0,0 +1,21 @@

export uniform int width() { return programCount; }

int64 zero = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    if (programIndex < 2)
        g = a[programIndex+zero];
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}
19  tests/gather-int32-5.ispc  Normal file
@@ -0,0 +1,19 @@

export uniform int width() { return programCount; }

int zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int *ptr = (aFOO[0] == 1234) ? (int * varying)gptr : (a + programIndex);
    int g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}
19  tests/gather-int32-6.ispc  Normal file
@@ -0,0 +1,19 @@

export uniform int width() { return programCount; }

int64 zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int *ptr = (aFOO[0] == 1234) ? (int * varying)gptr : (a + programIndex);
    int g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}
23  tests/gather-int32-7.ispc  Normal file
@@ -0,0 +1,23 @@

export uniform int width() { return programCount; }

int zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    int *ptr = (aFOO[0] == 1234) ? (int * varying)gptr : (a + programIndex);
    if (programIndex < 2)
        g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}
23  tests/gather-int32-8.ispc  Normal file
@@ -0,0 +1,23 @@

export uniform int width() { return programCount; }

int64 zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    int *ptr = (aFOO[0] == 1234) ? (int * varying)gptr : (a + programIndex);
    if (programIndex < 2)
        g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}
17  tests/gather-int64-1.ispc  Normal file
@@ -0,0 +1,17 @@

export uniform int width() { return programCount; }

int zero = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int64 a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = a[programIndex+zero];
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}
17  tests/gather-int64-2.ispc  Normal file
@@ -0,0 +1,17 @@

export uniform int width() { return programCount; }

int64 zero = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int64 a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = a[programIndex+zero];
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}
21  tests/gather-int64-3.ispc  Normal file
@@ -0,0 +1,21 @@

export uniform int width() { return programCount; }

int zero = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int64 a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    if (programIndex < 2)
        g = a[programIndex+zero];
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}
21  tests/gather-int64-4.ispc  Normal file
@@ -0,0 +1,21 @@

export uniform int width() { return programCount; }

int64 zero = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int64 a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    if (programIndex < 2)
        g = a[programIndex+zero];
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}
19  tests/gather-int64-5.ispc  Normal file
@@ -0,0 +1,19 @@

export uniform int width() { return programCount; }

int zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int64 a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int64 *ptr = (aFOO[0] == 1234) ? (int64 * varying)gptr : (a + programIndex);
    int g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}
19  tests/gather-int64-6.ispc  Normal file
@@ -0,0 +1,19 @@

export uniform int width() { return programCount; }

int64 zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int64 a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int64 *ptr = (aFOO[0] == 1234) ? (int64 * varying)gptr : (a + programIndex);
    int g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}
23  tests/gather-int64-7.ispc  Normal file
@@ -0,0 +1,23 @@

export uniform int width() { return programCount; }

int zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int64 a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    int64 *ptr = (aFOO[0] == 1234) ? (int64 * varying)gptr : (a + programIndex);
    if (programIndex < 2)
        g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}
23  tests/gather-int64-8.ispc  Normal file
@@ -0,0 +1,23 @@

export uniform int width() { return programCount; }

int64 zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int64 a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    int64 *ptr = (aFOO[0] == 1234) ? (int64 * varying)gptr : (a + programIndex);
    if (programIndex < 2)
        g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}
@@ -1,19 +1,17 @@
 
 export uniform int width() { return programCount; }
 
-export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    uniform int8 x[programCount];
-    x[programIndex] = programIndex;
-    int a = aFOO[programIndex]-1;
-    unsigned int8 v;
-    if (programIndex < 2)
-        v = x[a];
-    else
-        v = 2;
-    RET[programIndex] = v;
+int zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int8 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = a[programIndex+zero];
+    RET[programIndex] = g;
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 2;
-    RET[0] = 0;
-    RET[1] = 1;
+    RET[programIndex] = 1 + programIndex;
 }
17  tests/gather-int8-2.ispc  Normal file
@@ -0,0 +1,17 @@

export uniform int width() { return programCount; }

int64 zero = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int8 a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = a[programIndex+zero];
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}
21  tests/gather-int8-3.ispc  Normal file
@@ -0,0 +1,21 @@

export uniform int width() { return programCount; }

int zero = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int8 a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    if (programIndex < 2)
        g = a[programIndex+zero];
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}
21  tests/gather-int8-4.ispc  Normal file
@@ -0,0 +1,21 @@

export uniform int width() { return programCount; }

int64 zero = 0;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int8 a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    if (programIndex < 2)
        g = a[programIndex+zero];
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}
19  tests/gather-int8-5.ispc  Normal file
@@ -0,0 +1,19 @@

export uniform int width() { return programCount; }

int zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int8 a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int8 *ptr = (aFOO[0] == 1234) ? (int8 * varying)gptr : (a + programIndex);
    int g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}
19  tests/gather-int8-6.ispc  Normal file
@@ -0,0 +1,19 @@

export uniform int width() { return programCount; }

int64 zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int8 a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int8 *ptr = (aFOO[0] == 1234) ? (int8 * varying)gptr : (a + programIndex);
    int g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1 + programIndex;
}
23  tests/gather-int8-7.ispc  Normal file
@@ -0,0 +1,23 @@

export uniform int width() { return programCount; }

int zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int8 a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    int8 *ptr = (aFOO[0] == 1234) ? (int8 * varying)gptr : (a + programIndex);
    if (programIndex < 2)
        g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}
23  tests/gather-int8-8.ispc  Normal file
@@ -0,0 +1,23 @@

export uniform int width() { return programCount; }

int64 zero = 0;
void *gptr;

export void f_f(uniform float RET[], uniform float aFOO[]) {
    uniform int8 a[programCount];
    for (uniform int i = 0; i < programCount; ++i)
        a[i] = aFOO[i];

    int g = 0;
    int8 *ptr = (aFOO[0] == 1234) ? (int8 * varying)gptr : (a + programIndex);
    if (programIndex < 2)
        g = *ptr;
    RET[programIndex] = g;
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
    RET[0] = 1;
    RET[1] = 2;
}
21  tests/rdrand-1.ispc  Normal file
@@ -0,0 +1,21 @@

export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
    RET[programIndex] = 1;
#else

    uniform float r = -1;
    uniform int count = 0;
    while (!rdrand(&r)) {
        ++count;
    }
    RET[programIndex] = (r >= 0 && r < 1);

#endif
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1;
}
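The retry loop reflects the semantics of the underlying RDRAND instruction: it can transiently fail to produce data, in which case `rdrand()` returns false and the destination is not usable, so callers spin until it succeeds. The preprocessor guard keeps the body a trivial pass on targets without the instruction. The basic shape, for a uniform value:

    uniform float r;
    while (!rdrand(&r))    // false: no random data this attempt, retry
        ;
    // success: r is a random float, checked above to lie in [0, 1)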
19  tests/rdrand-2.ispc  Normal file
@@ -0,0 +1,19 @@

export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
    RET[programIndex] = 1;
#else

    float r = -1;
    while (!rdrand(&r))
        ;
    RET[programIndex] = (r >= 0 && r < 1);

#endif
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1;
}
25  tests/rdrand-3.ispc  Normal file
@@ -0,0 +1,25 @@

export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
    RET[programIndex] = 1;
#else

    int lessHalf = 0, moreHalf = 0;
    for (uniform int i = 0; i < 1024*1024; ++i) {
        float r = -1;
        while (!rdrand(&r))
            ;
        if (r < 0.5) ++lessHalf;
        else ++moreHalf;
    }

    float r = (double)lessHalf / (double)(lessHalf + moreHalf);
    RET[programIndex] = (r >= .49 && r < .51);
#endif
}

export void result(uniform float RET[]) {
    RET[programIndex] = 1;
}
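rdrand-3 is a coarse distribution check: over 2^20 draws, the fraction falling below 0.5 must land in [0.49, 0.51). For a fair source that fraction has standard deviation sqrt(0.25 / 2^20) ≈ 0.0005, so the 0.01 tolerance is roughly 20 sigma: a correct generator essentially cannot fail it by chance, while a stuck or heavily biased one will. rdrand-4 and rdrand-5 below apply the same bound bit by bit, counting how often each of the 64 (or 32) result bits is set.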
33  tests/rdrand-4.ispc  Normal file
@@ -0,0 +1,33 @@

export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
    RET[programIndex] = 0;
#else

    uniform int set[64] = { 0 };
    uniform int count = 1024*1024;
    for (uniform int i = 0; i < count; ++i) {
        uniform int64 r;
        while (!rdrand(&r))
            ;
        for (uniform int b = 0; b < 64; ++b)
            if (((unsigned int64)r >> b) & 1)
                ++set[b];
    }

    RET[programIndex] = 0;
    for (uniform int b = 0; b < 64; ++b) {
        float r = (double)set[b] / (double)(count);
        if (!(r >= .49 && r < .51)) {
            print("% % - %\n", b, r, set[b]);
            ++RET[programIndex];
        }
    }
#endif
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
}
33  tests/rdrand-5.ispc  Normal file
@@ -0,0 +1,33 @@

export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
    RET[programIndex] = 0;
#else

    int set[32] = { 0 };
    uniform int count = 1024*1024;
    for (uniform int i = 0; i < count; ++i) {
        int32 r;
        while (!rdrand(&r))
            ;
        for (uniform int b = 0; b < 32; ++b)
            if (((unsigned int32)r >> b) & 1)
                ++set[b];
    }

    RET[programIndex] = 0;
    for (uniform int b = 0; b < 32; ++b) {
        float r = (double)set[b] / (double)(count);
        if (!(r >= .49 && r < .51)) {
            print("% % - %\n", b, r, set[b]);
            ++RET[programIndex];
        }
    }
#endif
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
}
35  tests/rdrand-6.ispc  Normal file
@@ -0,0 +1,35 @@

export uniform int width() { return programCount; }

export void f_f(uniform float RET[], uniform float aFOO[]) {
#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
    RET[programIndex] = 0;
#else

    int set[32] = { 0 };
    uniform int count = 1024*1024;
    for (uniform int i = 0; i < count; ++i) {
        uniform int32 rr[programCount];
        int * ptr = rr + programIndex;
        while (!rdrand(ptr))
            ;
        int32 r = rr[programIndex];
        for (uniform int b = 0; b < 32; ++b)
            if (((unsigned int32)r >> b) & 1)
                ++set[b];
    }

    RET[programIndex] = 0;
    for (uniform int b = 0; b < 32; ++b) {
        float r = (double)set[b] / (double)(count);
        if (!(r >= .49 && r < .51)) {
            print("% % - %\n", b, r, set[b]);
            ++RET[programIndex];
        }
    }
#endif
}

export void result(uniform float RET[]) {
    RET[programIndex] = 0;
}
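rdrand-6 performs the same per-bit census as rdrand-5 but obtains the values through per-lane pointers: each program instance retries `rdrand()` on its own slot of the uniform `rr` array, exercising the varying-pointer form of the call rather than the varying-value form, and then reads its lane back with `rr[programIndex]`.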