diff --git a/builtins.cpp b/builtins.cpp index 00f72fc8..64f06e1f 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -476,6 +476,9 @@ lSetInternalFunctions(llvm::Module *module) { "__prefetch_read_uniform_nt", "__rcp_uniform_float", "__rcp_varying_float", + "__rdrand_i16", + "__rdrand_i32", + "__rdrand_i64", "__reduce_add_double", "__reduce_add_float", "__reduce_add_int32", diff --git a/builtins/dispatch.ll b/builtins/dispatch.ll index b9db3543..f1d5a969 100644 --- a/builtins/dispatch.ll +++ b/builtins/dispatch.ll @@ -76,18 +76,19 @@ declare void @abort() noreturn ;; /* NOTE: the values returned below must be the same as the ;; corresponding enumerant values in Target::ISA. */ ;; if ((info[2] & (1 << 28)) != 0) { -;; // AVX1 for sure. Do we have AVX2? -;; // Call cpuid with eax=7, ecx=0 -;; __cpuid_count(info, 7, 0); -;; if ((info[1] & (1 << 5)) != 0) -;; return 4; // AVX2 -;; else { -;; if ((info[2] & (1 << 29)) != 0 && // F16C -;; (info[2] & (1 << 30)) != 0) // RDRAND -;; return 3; // AVX1 on IVB -;; else -;; return 2; // AVX1 -;; } +;; if ((info[2] & (1 << 29)) != 0 && // F16C +;; (info[2] & (1 << 30)) != 0) { // RDRAND +;; // So far, so good. AVX2? +;; // Call cpuid with eax=7, ecx=0 +;; int info2[4]; +;; __cpuid_count(info2, 7, 0); +;; if ((info2[1] & (1 << 5)) != 0) +;; return 4; +;; else +;; return 3; +;; } +;; // Regular AVX +;; return 2; ;; } ;; else if ((info[2] & (1 << 19)) != 0) ;; return 1; // SSE4 @@ -104,40 +105,37 @@ entry: %asmresult6.i = extractvalue { i32, i32, i32, i32 } %0, 3 %and = and i32 %asmresult5.i, 268435456 %cmp = icmp eq i32 %and, 0 - br i1 %cmp, label %if.else14, label %if.then + br i1 %cmp, label %if.else13, label %if.then if.then: ; preds = %entry - %1 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind - %asmresult4.i29 = extractvalue { i32, i32, i32, i32 } %1, 1 - %and3 = and i32 %asmresult4.i29, 32 - %cmp4 = icmp eq i32 %and3, 0 - br i1 %cmp4, label %if.else, label %return + %1 = and i32 %asmresult5.i, 1610612736 + %2 = icmp eq i32 %1, 1610612736 + br i1 %2, label %if.then7, label %return -if.else: ; preds = %if.then - %asmresult5.i30 = extractvalue { i32, i32, i32, i32 } %1, 2 - %2 = and i32 %asmresult5.i30, 1610612736 - %3 = icmp eq i32 %2, 1610612736 - br i1 %3, label %return, label %if.else13 - -if.else13: ; preds = %if.else +if.then7: ; preds = %if.then + %3 = tail call { i32, i32, i32, i32 } asm sideeffect "xchg$(l$)\09$(%$)ebx, $1\0A\09cpuid\0A\09xchg$(l$)\09$(%$)ebx, $1\0A\09", "={ax},=r,={cx},={dx},0,2,~{dirflag},~{fpsr},~{flags}"(i32 7, i32 0) nounwind + %asmresult4.i28 = extractvalue { i32, i32, i32, i32 } %3, 1 + %and10 = lshr i32 %asmresult4.i28, 5 + %4 = and i32 %and10, 1 + %5 = add i32 %4, 3 br label %return -if.else14: ; preds = %entry - %and16 = and i32 %asmresult5.i, 524288 - %cmp17 = icmp eq i32 %and16, 0 - br i1 %cmp17, label %if.else19, label %return +if.else13: ; preds = %entry + %and15 = and i32 %asmresult5.i, 524288 + %cmp16 = icmp eq i32 %and15, 0 + br i1 %cmp16, label %if.else18, label %return -if.else19: ; preds = %if.else14 - %and21 = and i32 %asmresult6.i, 67108864 - %cmp22 = icmp eq i32 %and21, 0 - br i1 %cmp22, label %if.else24, label %return +if.else18: ; preds = %if.else13 + %and20 = and i32 %asmresult6.i, 67108864 + %cmp21 = icmp eq i32 %and20, 0 + br i1 %cmp21, label %if.else23, label %return -if.else24: ; preds = %if.else19 +if.else23: ; preds = %if.else18 tail call void 
@abort() noreturn nounwind unreachable -return: ; preds = %if.else19, %if.else14, %if.else13, %if.else, %if.then - %retval.0 = phi i32 [ 2, %if.else13 ], [ 4, %if.then ], [ 3, %if.else ], [ 1, %if.else14 ], [ 0, %if.else19 ] +return: ; preds = %if.else18, %if.else13, %if.then7, %if.then + %retval.0 = phi i32 [ %5, %if.then7 ], [ 2, %if.then ], [ 1, %if.else13 ], [ 0, %if.else18 ] ret i32 %retval.0 } diff --git a/builtins/target-avx1-x2.ll b/builtins/target-avx1-x2.ll index efde5d10..562d7ff0 100644 --- a/builtins/target-avx1-x2.ll +++ b/builtins/target-avx1-x2.ll @@ -31,6 +31,8 @@ include(`target-avx-x2.ll') +rdrand_decls() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -71,9 +73,9 @@ declare @__float_to_half_varying( %v) nounwind read ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) diff --git a/builtins/target-avx1.ll b/builtins/target-avx1.ll index 64f8ad33..9c86cab8 100644 --- a/builtins/target-avx1.ll +++ b/builtins/target-avx1.ll @@ -31,6 +31,8 @@ include(`target-avx.ll') +rdrand_decls() + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -71,9 +73,9 @@ declare @__float_to_half_varying( %v) nounwind read ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) diff --git a/builtins/target-avx11-x2.ll b/builtins/target-avx11-x2.ll index 884255df..1aa6345c 100644 --- a/builtins/target-avx11-x2.ll +++ b/builtins/target-avx11-x2.ll @@ -29,13 +29,55 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
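For reference, the rewritten dispatch logic in builtins/dispatch.ll above now gates the leaf-7 AVX2 probe behind the F16C and RDRAND checks, so a plain-AVX CPU can never be classified as ISA 3 or 4. A minimal C sketch of the same control flow, assuming GCC/Clang's <cpuid.h> (lGetSystemISA is a hypothetical name; bit positions and return values come straight from the hunk and mirror Target::ISA):

#include <cpuid.h>
#include <stdlib.h>

static int lGetSystemISA(void) {
    unsigned int eax, ebx, ecx, edx;
    __cpuid(1, eax, ebx, ecx, edx);
    if (ecx & (1u << 28)) {                  /* AVX */
        if ((ecx & (1u << 29)) != 0 &&       /* F16C */
            (ecx & (1u << 30)) != 0) {       /* RDRAND */
            /* So far, so good; is AVX2 also available?  (leaf 7) */
            unsigned int eax7, ebx7, ecx7, edx7;
            __cpuid_count(7, 0, eax7, ebx7, ecx7, edx7);
            return (ebx7 & (1u << 5)) ? 4    /* AVX2 */
                                      : 3;   /* AVX1.1 (IVB) */
        }
        return 2;                            /* regular AVX */
    }
    if (ecx & (1u << 19))                    /* SSE4.1 */
        return 1;
    if (edx & (1u << 26))                    /* SSE2 */
        return 0;
    abort();                                 /* unsupported CPU */
}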
-define(`NO_HALF_DECLARES', `1') +include(`target-avx-x2.ll') -include(`target-avx1-x2.ll') +ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', + LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int min/max + +define <16 x i32> @__min_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(ret, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <16 x i32> %ret +} + +define <16 x i32> @__max_varying_int32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <16 x i32> %ret +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unsigned int min/max + +define <16 x i32> @__min_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(ret, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <16 x i32> %ret +} + +define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonly alwaysinline { + binary4to16(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <16 x i32> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +gen_gather(i8) +gen_gather(i16) +gen_gather(i32) +gen_gather(float) +gen_gather(i64) +gen_gather(double) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float/half conversions +ifelse(LLVM_VERSION, `LLVM_3_0', ` +;; nothing to define... +', ` declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone ; 0 is round nearest even declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone @@ -86,4 +128,5 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone { %r = extractelement <8 x i16> %rv, i32 0 ret i16 %r } - +' +) diff --git a/builtins/target-avx11.ll b/builtins/target-avx11.ll index 35aebe91..fea0a7c2 100644 --- a/builtins/target-avx11.ll +++ b/builtins/target-avx11.ll @@ -29,13 +29,55 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
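A note on the rdrand_decls()/rdrand_definition() selection used in these AVX1.1 targets: under LLVM 3.0/3.1 the __rdrand_i{16,32,64} builtins are only declared (and resolved via the lSetInternalFunctions list added in builtins.cpp), while any newer LLVM gets an actual definition from the m4 macro. The underlying contract is RDRAND's carry flag: CF=1 means the DRNG delivered a value, CF=0 means the caller should retry. A rough C sketch of that contract, assuming the <immintrin.h> _rdrand32_step intrinsic (this is an illustration, not the macro's actual IR):

#include <immintrin.h>   /* compile with -mrdrnd */
#include <stdbool.h>

/* Sketch of the contract behind __rdrand_i32: true iff the hardware
   RNG had a random value ready; on failure *result is not written. */
bool rdrand_i32(int *result) {
    unsigned int r;
    if (_rdrand32_step(&r)) {   /* CF = 1: success */
        *result = (int)r;
        return true;
    }
    return false;               /* CF = 0: no entropy available yet */
}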
-define(`NO_HALF_DECLARES', `1') +include(`target-avx.ll') -include(`target-avx1.ll') +ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', + LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int min/max + +define <8 x i32> @__min_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(ret, i32, @llvm.x86.sse41.pminsd, %0, %1) + ret <8 x i32> %ret +} + +define <8 x i32> @__max_varying_int32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(ret, i32, @llvm.x86.sse41.pmaxsd, %0, %1) + ret <8 x i32> %ret +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; unsigned int min/max + +define <8 x i32> @__min_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(ret, i32, @llvm.x86.sse41.pminud, %0, %1) + ret <8 x i32> %ret +} + +define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly alwaysinline { + binary4to8(ret, i32, @llvm.x86.sse41.pmaxud, %0, %1) + ret <8 x i32> %ret +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; gather + +gen_gather(i8) +gen_gather(i16) +gen_gather(i32) +gen_gather(float) +gen_gather(i64) +gen_gather(double) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float/half conversions +ifelse(LLVM_VERSION, `LLVM_3_0', ` +;; nothing to define... +', ` declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone ; 0 is round nearest even declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone @@ -70,3 +112,4 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone { %r = extractelement <8 x i16> %rv, i32 0 ret i16 %r } +') diff --git a/builtins/target-avx2-x2.ll b/builtins/target-avx2-x2.ll index 1ca3443c..053fd078 100644 --- a/builtins/target-avx2-x2.ll +++ b/builtins/target-avx2-x2.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2010-2011, Intel Corporation +;; Copyright (c) 2010-2012, Intel Corporation ;; All rights reserved. ;; ;; Redistribution and use in source and binary forms, with or without @@ -29,8 +29,16 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +ifelse(LLVM_VERSION, `LLVM_3_0', `', + LLVM_VERSION, `LLVM_3_1', `', + `define(`HAVE_GATHER', `1')') + include(`target-avx-x2.ll') +ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', + LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -66,6 +74,9 @@ define <16 x i32> @__max_varying_uint32(<16 x i32>, <16 x i32>) nounwind readonl ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float/half conversions +ifelse(LLVM_VERSION, `LLVM_3_0', ` +;; nothing to define... 
+', ` declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone ; 0 is round nearest even declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone @@ -116,14 +127,435 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone { %r = extractelement <8 x i16> %rv, i32 0 ret i16 %r } - +') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather +declare void @llvm.trap() noreturn nounwind + +; $1: type +; $2: var base name +define(`extract_4s', ` + %$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> + %$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> + %$2_3 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> + %$2_4 = shufflevector <16 x $1> %$2, <16 x $1> undef, <4 x i32> +') + +; $1: type +; $2: var base name +define(`extract_8s', ` + %$2_1 = shufflevector <16 x $1> %$2, <16 x $1> undef, + <8 x i32> + %$2_2 = shufflevector <16 x $1> %$2, <16 x $1> undef, + <8 x i32> +') + +; $1: element type +; $2: ret name +; $3: v1 +; $4: v2 +define(`assemble_8s', ` + %$2 = shufflevector <8 x $1> %$3, <8 x $1> %$4, + <16 x i32> +') + +; $1: element type +; $2: ret name +; $3: v1 +; $4: v2 +; $5: v3 +; $6: v4 +define(`assemble_4s', ` + %$2_1 = shufflevector <4 x $1> %$3, <4 x $1> %$4, + <8 x i32> + %$2_2 = shufflevector <4 x $1> %$5, <4 x $1> %$6, + <8 x i32> + assemble_8s($1, $2, $2_1, $2_2) +') + +ifelse(LLVM_VERSION, `LLVM_3_0', ` +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double)', +LLVM_VERSION, `LLVM_3_1', ` +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double)', ` + gen_gather(i8) gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 gathers + +declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr, + <8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind +declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr, + <4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind + +define <16 x i32> @__gather_base_offsets32_i32(i8 * %ptr, i32 %scale, <16 x i32> %offsets, + <16 x i32> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + extract_8s(i32, offsets) + extract_8s(i32, vecmask) + + %v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr, + <8 x i32> %offsets_1, <8 x i32> %vecmask_1, i8 %scale8) + %v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr, + <8 x i32> %offsets_2, <8 x i32> %vecmask_2, i8 %scale8) + + assemble_8s(i32, v, v1, v2) + + ret <16 x i32> %v +} + + +define <16 x i32> @__gather_base_offsets64_i32(i8 * %ptr, + i32 %scale, <16 x i64> %offsets, + <16 x i32> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + + extract_4s(i32, vecmask) + extract_4s(i64, offsets) + + %v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8) + %v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8) + %v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr, + <4 x i64> %offsets_3, <4 x i32> %vecmask_3, i8 %scale8) + %v4 = call 
<4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr, + <4 x i64> %offsets_4, <4 x i32> %vecmask_4, i8 %scale8) + + assemble_4s(i32, v, v1, v2, v3, v4) + + ret <16 x i32> %v +} + + +define <16 x i32> @__gather32_i32(<16 x i32> %ptrs, + <16 x i32> %vecmask) nounwind readonly alwaysinline { + extract_8s(i32, ptrs) + extract_8s(i32, vecmask) + + %v1 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null, + <8 x i32> %ptrs_1, <8 x i32> %vecmask_1, i8 1) + %v2 = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null, + <8 x i32> %ptrs_2, <8 x i32> %vecmask_2, i8 1) + + assemble_8s(i32, v, v1, v2) + + ret <16 x i32> %v +} + + +define <16 x i32> @__gather64_i32(<16 x i64> %ptrs, + <16 x i32> %vecmask) nounwind readonly alwaysinline { + extract_4s(i64, ptrs) + extract_4s(i32, vecmask) + + %v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1) + %v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1) + %v3 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null, + <4 x i64> %ptrs_3, <4 x i32> %vecmask_3, i8 1) + %v4 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null, + <4 x i64> %ptrs_4, <4 x i32> %vecmask_4, i8 1) + + assemble_4s(i32, v, v1, v2, v3, v4) + + ret <16 x i32> %v +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float gathers + +declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr, + <8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind +declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %target, i8 * %ptr, + <4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind + +define <16 x float> @__gather_base_offsets32_float(i8 * %ptr, + i32 %scale, <16 x i32> %offsets, + <16 x i32> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %mask = bitcast <16 x i32> %vecmask to <16 x float> + extract_8s(i32, offsets) + extract_8s(float, mask) + + %v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr, + <8 x i32> %offsets_1, <8 x float> %mask_1, i8 %scale8) + %v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr, + <8 x i32> %offsets_2, <8 x float> %mask_2, i8 %scale8) + + assemble_8s(float, v, v1, v2) + + ret <16 x float> %v +} + + +define <16 x float> @__gather_base_offsets64_float(i8 * %ptr, + i32 %scale, <16 x i64> %offsets, + <16 x i32> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %mask = bitcast <16 x i32> %vecmask to <16 x float> + extract_4s(i64, offsets) + extract_4s(float, mask) + + %v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x float> %mask_1, i8 %scale8) + %v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x float> %mask_2, i8 %scale8) + %v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr, + <4 x i64> %offsets_3, <4 x float> %mask_3, i8 %scale8) + %v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr, + <4 x i64> %offsets_4, <4 x float> %mask_4, i8 %scale8) + + assemble_4s(float, v, v1, v2, v3, v4) + + ret <16 x float> %v +} + + +define <16 x float> @__gather32_float(<16 x i32> %ptrs, + <16 x i32> %vecmask) nounwind readonly alwaysinline { + %mask = 
bitcast <16 x i32> %vecmask to <16 x float> + extract_8s(float, mask) + extract_8s(i32, ptrs) + + %v1 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null, + <8 x i32> %ptrs_1, <8 x float> %mask_1, i8 1) + %v2 = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null, + <8 x i32> %ptrs_2, <8 x float> %mask_2, i8 1) + + assemble_8s(float, v, v1, v2) + + ret <16 x float> %v +} + + +define <16 x float> @__gather64_float(<16 x i64> %ptrs, + <16 x i32> %vecmask) nounwind readonly alwaysinline { + %mask = bitcast <16 x i32> %vecmask to <16 x float> + extract_4s(i64, ptrs) + extract_4s(float, mask) + + %v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x float> %mask_1, i8 1) + %v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x float> %mask_2, i8 1) + %v3 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null, + <4 x i64> %ptrs_3, <4 x float> %mask_3, i8 1) + %v4 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null, + <4 x i64> %ptrs_4, <4 x float> %mask_4, i8 1) + + assemble_4s(float, v, v1, v2, v3, v4) + + ret <16 x float> %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int64 gathers + +declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr, + <4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind +declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr, + <4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind + +define <16 x i64> @__gather_base_offsets32_i64(i8 * %ptr, + i32 %scale, <16 x i32> %offsets, + <16 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = sext <16 x i32> %mask32 to <16 x i64> + extract_4s(i32, offsets) + extract_4s(i64, vecmask) + + %v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i32> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8) + %v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i32> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8) + %v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i32> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8) + %v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i32> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8) + + assemble_4s(i64, v, v1, v2, v3, v4) + + ret <16 x i64> %v +} + + +define <16 x i64> @__gather_base_offsets64_i64(i8 * %ptr, + i32 %scale, <16 x i64> %offsets, + <16 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = sext <16 x i32> %mask32 to <16 x i64> + extract_4s(i64, offsets) + extract_4s(i64, vecmask) + + %v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8) + %v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8) + %v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i64> %offsets_3, <4 x i64> %vecmask_3, i8 %scale8) + %v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i64> %offsets_4, <4 x i64> %vecmask_4, i8 %scale8) + + assemble_4s(i64, v, v1, v2, v3, v4) + + ret <16 x i64> %v +} + + +define <16 x i64> @__gather32_i64(<16 x i32> %ptrs, + <16 x i32> %mask32) 
nounwind readonly alwaysinline { + %vecmask = sext <16 x i32> %mask32 to <16 x i64> + extract_4s(i32, ptrs) + extract_4s(i64, vecmask) + + %v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null, + <4 x i32> %ptrs_1, <4 x i64> %vecmask_1, i8 1) + %v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null, + <4 x i32> %ptrs_2, <4 x i64> %vecmask_2, i8 1) + %v3 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null, + <4 x i32> %ptrs_3, <4 x i64> %vecmask_3, i8 1) + %v4 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null, + <4 x i32> %ptrs_4, <4 x i64> %vecmask_4, i8 1) + + assemble_4s(i64, v, v1, v2, v3, v4) + + ret <16 x i64> %v +} + +define <16 x i64> @__gather64_i64(<16 x i64> %ptrs, + <16 x i32> %mask32) nounwind readonly alwaysinline { + %vecmask = sext <16 x i32> %mask32 to <16 x i64> + extract_4s(i64, ptrs) + extract_4s(i64, vecmask) + + %v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x i64> %vecmask_1, i8 1) + %v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x i64> %vecmask_2, i8 1) + %v3 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null, + <4 x i64> %ptrs_3, <4 x i64> %vecmask_3, i8 1) + %v4 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null, + <4 x i64> %ptrs_4, <4 x i64> %vecmask_4, i8 1) + + assemble_4s(i64, v, v1, v2, v3, v4) + + ret <16 x i64> %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double gathers + +declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr, + <4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind +declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr, + <4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind + +define <16 x double> @__gather_base_offsets32_double(i8 * %ptr, + i32 %scale, <16 x i32> %offsets, + <16 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask64 = sext <16 x i32> %mask32 to <16 x i64> + %vecmask = bitcast <16 x i64> %vecmask64 to <16 x double> + extract_4s(i32, offsets) + extract_4s(double, vecmask) + + %v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i32> %offsets_1, <4 x double> %vecmask_1, i8 %scale8) + %v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i32> %offsets_2, <4 x double> %vecmask_2, i8 %scale8) + %v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i32> %offsets_3, <4 x double> %vecmask_3, i8 %scale8) + %v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i32> %offsets_4, <4 x double> %vecmask_4, i8 %scale8) + + assemble_4s(double, v, v1, v2, v3, v4) + + ret <16 x double> %v +} + + +define <16 x double> @__gather_base_offsets64_double(i8 * %ptr, + i32 %scale, <16 x i64> %offsets, + <16 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask64 = sext <16 x i32> %mask32 to <16 x i64> + %vecmask = bitcast <16 x i64> %vecmask64 to <16 x double> + extract_4s(i64, offsets) + extract_4s(double, vecmask) + + %v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x double> %vecmask_1, i8 %scale8) + %v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> 
undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x double> %vecmask_2, i8 %scale8) + %v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i64> %offsets_3, <4 x double> %vecmask_3, i8 %scale8) + %v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i64> %offsets_4, <4 x double> %vecmask_4, i8 %scale8) + + assemble_4s(double, v, v1, v2, v3, v4) + + ret <16 x double> %v +} + + +define <16 x double> @__gather32_double(<16 x i32> %ptrs, + <16 x i32> %mask32) nounwind readonly alwaysinline { + %vecmask64 = sext <16 x i32> %mask32 to <16 x i64> + %vecmask = bitcast <16 x i64> %vecmask64 to <16 x double> + extract_4s(i32, ptrs) + extract_4s(double, vecmask) + + %v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null, + <4 x i32> %ptrs_1, <4 x double> %vecmask_1, i8 1) + %v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null, + <4 x i32> %ptrs_2, <4 x double> %vecmask_2, i8 1) + %v3 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null, + <4 x i32> %ptrs_3, <4 x double> %vecmask_3, i8 1) + %v4 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null, + <4 x i32> %ptrs_4, <4 x double> %vecmask_4, i8 1) + + assemble_4s(double, v, v1, v2, v3, v4) + + ret <16 x double> %v +} + + +define <16 x double> @__gather64_double(<16 x i64> %ptrs, + <16 x i32> %mask32) nounwind readonly alwaysinline { + %vecmask64 = sext <16 x i32> %mask32 to <16 x i64> + %vecmask = bitcast <16 x i64> %vecmask64 to <16 x double> + extract_4s(i64, ptrs) + extract_4s(double, vecmask) + + %v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x double> %vecmask_1, i8 1) + %v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x double> %vecmask_2, i8 1) + %v3 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null, + <4 x i64> %ptrs_3, <4 x double> %vecmask_3, i8 1) + %v4 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null, + <4 x i64> %ptrs_4, <4 x double> %vecmask_4, i8 1) + + assemble_4s(double, v, v1, v2, v3, v4) + + ret <16 x double> %v +} + +') diff --git a/builtins/target-avx2.ll b/builtins/target-avx2.ll index 7152657e..f4a0ee07 100644 --- a/builtins/target-avx2.ll +++ b/builtins/target-avx2.ll @@ -1,4 +1,4 @@ -;; Copyright (c) 2010-2011, Intel Corporation +;; Copyright (c) 2010-2012, Intel Corporation ;; All rights reserved. ;; ;; Redistribution and use in source and binary forms, with or without @@ -29,8 +29,16 @@ ;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +ifelse(LLVM_VERSION, `LLVM_3_0', `', + LLVM_VERSION, `LLVM_3_1', `', + `define(`HAVE_GATHER', `1')') + include(`target-avx.ll') +ifelse(LLVM_VERSION, `LLVM_3_0', `rdrand_decls()', + LLVM_VERSION, `LLVM_3_1', `rdrand_decls()', + `rdrand_definition()') + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; int min/max @@ -66,6 +74,9 @@ define <8 x i32> @__max_varying_uint32(<8 x i32>, <8 x i32>) nounwind readonly a ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float/half conversions +ifelse(LLVM_VERSION, `LLVM_3_0', ` +;; nothing to define... 
+', ` declare <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16>) nounwind readnone ; 0 is round nearest even declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readnone @@ -100,13 +111,323 @@ define i16 @__float_to_half_uniform(float %v) nounwind readnone { %r = extractelement <8 x i16> %rv, i32 0 ret i16 %r } +') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather +declare void @llvm.trap() noreturn nounwind + +define(`extract_4s', ` + %$2_1 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> + %$2_2 = shufflevector <8 x $1> %$2, <8 x $1> undef, <4 x i32> +') + +ifelse(LLVM_VERSION, `LLVM_3_0', ` +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double)', +LLVM_VERSION, `LLVM_3_1', ` +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double)', ` + gen_gather(i8) gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int32 gathers + +declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %target, i8 * %ptr, + <8 x i32> %indices, <8 x i32> %mask, i8 %scale) readonly nounwind +declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %target, i8 * %ptr, + <4 x i64> %indices, <4 x i32> %mask, i8 %scale) readonly nounwind + +define <8 x i32> @__gather_base_offsets32_i32(i8 * %ptr, + i32 %scale, <8 x i32> %offsets, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + + %v = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * %ptr, + <8 x i32> %offsets, <8 x i32> %vecmask, i8 %scale8) + + ret <8 x i32> %v +} + + +define <8 x i32> @__gather_base_offsets64_i32(i8 * %ptr, + i32 %scale, <8 x i64> %offsets, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + extract_4s(i32, vecmask) + extract_4s(i64, offsets) + + %v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x i32> %vecmask_1, i8 %scale8) + %v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x i32> %vecmask_2, i8 %scale8) + + %v = shufflevector <4 x i32> %v1, <4 x i32> %v2, + <8 x i32> + ret <8 x i32> %v +} + + +define <8 x i32> @__gather32_i32(<8 x i32> %ptrs, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + %v = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8 * null, + <8 x i32> %ptrs, <8 x i32> %vecmask, i8 1) + ret <8 x i32> %v +} + + +define <8 x i32> @__gather64_i32(<8 x i64> %ptrs, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + extract_4s(i64, ptrs) + extract_4s(i32, vecmask) + + %v1 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x i32> %vecmask_1, i8 1) + %v2 = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x i32> %vecmask_2, i8 1) + + %v = shufflevector <4 x i32> %v1, <4 x i32> %v2, + <8 x i32> + ret <8 x i32> %v +} + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; float gathers + +declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %target, i8 * %ptr, + <8 x i32> %indices, <8 x float> %mask, i8 %scale8) readonly nounwind +declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> 
%target, i8 * %ptr, + <4 x i64> %indices, <4 x float> %mask, i8 %scale8) readonly nounwind + +define <8 x float> @__gather_base_offsets32_float(i8 * %ptr, + i32 %scale, <8 x i32> %offsets, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %mask = bitcast <8 x i32> %vecmask to <8 x float> + + %v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * %ptr, + <8 x i32> %offsets, <8 x float> %mask, i8 %scale8) + + ret <8 x float> %v +} + + +define <8 x float> @__gather_base_offsets64_float(i8 * %ptr, + i32 %scale, <8 x i64> %offsets, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %mask = bitcast <8 x i32> %vecmask to <8 x float> + extract_4s(i64, offsets) + extract_4s(float, mask) + + %v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x float> %mask_1, i8 %scale8) + %v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x float> %mask_2, i8 %scale8) + + %v = shufflevector <4 x float> %v1, <4 x float> %v2, + <8 x i32> + ret <8 x float> %v +} + + +define <8 x float> @__gather32_float(<8 x i32> %ptrs, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + %mask = bitcast <8 x i32> %vecmask to <8 x float> + + %v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8 * null, + <8 x i32> %ptrs, <8 x float> %mask, i8 1) + + ret <8 x float> %v +} + + +define <8 x float> @__gather64_float(<8 x i64> %ptrs, + <8 x i32> %vecmask) nounwind readonly alwaysinline { + %mask = bitcast <8 x i32> %vecmask to <8 x float> + extract_4s(i64, ptrs) + extract_4s(float, mask) + + %v1 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x float> %mask_1, i8 1) + %v2 = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x float> %mask_2, i8 1) + + %v = shufflevector <4 x float> %v1, <4 x float> %v2, + <8 x i32> + ret <8 x float> %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; int64 gathers + +declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %target, i8 * %ptr, + <4 x i32> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind +declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %target, i8 * %ptr, + <4 x i64> %indices, <4 x i64> %mask, i8 %scale) readonly nounwind + +define <8 x i64> @__gather_base_offsets32_i64(i8 * %ptr, + i32 %scale, <8 x i32> %offsets, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = sext <8 x i32> %mask32 to <8 x i64> + extract_4s(i32, offsets) + extract_4s(i64, vecmask) + + %v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i32> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8) + %v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i32> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8) + + %v = shufflevector <4 x i64> %v1, <4 x i64> %v2, + <8 x i32> + ret <8 x i64> %v +} + + +define <8 x i64> @__gather_base_offsets64_i64(i8 * %ptr, + i32 %scale, <8 x i64> %offsets, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask = sext <8 x i32> %mask32 to <8 x i64> + extract_4s(i64, offsets) + extract_4s(i64, vecmask) + + %v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x i64> %vecmask_1, i8 %scale8) + 
%v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x i64> %vecmask_2, i8 %scale8) + + %v = shufflevector <4 x i64> %v1, <4 x i64> %v2, + <8 x i32> + ret <8 x i64> %v +} + + +define <8 x i64> @__gather32_i64(<8 x i32> %ptrs, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %vecmask = sext <8 x i32> %mask32 to <8 x i64> + + extract_4s(i32, ptrs) + extract_4s(i64, vecmask) + + %v1 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null, + <4 x i32> %ptrs_1, <4 x i64> %vecmask_1, i8 1) + %v2 = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8 * null, + <4 x i32> %ptrs_2, <4 x i64> %vecmask_2, i8 1) + %v = shufflevector <4 x i64> %v1, <4 x i64> %v2, + <8 x i32> + ret <8 x i64> %v +} + + +define <8 x i64> @__gather64_i64(<8 x i64> %ptrs, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %vecmask = sext <8 x i32> %mask32 to <8 x i64> + extract_4s(i64, ptrs) + extract_4s(i64, vecmask) + + %v1 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x i64> %vecmask_1, i8 1) + %v2 = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x i64> %vecmask_2, i8 1) + + %v = shufflevector <4 x i64> %v1, <4 x i64> %v2, + <8 x i32> + ret <8 x i64> %v +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; double gathers + +declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %target, i8 * %ptr, + <4 x i64> %indices, <4 x double> %mask, i8 %scale) readonly nounwind +declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %target, i8 * %ptr, + <4 x i32> %indices, <4 x double> %mask, i8 %scale) readonly nounwind + +define <8 x double> @__gather_base_offsets32_double(i8 * %ptr, + i32 %scale, <8 x i32> %offsets, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask64 = sext <8 x i32> %mask32 to <8 x i64> + %vecmask = bitcast <8 x i64> %vecmask64 to <8 x double> + extract_4s(i32, offsets) + extract_4s(double, vecmask) + + %v1 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i32> %offsets_1, <4 x double> %vecmask_1, i8 %scale8) + %v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i32> %offsets_2, <4 x double> %vecmask_2, i8 %scale8) + + %v = shufflevector <4 x double> %v1, <4 x double> %v2, + <8 x i32> + ret <8 x double> %v +} + +define <8 x double> @__gather_base_offsets64_double(i8 * %ptr, + i32 %scale, <8 x i64> %offsets, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %scale8 = trunc i32 %scale to i8 + %vecmask64 = sext <8 x i32> %mask32 to <8 x i64> + %vecmask = bitcast <8 x i64> %vecmask64 to <8 x double> + extract_4s(i64, offsets) + extract_4s(double, vecmask) + + %v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i64> %offsets_1, <4 x double> %vecmask_1, i8 %scale8) + %v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * %ptr, + <4 x i64> %offsets_2, <4 x double> %vecmask_2, i8 %scale8) + + %v = shufflevector <4 x double> %v1, <4 x double> %v2, + <8 x i32> + ret <8 x double> %v +} + +define <8 x double> @__gather32_double(<8 x i32> %ptrs, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %vecmask64 = sext <8 x i32> %mask32 to <8 x i64> + %vecmask = bitcast <8 x i64> %vecmask64 to <8 x double> + extract_4s(i32, ptrs) + extract_4s(double, vecmask) + + %v1 = call <4 x 
double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null, + <4 x i32> %ptrs_1, <4 x double> %vecmask_1, i8 1) + %v2 = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8 * null, + <4 x i32> %ptrs_2, <4 x double> %vecmask_2, i8 1) + + %v = shufflevector <4 x double> %v1, <4 x double> %v2, + <8 x i32> + ret <8 x double> %v +} + +define <8 x double> @__gather64_double(<8 x i64> %ptrs, + <8 x i32> %mask32) nounwind readonly alwaysinline { + %vecmask64 = sext <8 x i32> %mask32 to <8 x i64> + %vecmask = bitcast <8 x i64> %vecmask64 to <8 x double> + extract_4s(i64, ptrs) + extract_4s(double, vecmask) + + %v1 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null, + <4 x i64> %ptrs_1, <4 x double> %vecmask_1, i8 1) + %v2 = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8 * null, + <4 x i64> %ptrs_2, <4 x double> %vecmask_2, i8 1) + + %v = shufflevector <4 x double> %v1, <4 x double> %v2, + <8 x i32> + + ret <8 x double> %v +} + +') diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll old mode 100755 new mode 100644 index 5e82b4f1..c5937c8e --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -34,12 +34,12 @@ masked_load(double, 8) ; define these with the macros from stdlib.m4 -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) gen_scatter(i8) gen_scatter(i16) diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 48e7b836..7b4cfd9c 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -32,11 +32,15 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128-v16:16:16-v32:32:32"; define(`MASK',`i1') +define(`HAVE_GATHER',`1') +define(`HAVE_SCATTER',`1') + include(`util.m4') stdlib_core() scans() reduce_equal(WIDTH) +rdrand_decls() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; broadcast/rotate/shuffle @@ -334,19 +338,19 @@ define void @__masked_store_blend_double(* nocapture, ;; gather/scatter define(`gather_scatter', ` -declare @__gather_base_offsets32_$1(i8 * nocapture, , - i32, , ) nounwind readonly -declare @__gather_base_offsets64_$1(i8 * nocapture, , - i32, , ) nounwind readonly +declare @__gather_base_offsets32_$1(i8 * nocapture, i32, , + ) nounwind readonly +declare @__gather_base_offsets64_$1(i8 * nocapture, i32, , + ) nounwind readonly declare @__gather32_$1(, ) nounwind readonly declare @__gather64_$1(, ) nounwind readonly -declare void @__scatter_base_offsets32_$1(i8* nocapture, , - i32, , , ) nounwind -declare void @__scatter_base_offsets64_$1(i8* nocapture, , - i32, , , ) nounwind +declare void @__scatter_base_offsets32_$1(i8* nocapture, i32, , + , ) nounwind +declare void @__scatter_base_offsets64_$1(i8* nocapture, i32, , + , ) nounwind declare void @__scatter32_$1(, , ) nounwind declare void @__scatter64_$1(, , diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index e0b7f40c..c6a3afe2 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -33,6 +33,7 @@ ctlztz() define_prefetches() define_shuffles() aossoa() +rdrand_decls() 
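Stepping back to the AVX2 gather bodies added earlier in this diff: each llvm.x86.avx2.gather.* call lowers to a single VPGATHER instruction, which masks per lane on the sign bit of the mask operand and requires a compile-time 1/2/4/8 scale (hence the trunc of the i32 scale to i8, which the backend expects to fold to an immediate). A C-intrinsics sketch of the 8-wide i32 case, assuming <immintrin.h> and a hypothetical wrapper name:

#include <immintrin.h>   /* compile with -mavx2 */

/* One VPGATHERDD: lanes whose mask lane has its sign bit set load
   base[offsets[i]] (scale 4 treats offsets as element indices); the
   other lanes keep the src value, which is undef in the IR above. */
__m256i gather_base_offsets32_i32(const int *base, __m256i offsets,
                                  __m256i mask) {
    return _mm256_mask_i32gather_epi32(_mm256_undefined_si256(),
                                       base, offsets, mask, /*scale=*/4);
}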
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-sse2-x2.ll b/builtins/target-sse2-x2.ll index 0260971a..ad19f899 100644 --- a/builtins/target-sse2-x2.ll +++ b/builtins/target-sse2-x2.ll @@ -444,12 +444,12 @@ masked_load(double, 8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather/scatter -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) gen_scatter(i8) gen_scatter(i16) diff --git a/builtins/target-sse2.ll b/builtins/target-sse2.ll index 5f40d1eb..6558adc8 100644 --- a/builtins/target-sse2.ll +++ b/builtins/target-sse2.ll @@ -575,12 +575,12 @@ masked_load(double, 8) ; define these with the macros from stdlib.m4 -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) gen_scatter(i8) gen_scatter(i16) diff --git a/builtins/target-sse4-common.ll b/builtins/target-sse4-common.ll index 69461fcd..4b8751b5 100644 --- a/builtins/target-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -33,6 +33,7 @@ ctlztz() define_prefetches() define_shuffles() aossoa() +rdrand_decls() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/target-sse4-x2.ll b/builtins/target-sse4-x2.ll index ef3a7746..0f7cb355 100644 --- a/builtins/target-sse4-x2.ll +++ b/builtins/target-sse4-x2.ll @@ -371,12 +371,12 @@ masked_load(double, 8) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; gather/scatter -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) gen_scatter(i8) gen_scatter(i16) diff --git a/builtins/target-sse4.ll b/builtins/target-sse4.ll index ee57f6bd..b00bcfd6 100644 --- a/builtins/target-sse4.ll +++ b/builtins/target-sse4.ll @@ -474,12 +474,12 @@ masked_load(double, 8) ; define these with the macros from stdlib.m4 -gen_gather(i8) -gen_gather(i16) -gen_gather(i32) -gen_gather(float) -gen_gather(i64) -gen_gather(double) +gen_gather_factored(i8) +gen_gather_factored(i16) +gen_gather_factored(i32) +gen_gather_factored(float) +gen_gather_factored(i64) +gen_gather_factored(double) gen_scatter(i8) gen_scatter(i16) diff --git a/builtins/util.m4 b/builtins/util.m4 index ce25a761..a97336a7 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -1579,7 +1579,7 @@ declare void @__pseudo_masked_store_double( * nocapture, * nocapture, @__pseudo_gather32_i8(, ) nounwind readonly declare @__pseudo_gather32_i16(, ) nounwind readonly @@ -1621,31 +1606,106 @@ declare @__pseudo_gather64_float(, ) declare @__pseudo_gather64_i64(, ) nounwind readonly declare @__pseudo_gather64_double(, ) nounwind readonly -declare @__pseudo_gather_base_offsets32_i8(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_base_offsets32_i16(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_base_offsets32_i32(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_base_offsets32_float(i8 *, , i32, , - ) 
nounwind readonly -declare @__pseudo_gather_base_offsets32_i64(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_base_offsets32_double(i8 *, , i32, , - ) nounwind readonly +; The ImproveMemoryOps optimization pass finds these calls and then +; tries to convert them into calls to gather functions that take a uniform +; base pointer and then a varying integer offset, when possible. +; +; For targets without a native gather instruction, it is best to factor the +; integer offsets like "{1/2/4/8} * varying_offset + constant_offset", +; where varying_offset includes non-compile-time-constant values, and +; constant_offset includes compile-time constant values. (The scalar loads +; generated in turn can then take advantage of the free offsetting and scale by +; 1/2/4/8 that is offered by the x86 addressing modes.) +; +; varying int{8,16,32,float,64,double} +; __pseudo_gather_factored_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base, +; int{32,64} offsets, uniform int32 offset_scale, +; int{32,64} offset_delta, mask) +; +; For targets with a gather instruction, it is better to just factor them into +; a gather from a uniform base pointer and then "{1/2/4/8} * offsets", where the +; offsets are int32/64 vectors. +; +; varying int{8,16,32,float,64,double} +; __pseudo_gather_base_offsets{32,64}_{i8,i16,i32,float,i64,double}(uniform int8 *base, +; uniform int32 offset_scale, int{32,64} offsets, mask) -declare @__pseudo_gather_base_offsets64_i8(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_base_offsets64_i16(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_base_offsets64_i32(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_base_offsets64_float(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_base_offsets64_i64(i8 *, , i32, , - ) nounwind readonly -declare @__pseudo_gather_base_offsets64_double(i8 *, , i32, , - ) nounwind readonly + +declare +@__pseudo_gather_factored_base_offsets32_i8(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_i16(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_i32(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_float(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_i64(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets32_double(i8 *, , i32, , + ) nounwind readonly + +declare +@__pseudo_gather_factored_base_offsets64_i8(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_i16(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_i32(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_float(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_i64(i8 *, , i32, , + ) nounwind readonly +declare +@__pseudo_gather_factored_base_offsets64_double(i8 *, , i32, , + ) nounwind readonly + +declare +@__pseudo_gather_base_offsets32_i8(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_i16(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_i32(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_float(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_i64(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets32_double(i8 *, i32, , + ) nounwind readonly +
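To make the factoring above concrete: per lane, the factored form loads from base + offset_scale * varying_offset + offset_delta, which is exactly the shape of an x86 scaled-index addressing mode, so the scale and displacement cost nothing on the resulting scalar load. A single-lane C sketch (hypothetical name; the real builtins operate on whole vectors under a mask):

#include <stdint.h>

/* One lane of a factored gather: the multiply by 1/2/4/8 and the
   constant delta fold into the load's addressing mode, e.g.
   mov offset_delta(%base,%off,4), %eax. */
static inline int32_t gather_lane_i32(const uint8_t *base,
                                      int32_t varying_offset,
                                      int32_t offset_scale,
                                      int32_t offset_delta) {
    return *(const int32_t *)(base +
        (intptr_t)offset_scale * varying_offset + offset_delta);
}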
+declare +@__pseudo_gather_base_offsets64_i8(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_i16(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_i32(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_float(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_i64(i8 *, i32, , + ) nounwind readonly +declare +@__pseudo_gather_base_offsets64_double(i8 *, i32, , + ) nounwind readonly ; Similarly to the pseudo-gathers defined above, we also declare undefined ; pseudo-scatter instructions with signatures: @@ -1657,16 +1717,6 @@ declare @__pseudo_gather_base_offsets64_double(i8 *, , , ) nounwind declare void @__pseudo_scatter32_i16(, , ) nounwind @@ -1682,31 +1732,96 @@ declare void @__pseudo_scatter64_float(, , , , ) nounwind declare void @__pseudo_scatter64_double(, , ) nounwind -declare void @__pseudo_scatter_base_offsets32_i8(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_base_offsets32_i16(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_base_offsets32_i32(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_base_offsets32_float(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_base_offsets32_i64(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_base_offsets32_double(i8 * nocapture, , i32, , - , ) nounwind +; And the ImproveMemoryOps optimization pass also finds these and +; either transforms them to scatters like: +; +; void __pseudo_scatter_factored_base_offsets{32,64}_i8(uniform int8 *base, +; varying int32 offsets, uniform int32 offset_scale, +; varying int{32,64} offset_delta, varying int8 values, mask) +; (and similarly for 16/32/64 bit values) +; +; Or, if the target has a native scatter instruction: +; +; void __pseudo_scatter_base_offsets{32,64}_i8(uniform int8 *base, +; uniform int32 offset_scale, varying int{32,64} offsets, +; varying int8 values, mask) +; (and similarly for 16/32/64 bit values) -declare void @__pseudo_scatter_base_offsets64_i8(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_base_offsets64_i16(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_base_offsets64_i32(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_base_offsets64_float(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_base_offsets64_i64(i8 * nocapture, , i32, , - , ) nounwind -declare void @__pseudo_scatter_base_offsets64_double(i8 * nocapture, , i32, , - , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i8(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i16(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i32(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_float(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_i64(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets32_double(i8 * nocapture, , i32, , + , ) nounwind + +declare void +@__pseudo_scatter_factored_base_offsets64_i8(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_i16(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_i32(i8 * nocapture, , i32, , + , ) nounwind +declare void 
+@__pseudo_scatter_factored_base_offsets64_float(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_i64(i8 * nocapture, , i32, , + , ) nounwind +declare void +@__pseudo_scatter_factored_base_offsets64_double(i8 * nocapture, , i32, , + , ) nounwind + +declare void +@__pseudo_scatter_base_offsets32_i8(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_i16(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_i32(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_float(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_i64(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets32_double(i8 * nocapture, i32, , + , ) nounwind + +declare void +@__pseudo_scatter_base_offsets64_i8(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_i16(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_i32(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_float(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_i64(i8 * nocapture, i32, , + , ) nounwind +declare void +@__pseudo_scatter_base_offsets64_double(i8 * nocapture, i32, , + , ) nounwind declare float @__log_uniform_float(float) nounwind readnone declare @__log_varying_float() nounwind readnone @@ -1834,143 +1949,246 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, call void @__usedouble( %pg64_d) %g32_8 = call @__gather32_i8( %v32, - %mask) + %mask) call void @__use8( %g32_8) %g32_16 = call @__gather32_i16( %v32, - %mask) + %mask) call void @__use16( %g32_16) %g32_32 = call @__gather32_i32( %v32, - %mask) + %mask) call void @__use32( %g32_32) %g32_f = call @__gather32_float( %v32, - %mask) + %mask) call void @__usefloat( %g32_f) %g32_64 = call @__gather32_i64( %v32, - %mask) + %mask) call void @__use64( %g32_64) %g32_d = call @__gather32_double( %v32, - %mask) + %mask) call void @__usedouble( %g32_d) %g64_8 = call @__gather64_i8( %v64, - %mask) + %mask) call void @__use8( %g64_8) %g64_16 = call @__gather64_i16( %v64, - %mask) + %mask) call void @__use16( %g64_16) %g64_32 = call @__gather64_i32( %v64, - %mask) + %mask) call void @__use32( %g64_32) %g64_f = call @__gather64_float( %v64, - %mask) + %mask) call void @__usefloat( %g64_f) %g64_64 = call @__gather64_i64( %v64, - %mask) + %mask) call void @__use64( %g64_64) %g64_d = call @__gather64_double( %v64, - %mask) + %mask) call void @__usedouble( %g64_d) +ifelse(HAVE_GATHER, `1', +` + %nfpgbo32_8 = call + @__pseudo_gather_base_offsets32_i8(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use8( %nfpgbo32_8) + %nfpgbo32_16 = call + @__pseudo_gather_base_offsets32_i16(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use16( %nfpgbo32_16) + %nfpgbo32_32 = call + @__pseudo_gather_base_offsets32_i32(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use32( %nfpgbo32_32) + %nfpgbo32_f = call + @__pseudo_gather_base_offsets32_float(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usefloat( %nfpgbo32_f) + %nfpgbo32_64 = call + @__pseudo_gather_base_offsets32_i64(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use64( %nfpgbo32_64) + %nfpgbo32_d = call + @__pseudo_gather_base_offsets32_double(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usedouble( %nfpgbo32_d) + + %nfpgbo64_8 = call + @__pseudo_gather_base_offsets64_i8(i8 * %ptr, i32 0, + %v64, %mask) + call void 
@__use8( %nfpgbo64_8) + %nfpgbo64_16 = call + @__pseudo_gather_base_offsets64_i16(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use16( %nfpgbo64_16) + %nfpgbo64_32 = call + @__pseudo_gather_base_offsets64_i32(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use32( %nfpgbo64_32) + %nfpgbo64_f = call + @__pseudo_gather_base_offsets64_float(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usefloat( %nfpgbo64_f) + %nfpgbo64_64 = call + @__pseudo_gather_base_offsets64_i64(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use64( %nfpgbo64_64) + %nfpgbo64_d = call + @__pseudo_gather_base_offsets64_double(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usedouble( %nfpgbo64_d) + + %nfgbo32_8 = call + @__gather_base_offsets32_i8(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use8( %nfgbo32_8) + %nfgbo32_16 = call + @__gather_base_offsets32_i16(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use16( %nfgbo32_16) + %nfgbo32_32 = call + @__gather_base_offsets32_i32(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use32( %nfgbo32_32) + %nfgbo32_f = call + @__gather_base_offsets32_float(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usefloat( %nfgbo32_f) + %nfgbo32_64 = call + @__gather_base_offsets32_i64(i8 * %ptr, i32 0, + %v32, %mask) + call void @__use64( %nfgbo32_64) + %nfgbo32_d = call + @__gather_base_offsets32_double(i8 * %ptr, i32 0, + %v32, %mask) + call void @__usedouble( %nfgbo32_d) + + %nfgbo64_8 = call + @__gather_base_offsets64_i8(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use8( %nfgbo64_8) + %nfgbo64_16 = call + @__gather_base_offsets64_i16(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use16( %nfgbo64_16) + %nfgbo64_32 = call + @__gather_base_offsets64_i32(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use32( %nfgbo64_32) + %nfgbo64_f = call + @__gather_base_offsets64_float(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usefloat( %nfgbo64_f) + %nfgbo64_64 = call + @__gather_base_offsets64_i64(i8 * %ptr, i32 0, + %v64, %mask) + call void @__use64( %nfgbo64_64) + %nfgbo64_d = call + @__gather_base_offsets64_double(i8 * %ptr, i32 0, + %v64, %mask) + call void @__usedouble( %nfgbo64_d) +', +` %pgbo32_8 = call - @__pseudo_gather_base_offsets32_i8(i8 * %ptr, %v32, i32 0, + @__pseudo_gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use8( %pgbo32_8) %pgbo32_16 = call - @__pseudo_gather_base_offsets32_i16(i8 * %ptr, %v32, i32 0, + @__pseudo_gather_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use16( %pgbo32_16) %pgbo32_32 = call - @__pseudo_gather_base_offsets32_i32(i8 * %ptr, %v32, i32 0, + @__pseudo_gather_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use32( %pgbo32_32) %pgbo32_f = call - @__pseudo_gather_base_offsets32_float(i8 * %ptr, %v32, i32 0, + @__pseudo_gather_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__usefloat( %pgbo32_f) %pgbo32_64 = call - @__pseudo_gather_base_offsets32_i64(i8 * %ptr, %v32, i32 0, + @__pseudo_gather_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__use64( %pgbo32_64) %pgbo32_d = call - @__pseudo_gather_base_offsets32_double(i8 * %ptr, %v32, i32 0, + @__pseudo_gather_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, %mask) call void @__usedouble( %pgbo32_d) - %gbo32_8 = call - @__gather_base_offsets32_i8(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__use8( %gbo32_8) - %gbo32_16 = call - @__gather_base_offsets32_i16(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__use16( 
%gbo32_16) - %gbo32_32 = call - @__gather_base_offsets32_i32(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__use32( %gbo32_32) - %gbo32_f = call - @__gather_base_offsets32_float(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__usefloat( %gbo32_f) - %gbo32_64 = call - @__gather_base_offsets32_i64(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__use64( %gbo32_64) - %gbo32_d = call - @__gather_base_offsets32_double(i8 * %ptr, %v32, i32 0, - %v32, %mask) - call void @__usedouble( %gbo32_d) - - %pgbo64_8 = call - @__pseudo_gather_base_offsets64_i8(i8 * %ptr, %v64, i32 0, + @__pseudo_gather_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use8( %pgbo64_8) %pgbo64_16 = call - @__pseudo_gather_base_offsets64_i16(i8 * %ptr, %v64, i32 0, + @__pseudo_gather_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use16( %pgbo64_16) %pgbo64_32 = call - @__pseudo_gather_base_offsets64_i32(i8 * %ptr, %v64, i32 0, + @__pseudo_gather_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use32( %pgbo64_32) %pgbo64_f = call - @__pseudo_gather_base_offsets64_float(i8 * %ptr, %v64, i32 0, + @__pseudo_gather_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__usefloat( %pgbo64_f) %pgbo64_64 = call - @__pseudo_gather_base_offsets64_i64(i8 * %ptr, %v64, i32 0, + @__pseudo_gather_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__use64( %pgbo64_64) %pgbo64_d = call - @__pseudo_gather_base_offsets64_double(i8 * %ptr, %v64, i32 0, + @__pseudo_gather_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, %mask) call void @__usedouble( %pgbo64_d) + %gbo32_8 = call + @__gather_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use8( %gbo32_8) + %gbo32_16 = call + @__gather_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use16( %gbo32_16) + %gbo32_32 = call + @__gather_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use32( %gbo32_32) + %gbo32_f = call + @__gather_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__usefloat( %gbo32_f) + %gbo32_64 = call + @__gather_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__use64( %gbo32_64) + %gbo32_d = call + @__gather_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, + %v32, %mask) + call void @__usedouble( %gbo32_d) + %gbo64_8 = call - @__gather_base_offsets64_i8(i8 * %ptr, %v64, i32 0, - %v64, %mask) + @__gather_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, + %v64, %mask) call void @__use8( %gbo64_8) %gbo64_16 = call - @__gather_base_offsets64_i16(i8 * %ptr, %v64, i32 0, - %v64, %mask) + @__gather_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, + %v64, %mask) call void @__use16( %gbo64_16) %gbo64_32 = call - @__gather_base_offsets64_i32(i8 * %ptr, %v64, i32 0, - %v64, %mask) + @__gather_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, + %v64, %mask) call void @__use32( %gbo64_32) %gbo64_f = call - @__gather_base_offsets64_float(i8 * %ptr, %v64, i32 0, - %v64, %mask) + @__gather_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, + %v64, %mask) call void @__usefloat( %gbo64_f) %gbo64_64 = call - @__gather_base_offsets64_i64(i8 * %ptr, %v64, i32 0, - %v64, %mask) + @__gather_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, + %v64, %mask) call void @__use64( %gbo64_64) %gbo64_d = call - @__gather_base_offsets64_double(i8 * %ptr, %v64, i32 0, - %v64, 
%mask) - call void @__usedouble( %gbo64_d) + @__gather_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, + %v64, %mask) + call void @__usedouble( %pgbo64_d) +') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; scatters @@ -2003,61 +2221,118 @@ define void @__keep_funcs_live(i8 * %ptr, %v8, %v16, call void @__scatter64_i64( %v64, %v64, %mask) call void @__scatter64_double( %v64, %vd, %mask) - call void @__pseudo_scatter_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, +ifelse(HAVE_SCATTER, `1', +` + call void @__pseudo_scatter_base_offsets32_i8(i8 * %ptr, i32 0, %v32, %v8, %mask) - call void @__pseudo_scatter_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, + call void @__pseudo_scatter_base_offsets32_i16(i8 * %ptr, i32 0, %v32, %v16, %mask) - call void @__pseudo_scatter_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, + call void @__pseudo_scatter_base_offsets32_i32(i8 * %ptr, i32 0, %v32, %v32, %mask) - call void @__pseudo_scatter_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, + call void @__pseudo_scatter_base_offsets32_float(i8 * %ptr, i32 0, %v32, %vf, %mask) - call void @__pseudo_scatter_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, + call void @__pseudo_scatter_base_offsets32_i64(i8 * %ptr, i32 0, %v32, %v64, %mask) - call void @__pseudo_scatter_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, + call void @__pseudo_scatter_base_offsets32_double(i8 * %ptr, i32 0, %v32, %vd, %mask) - call void @__pseudo_scatter_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, + call void @__pseudo_scatter_base_offsets64_i8(i8 * %ptr, i32 0, %v64, %v8, %mask) - call void @__pseudo_scatter_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, + call void @__pseudo_scatter_base_offsets64_i16(i8 * %ptr, i32 0, %v64, %v16, %mask) - call void @__pseudo_scatter_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, + call void @__pseudo_scatter_base_offsets64_i32(i8 * %ptr, i32 0, %v64, %v32, %mask) - call void @__pseudo_scatter_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, + call void @__pseudo_scatter_base_offsets64_float(i8 * %ptr, i32 0, %v64, %vf, %mask) - call void @__pseudo_scatter_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, + call void @__pseudo_scatter_base_offsets64_i64(i8 * %ptr, i32 0, %v64, %v64, %mask) - call void @__pseudo_scatter_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, + call void @__pseudo_scatter_base_offsets64_double(i8 * %ptr, i32 0, %v64, %vd, %mask) - call void @__scatter_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, - %v8, %mask) - call void @__scatter_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, - %v16, %mask) - call void @__scatter_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, - %v32, %mask) - call void @__scatter_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, - %vf, %mask) - call void @__scatter_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, - %v64, %mask) - call void @__scatter_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, - %vd, %mask) + call void @__scatter_base_offsets32_i8(i8 * %ptr, i32 0, %v32, + %v8, %mask) + call void @__scatter_base_offsets32_i16(i8 * %ptr, i32 0, %v32, + %v16, %mask) + call void @__scatter_base_offsets32_i32(i8 * %ptr, i32 0, %v32, + %v32, %mask) + call void @__scatter_base_offsets32_float(i8 * %ptr, i32 0, %v32, + %vf, %mask) + call void @__scatter_base_offsets32_i64(i8 * %ptr, i32 0, %v32, + %v64, %mask) + call void @__scatter_base_offsets32_double(i8 * %ptr, i32 0, %v32, + %vd, %mask) - call void @__scatter_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, - %v8, %mask) - call void 
@__scatter_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, - %v16, %mask) - call void @__scatter_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, - %v32, %mask) - call void @__scatter_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, - %vf, %mask) - call void @__scatter_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, - %v64, %mask) - call void @__scatter_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, - %vd, %mask) + call void @__scatter_base_offsets64_i8(i8 * %ptr, i32 0, %v64, + %v8, %mask) + call void @__scatter_base_offsets64_i16(i8 * %ptr, i32 0, %v64, + %v16, %mask) + call void @__scatter_base_offsets64_i32(i8 * %ptr, i32 0, %v64, + %v32, %mask) + call void @__scatter_base_offsets64_float(i8 * %ptr, i32 0, %v64, + %vf, %mask) + call void @__scatter_base_offsets64_i64(i8 * %ptr, i32 0, %v64, + %v64, %mask) + call void @__scatter_base_offsets64_double(i8 * %ptr, i32 0, %v64, + %vd, %mask) +', +` + call void @__pseudo_scatter_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, + %v8, %mask) + call void @__pseudo_scatter_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, + %v16, %mask) + call void @__pseudo_scatter_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, + %v32, %mask) + call void @__pseudo_scatter_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, + %vf, %mask) + call void @__pseudo_scatter_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, + %v64, %mask) + call void @__pseudo_scatter_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, + %vd, %mask) + + call void @__pseudo_scatter_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, + %v8, %mask) + call void @__pseudo_scatter_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, + %v16, %mask) + call void @__pseudo_scatter_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, + %v32, %mask) + call void @__pseudo_scatter_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, + %vf, %mask) + call void @__pseudo_scatter_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, + %v64, %mask) + call void @__pseudo_scatter_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, + %vd, %mask) + + call void @__scatter_factored_base_offsets32_i8(i8 * %ptr, %v32, i32 0, %v32, + %v8, %mask) + call void @__scatter_factored_base_offsets32_i16(i8 * %ptr, %v32, i32 0, %v32, + %v16, %mask) + call void @__scatter_factored_base_offsets32_i32(i8 * %ptr, %v32, i32 0, %v32, + %v32, %mask) + call void @__scatter_factored_base_offsets32_float(i8 * %ptr, %v32, i32 0, %v32, + %vf, %mask) + call void @__scatter_factored_base_offsets32_i64(i8 * %ptr, %v32, i32 0, %v32, + %v64, %mask) + call void @__scatter_factored_base_offsets32_double(i8 * %ptr, %v32, i32 0, %v32, + %vd, %mask) + + call void @__scatter_factored_base_offsets64_i8(i8 * %ptr, %v64, i32 0, %v64, + %v8, %mask) + call void @__scatter_factored_base_offsets64_i16(i8 * %ptr, %v64, i32 0, %v64, + %v16, %mask) + call void @__scatter_factored_base_offsets64_i32(i8 * %ptr, %v64, i32 0, %v64, + %v32, %mask) + call void @__scatter_factored_base_offsets64_float(i8 * %ptr, %v64, i32 0, %v64, + %vf, %mask) + call void @__scatter_factored_base_offsets64_i64(i8 * %ptr, %v64, i32 0, %v64, + %v64, %mask) + call void @__scatter_factored_base_offsets64_double(i8 * %ptr, %v64, i32 0, %v64, + %vd, %mask) +') ret void } + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; vector ops @@ -3196,8 +3471,42 @@ pl_done: ;; ;; $1: scalar type for which to generate functions to do gathers +define(`gen_gather_general', 
` +; fully general 32-bit gather, takes array of pointers encoded as vector of i32s +define @__gather32_$1( %ptrs, + %vecmask) nounwind readonly alwaysinline { + %ret_ptr = alloca + per_lane(WIDTH, %vecmask, ` + %iptr_LANE_ID = extractelement %ptrs, i32 LANE + %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * + %val_LANE_ID = load $1 * %ptr_LANE_ID + %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE + store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID + ') + + %ret = load * %ret_ptr + ret %ret +} + +; fully general 64-bit gather, takes array of pointers encoded as vector of i32s +define @__gather64_$1( %ptrs, + %vecmask) nounwind readonly alwaysinline { + %ret_ptr = alloca + per_lane(WIDTH, %vecmask, ` + %iptr_LANE_ID = extractelement %ptrs, i32 LANE + %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * + %val_LANE_ID = load $1 * %ptr_LANE_ID + %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE + store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID + ') + + %ret = load * %ret_ptr + ret %ret +} +') + ; vec width, type -define(`gen_gather', ` +define(`gen_gather_factored', ` ;; Define the utility function to do the gather operation for a single element ;; of the type define @__gather_elt32_$1(i8 * %ptr, %offsets, i32 %offset_scale, @@ -3245,7 +3554,7 @@ define @__gather_elt64_$1(i8 * %ptr, %offsets, i32 %o } -define @__gather_base_offsets32_$1(i8 * %ptr, %offsets, i32 %offset_scale, +define @__gather_factored_base_offsets32_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing @@ -3276,7 +3585,7 @@ define @__gather_base_offsets32_$1(i8 * %ptr, %offset ret %ret`'eval(WIDTH-1) } -define @__gather_base_offsets64_$1(i8 * %ptr, %offsets, i32 %offset_scale, +define @__gather_factored_base_offsets64_$1(i8 * %ptr, %offsets, i32 %offset_scale, %offset_delta, %vecmask) nounwind readonly alwaysinline { ; We can be clever and avoid the per-lane stuff for gathers if we are willing @@ -3307,37 +3616,42 @@ define @__gather_base_offsets64_$1(i8 * %ptr, %offset ret %ret`'eval(WIDTH-1) } -; fully general 32-bit gather, takes array of pointers encoded as vector of i32s -define @__gather32_$1( %ptrs, - %vecmask) nounwind readonly alwaysinline { - %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` - %iptr_LANE_ID = extractelement %ptrs, i32 LANE - %ptr_LANE_ID = inttoptr i32 %iptr_LANE_ID to $1 * - %val_LANE_ID = load $1 * %ptr_LANE_ID - %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE - store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID - ') +gen_gather_general($1) +' +) - %ret = load * %ret_ptr - ret %ret +; vec width, type +define(`gen_gather', ` + +gen_gather_factored($1) + +define +@__gather_base_offsets32_$1(i8 * %ptr, i32 %offset_scale, + %offsets, + %vecmask) nounwind readonly alwaysinline { + %scale_vec = bitcast i32 %offset_scale to <1 x i32> + %smear_scale = shufflevector <1 x i32> %scale_vec, <1 x i32> undef, + < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > + %scaled_offsets = mul %smear_scale, %offsets + %v = call @__gather_factored_base_offsets32_$1(i8 * %ptr, %scaled_offsets, i32 1, + zeroinitializer, %vecmask) + ret %v } -; fully general 64-bit gather, takes array of pointers encoded as vector of i32s -define @__gather64_$1( %ptrs, - %vecmask) nounwind readonly alwaysinline { - %ret_ptr = alloca - per_lane(WIDTH, %vecmask, ` - %iptr_LANE_ID = extractelement %ptrs, i32 LANE - %ptr_LANE_ID = inttoptr i64 %iptr_LANE_ID to $1 * - %val_LANE_ID = load $1 * 
%ptr_LANE_ID - %store_ptr_LANE_ID = getelementptr * %ret_ptr, i32 0, i32 LANE - store $1 %val_LANE_ID, $1 * %store_ptr_LANE_ID - ') - - %ret = load * %ret_ptr - ret %ret +define +@__gather_base_offsets64_$1(i8 * %ptr, i32 %offset_scale, + %offsets, + %vecmask) nounwind readonly alwaysinline { + %scale64 = zext i32 %offset_scale to i64 + %scale_vec = bitcast i64 %scale64 to <1 x i64> + %smear_scale = shufflevector <1 x i64> %scale_vec, <1 x i64> undef, + < forloop(i, 1, eval(WIDTH-1), `i32 0, ') i32 0 > + %scaled_offsets = mul %smear_scale, %offsets + %v = call @__gather_factored_base_offsets64_$1(i8 * %ptr, %scaled_offsets, + i32 1, zeroinitializer, %vecmask) + ret %v } + ' ) @@ -3391,7 +3705,7 @@ define void @__scatter_elt64_$1(i8 * %ptr, %offsets, i32 %offset_s ret void } -define void @__scatter_base_offsets32_$1(i8* %base, %offsets, i32 %offset_scale, +define void @__scatter_factored_base_offsets32_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... @@ -3401,7 +3715,7 @@ define void @__scatter_base_offsets32_$1(i8* %base, %offsets, i32 ret void } -define void @__scatter_base_offsets64_$1(i8* %base, %offsets, i32 %offset_scale, +define void @__scatter_factored_base_offsets64_$1(i8* %base, %offsets, i32 %offset_scale, %offset_delta, %values, %mask) nounwind alwaysinline { ;; And use the `per_lane' macro to do all of the per-lane work for scatter... @@ -3437,3 +3751,48 @@ define void @__scatter64_$1( %ptrs, %values, ' ) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rdrand + +define(`rdrand_decls', ` +declare i1 @__rdrand_i16(i16 * nocapture) +declare i1 @__rdrand_i32(i32 * nocapture) +declare i1 @__rdrand_i64(i64 * nocapture) +') + +define(`rdrand_definition', ` +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; rdrand + +declare {i16, i32} @llvm.x86.rdrand.16() +declare {i32, i32} @llvm.x86.rdrand.32() +declare {i64, i32} @llvm.x86.rdrand.64() + +define i1 @__rdrand_i16(i16 * %ptr) { + %v = call {i16, i32} @llvm.x86.rdrand.16() + %v0 = extractvalue {i16, i32} %v, 0 + %v1 = extractvalue {i16, i32} %v, 1 + store i16 %v0, i16 * %ptr + %good = icmp ne i32 %v1, 0 + ret i1 %good +} + +define i1 @__rdrand_i32(i32 * %ptr) { + %v = call {i32, i32} @llvm.x86.rdrand.32() + %v0 = extractvalue {i32, i32} %v, 0 + %v1 = extractvalue {i32, i32} %v, 1 + store i32 %v0, i32 * %ptr + %good = icmp ne i32 %v1, 0 + ret i1 %good +} + +define i1 @__rdrand_i64(i64 * %ptr) { + %v = call {i64, i32} @llvm.x86.rdrand.64() + %v0 = extractvalue {i64, i32} %v, 0 + %v1 = extractvalue {i64, i32} %v, 1 + store i64 %v0, i64 * %ptr + %good = icmp ne i32 %v1, 0 + ret i1 %good +} +') diff --git a/docs/ispc.rst b/docs/ispc.rst index 7e671dbd..98250e39 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -140,6 +140,7 @@ Contents: * `Basic Math Functions`_ * `Transcendental Functions`_ * `Pseudo-Random Numbers`_ + * `Random Numbers`_ + `Output Functions`_ + `Assertions`_ @@ -2084,7 +2085,7 @@ can be declared: soa<8> Point pts[...]; -The in-memory layout of the ``Point``s has had the SOA transformation +The in-memory layout of the ``Point`` instances has had the SOA transformation applied, such that there are 8 ``x`` values in memory followed by 8 ``y`` values, and so forth. Here is the effective declaration of ``soa<8> Point``: @@ -2266,7 +2267,7 @@ based on C++'s ``new`` and ``delete`` operators: // use ptr... 
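+       // note: ptr is varying here, so each program instance holds its
+       // own pointer value, and the delete below frees one array per instance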
delete[] ptr; -In the above code, each program instance allocates its own ``count`-sized +In the above code, each program instance allocates its own ``count`` sized array of ``uniform int`` values, uses that memory, and then deallocates that memory. Uses of ``new`` and ``delete`` in ``ispc`` programs are serviced by corresponding calls the system C library's ``malloc()`` and @@ -2277,9 +2278,7 @@ analogous to the corresponding rules are for pointers (as described in `Pointer Types`_.) Specifically, if a specific rate qualifier isn't provided with the ``new`` expression, then the default is that a "varying" ``new`` is performed, where each program instance performs a unique -allocation. The allocated type, in turn, is by default ``uniform`` for -``varying`` ``new`` expressions, and ``varying`` for ``uniform`` new -expressions. +allocation. The allocated type, in turn, is by default ``uniform``. After a pointer has been deleted, it is illegal to access the memory it points to. However, that deletion happens on a per-program-instance basis. @@ -3457,6 +3456,40 @@ be used to get a pseudo-random ``float`` value. uniform unsigned int32 random(RNGState * uniform state) uniform float frandom(uniform RNGState * uniform state) + +Random Numbers +-------------- + +Some recent CPUs (including those based on the Intel(r) Ivy Bridge +micro-architecture), provide support for generating true random numbers. A +few standard library functions make this functionality available: + +:: + + bool rdrand(uniform int32 * uniform ptr) + bool rdrand(varying int32 * uniform ptr) + bool rdrand(uniform int32 * varying ptr) + +If the processor doesn't have sufficient entropy to generate a random +number, then this function fails and returns ``false``. Otherwise, if the +processor is successful, the random value is stored in the given pointer +and ``true`` is returned. Therefore, this function should generally be +used as follows, called repeatedly until it is successful: + +:: + + int r; + while (rdrand(&r) == false) + ; // empty loop body + + +In addition to the ``int32`` variants of ``rdrand()`` listed above, there +are versions that return ``int16``, ``float``, and ``int64`` values as +well. + +Note that when compiling to targets other than ``avx1.1`` and ``avx2``, the +``rdrand()`` functions always return ``false``. + Output Functions ---------------- @@ -3491,7 +3524,7 @@ generates the following output on a four-wide compilation target: :: i = 10, x = [0.000000,1.000000,2.000000,3.000000] - added to x = [1.000000,2.000000,((2.000000)),((3.000000)] + added to x = [1.000000,2.000000,((2.000000)),((3.000000))] last print of x = [1.000000,2.000000,2.000000,3.000000] When a varying variable is printed, the values for program instances that @@ -4010,8 +4043,8 @@ Systems Programming Support Atomic Operations and Memory Fences ----------------------------------- -The standard range of atomic memory operations are provided by the standard -library``ispc``, including variants to handle both uniform and varying +The standard set of atomic memory operations are provided by the standard +library, including variants to handle both uniform and varying types as well as "local" and "global" atomics. 
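+
+Returning briefly to the ``rdrand()`` functions described above: a
+host-side C++ analogue of the same retry idiom, built on the compiler
+intrinsic ``_rdrand32_step()``, is sketched below. (Illustration only:
+the helper name is made up here, and the sketch assumes an
+RDRAND-capable CPU and a compiler with RDRAND support enabled, e.g.
+``-mrdrnd`` for gcc/clang.)
+
+::
+
+    #include <immintrin.h>   // declares _rdrand32_step()
+
+    // Retry until the hardware DRNG has entropy available, mirroring
+    // the recommended usage of the ispc rdrand() stdlib function.
+    static unsigned int hw_random_u32() {
+        unsigned int r;
+        while (_rdrand32_step(&r) == 0)
+            ;   // empty loop body
+        return r;
+    }
+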
Local atomics provide atomic behavior across the program instances in a diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index 1851ff7e..42978701 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1307,15 +1307,13 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec16_d val, // offsets * offsetScale is in bytes (for all of these) #define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - __vec16_i1 mask) { \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec16_i1 mask) { \ VTYPE ret; \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 16; ++i) \ if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ ret.v[i] = *ptr; \ } \ return ret; \ @@ -1362,14 +1360,13 @@ GATHER_GENERAL(__vec16_d, double, __vec16_i64, __gather64_double) // scatter #define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - VTYPE val, __vec16_i1 mask) { \ +static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, VTYPE val, \ + __vec16_i1 mask) { \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 16; ++i) \ if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ *ptr = val.v[i]; \ } \ } diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 628aab84..94946f4a 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -1375,15 +1375,13 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec32_d val, // offsets * offsetScale is in bytes (for all of these) #define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - __vec32_i1 mask) { \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec32_i1 mask) { \ VTYPE ret; \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 32; ++i) \ if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ ret.v[i] = *ptr; \ } \ return ret; \ @@ -1430,14 +1428,12 @@ GATHER_GENERAL(__vec32_d, double, __vec32_i64, __gather64_double) // scatter #define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - VTYPE val, __vec32_i1 mask) { \ +static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, VTYPE val, __vec32_i1 mask) { \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 32; ++i) \ if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ *ptr = val.v[i]; \ } \ } diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index 2630e306..ff84fee3 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -1508,15 +1508,13 @@ static FORCEINLINE void 
__masked_store_blend_double(void *p, __vec64_d val, // offsets * offsetScale is in bytes (for all of these) #define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - __vec64_i1 mask) { \ +static FORCEINLINE VTYPE FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, __vec64_i1 mask) { \ VTYPE ret; \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 64; ++i) \ - if ((mask.v & (1ull << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + if ((mask.v & (1ull << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ ret.v[i] = *ptr; \ } \ return ret; \ @@ -1540,7 +1538,7 @@ GATHER_BASE_OFFSETS(__vec64_d, double, __vec64_i64, __gather_base_offsets64_doub static FORCEINLINE VTYPE FUNC(PTRTYPE ptrs, __vec64_i1 mask) { \ VTYPE ret; \ for (int i = 0; i < 64; ++i) \ - if ((mask.v & (1ull << i)) != 0) { \ + if ((mask.v & (1ull << i)) != 0) { \ STYPE *ptr = (STYPE *)ptrs.v[i]; \ ret.v[i] = *ptr; \ } \ @@ -1563,14 +1561,12 @@ GATHER_GENERAL(__vec64_d, double, __vec64_i64, __gather64_double) // scatter #define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) \ -static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - VTYPE val, __vec64_i1 mask) { \ +static FORCEINLINE void FUNC(unsigned char *b, uint32_t scale, \ + OTYPE offset, VTYPE val, __vec64_i1 mask) { \ int8_t *base = (int8_t *)b; \ for (int i = 0; i < 64; ++i) \ - if ((mask.v & (1ull << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ + if ((mask.v & (1ull << i)) != 0) { \ + STYPE *ptr = (STYPE *)(base + scale * offset.v[i]); \ *ptr = val.v[i]; \ } \ } diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index eceeb885..404cd24f 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -803,6 +803,13 @@ template <> static FORCEINLINE void __store<64>(__vec16_i32 *p, __vec16_i32 v) { // int64 +static FORCEINLINE __vec16_i64 __setzero_i64() { + __vec16_i64 ret; + ret.v_lo = _mm512_setzero_epi32(); + ret.v_hi = _mm512_setzero_epi32(); + return ret; +} + static FORCEINLINE __vec16_i64 __add(const __vec16_i64 &a, const __vec16_i64 &b) { __mmask16 carry = 0; @@ -878,7 +885,7 @@ static FORCEINLINE int64_t __extract_element(const __vec16_i64 &v, int index) return src[index+16] | (int64_t(src[index]) << 32); } -static FORCEINLINE __vec16_i64 __smear_i64(__vec16_i64, const int64_t &l) { +static FORCEINLINE __vec16_i64 __smear_i64(const int64_t &l) { const int *i = (const int*)&l; return __vec16_i64(_mm512_set_1to16_epi32(i[0]), _mm512_set_1to16_epi32(i[1])); } @@ -1373,6 +1380,11 @@ CAST(__vec16_i32, int32_t, __vec16_i16, int16_t, __cast_sext) CAST(__vec16_i32, int32_t, __vec16_i8, int8_t, __cast_sext) CAST(__vec16_i16, int16_t, __vec16_i8, int8_t, __cast_sext) +static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val) +{ + return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31)); +} + #define CAST_SEXT_I1(TYPE) /* static FORCEINLINE TYPE __cast_sext(TYPE, __vec16_i1 v) { \ @@ -1389,11 +1401,6 @@ CAST_SEXT_I1(__vec16_i8) CAST_SEXT_I1(__vec16_i16) CAST_SEXT_I1(__vec16_i32) -static FORCEINLINE __vec16_i64 __cast_sext(const __vec16_i64 &, const __vec16_i32 &val) -{ - return __vec16_i64(val.v,_mm512_srai_epi32(val.v,31)); -} - // zero extension CAST(__vec16_i64, uint64_t, __vec16_i32, uint32_t, __cast_zext) CAST(__vec16_i64, 
uint64_t, __vec16_i16, uint16_t, __cast_zext) @@ -1421,6 +1428,14 @@ CAST_ZEXT_I1(__vec16_i16) CAST_ZEXT_I1(__vec16_i32) CAST_ZEXT_I1(__vec16_i64) +static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val) +{ + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(1); + return _mm512_mask_mov_epi32(ret, val.m, one); +} + + // truncations CAST(__vec16_i32, int32_t, __vec16_i64, int64_t, __cast_trunc) CAST(__vec16_i16, int16_t, __vec16_i64, int64_t, __cast_trunc) @@ -1589,11 +1604,6 @@ CAST_BITS_SCALAR(double, int64_t) /////////////////////////////////////////////////////////////////////////// // various math functions -/* -static FORCEINLINE void __fastmath() { -} -*/ - static FORCEINLINE float __round_uniform_float(float v) { return roundf(v); } @@ -1659,14 +1669,25 @@ static FORCEINLINE __vec16_f __min_varying_float(__vec16_f v1, __vec16_f v2) { return _mm512_gmin_ps(v1, v2); } +static FORCEINLINE __vec16_i32 __max_varying_int32(__vec16_i32 v1, __vec16_i32 v2) { + return _mm512_max_epi32(v1, v2); +} + +static FORCEINLINE __vec16_i32 __min_varying_int32(__vec16_i32 v1, __vec16_i32 v2) { + return _mm512_min_epi32(v1, v2); +} + +static FORCEINLINE __vec16_i32 __max_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { + return _mm512_max_epu32(v1, v2); +} + +static FORCEINLINE __vec16_i32 __min_varying_uint32(__vec16_i32 v1, __vec16_i32 v2) { + return _mm512_min_epu32(v1, v2); +} + BINARY_OP_FUNC(__vec16_d, __max_varying_double, __max_uniform_double) BINARY_OP_FUNC(__vec16_d, __min_varying_double, __min_uniform_double) -BINARY_OP_FUNC(__vec16_i32, __max_varying_int32, __max_uniform_int32) -BINARY_OP_FUNC(__vec16_i32, __min_varying_int32, __min_uniform_int32) -BINARY_OP_FUNC(__vec16_i32, __max_varying_uint32, __max_uniform_uint32) -BINARY_OP_FUNC(__vec16_i32, __min_varying_uint32, __min_uniform_uint32) - BINARY_OP_FUNC(__vec16_i64, __max_varying_int64, __max_uniform_int64) BINARY_OP_FUNC(__vec16_i64, __min_varying_int64, __min_uniform_int64) BINARY_OP_FUNC(__vec16_i64, __max_varying_uint64, __max_uniform_uint64) @@ -1940,60 +1961,33 @@ static FORCEINLINE void __masked_store_blend_float(void *p, __vec16_f val, // offsets * offsetScale is in bytes (for all of these) -#define GATHER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) -/* -static FORCEINLINE VTYPE FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - __vec16_i1 mask) { \ - VTYPE ret; \ - int8_t *base = (int8_t *)b; \ - for (int i = 0; i < 16; ++i) \ - if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ - ret.v[i] = *ptr; \ - } \ - return ret; \ -} -*/ - static FORCEINLINE __vec16_i32 -__gather_base_offsets32_i32(uint8_t *base, __vec16_i32 varyingOffset, - uint32_t scale, __vec16_i32 constOffset, - __vec16_i1 mask) { - __vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE); - __vec16_i32 offsets = __add(__mul(vscale, varyingOffset), constOffset); - __vec16_i32 tmp; - - // Loop is generated by intrinsic +__gather_base_offsets32_i32(uint8_t *base, uint32_t scale, __vec16_i32 offsets, + __vec16_i1 mask) { + __vec16_i32 tmp = _mm512_undefined_epi32(); __vec16_i32 ret = _mm512_mask_i32extgather_epi32(tmp, mask, offsets, base, - _MM_UPCONV_EPI32_NONE, 1, + _MM_UPCONV_EPI32_NONE, scale, _MM_HINT_NONE); return ret; } static FORCEINLINE __vec16_f -__gather_base_offsets32_float(uint8_t *base, __vec16_i32 varyingOffset, - uint32_t scale, __vec16_i32 
constOffset, +__gather_base_offsets32_float(uint8_t *base, uint32_t scale, __vec16_i32 offsets, __vec16_i1 mask) { - __vec16_i32 vscale = _mm512_extload_epi32(&scale, _MM_UPCONV_EPI32_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE); - __vec16_i32 offsets = __add(__mul(vscale, varyingOffset), constOffset); - __vec16_f tmp; - - // Loop is generated by intrinsic - __vec16_f ret = _mm512_mask_i32extgather_ps(tmp, mask, offsets, base, - _MM_UPCONV_PS_NONE, 1, + __vec16_f tmp = _mm512_undefined_ps(); + __vec16_f ret = _mm512_mask_i32extgather_ps(tmp, mask, offsets, base, + _MM_UPCONV_PS_NONE, scale, _MM_HINT_NONE); return ret; } -GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) -GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) -GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) -GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) -GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) -GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) -GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) +//GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __gather_base_offsets32_i8) +//GATHER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __gather_base_offsets64_i8) +//GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __gather_base_offsets32_i16) +//GATHER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __gather_base_offsets64_i16) +//GATHER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __gather_base_offsets64_i32) +//GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __gather_base_offsets32_i64) +//GATHER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __gather_base_offsets64_i64) #define GATHER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) /* @@ -2039,47 +2033,43 @@ static FORCEINLINE __vec16_i32 __gather64_i32(__vec16_i64 ptrs, __vec16_i1 mask) */ // scatter -#define SCATTER_BASE_OFFSETS(VTYPE, STYPE, OTYPE, FUNC) -/* -static FORCEINLINE void FUNC(unsigned char *b, OTYPE varyingOffset, \ - uint32_t scale, OTYPE constOffset, \ - VTYPE val, __vec16_i1 mask) { \ - int8_t *base = (int8_t *)b; \ - for (int i = 0; i < 16; ++i) \ - if ((mask.v & (1 << i)) != 0) { \ - STYPE *ptr = (STYPE *)(base + scale * varyingOffset.v[i] + \ - constOffset.v[i]); \ - *ptr = val.v[i]; \ - } \ -} -*/ - -SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8) -SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8) -SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16) -SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16) -SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) -SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) -SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) +//SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i32, __scatter_base_offsets32_i8) +//SCATTER_BASE_OFFSETS(__vec16_i8, int8_t, __vec16_i64, __scatter_base_offsets64_i8) +//SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i32, __scatter_base_offsets32_i16) +//SCATTER_BASE_OFFSETS(__vec16_i16, int16_t, __vec16_i64, __scatter_base_offsets64_i16) +//SCATTER_BASE_OFFSETS(__vec16_i32, int32_t, __vec16_i64, __scatter_base_offsets64_i32) +//SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i32, __scatter_base_offsets32_i64) 
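+// For reference, the semantics of the single-offset-vector convention used
+// by the native scatters below, written as a scalar sketch (the scale
+// factor is applied via the scatter instruction's scale operand):
+//
+//     for (int i = 0; i < 16; ++i)
+//         if ((mask.v & (1 << i)) != 0)
+//             *(int32_t *)(base + scale * offsets.v[i]) = val.v[i];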
+//SCATTER_BASE_OFFSETS(__vec16_i64, int64_t, __vec16_i64, __scatter_base_offsets64_i64) static FORCEINLINE void -__scatter_base_offsets32_i32(uint8_t *b, __vec16_i32 varyingOffset, - uint32_t scale, __vec16_i32 constOffset, +__scatter_base_offsets32_i32(uint8_t *b, uint32_t scale, __vec16_i32 offsets, __vec16_i32 val, __vec16_i1 mask) { - __vec16_i32 offsets = __add(__mul(__vec16_i32(scale), varyingOffset), constOffset); - _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, _MM_DOWNCONV_EPI32_NONE, 1, _MM_HINT_NONE); + _mm512_mask_i32extscatter_epi32(b, mask, offsets, val, + _MM_DOWNCONV_EPI32_NONE, scale, + _MM_HINT_NONE); } static FORCEINLINE void -__scatter_base_offsets32_float(void *base, const __vec16_i32 &varyingOffset, - uint32_t scale, const __vec16_i32 &constOffset, - const __vec16_f &val, const __vec16_i1 mask) +__scatter_base_offsets32_float(void *base, uint32_t scale, __vec16_i32 offsets, + __vec16_f val, __vec16_i1 mask) { - __vec16_i32 offsets = __add(__mul(varyingOffset,__vec16_i32(scale)), constOffset); - _mm512_mask_i32extscatter_ps(base, mask, offsets, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_1, _MM_HINT_NONE); + _mm512_mask_i32extscatter_ps(base, mask, offsets, val, + _MM_DOWNCONV_PS_NONE, scale, + _MM_HINT_NONE); } +/* +static FORCEINLINE void +__scatter_base_offsets64_float(void *base, const __vec16_i64 &varyingOffset, + uint32_t scale, const __vec16_i64 &constOffset, + const __vec16_f &val, const __vec16_i1 mask) +{ + __vec16_i64 offsets = __add(__mul(varyingOffset,__vec16_i64(scale)), constOffset); + _mm512_mask_i64extscatter_ps(base, mask, offsets, val, _MM_DOWNCONV_PS_NONE, _MM_SCALE_1, _MM_HINT_NONE); +} +*/ + #define SCATTER_GENERAL(VTYPE, STYPE, PTRTYPE, FUNC) /* static FORCEINLINE void FUNC(PTRTYPE ptrs, VTYPE val, __vec16_i1 mask) { \ diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index fcc14618..17ab8f18 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -2892,54 +2892,53 @@ static FORCEINLINE void __masked_store_blend_double(void *p, __vec4_d val, template static FORCEINLINE RetVec -lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { +lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, uint32_t scale, + __vec4_i32 offsets, __vec4_i1 mask) { RetScalar r[4]; #if 1 // "Fast gather" trick... 
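+    // Zero the offsets of inactive lanes so that all four loads below can
+    // run unconditionally: disabled lanes simply (re)read the first element
+    // at the base pointer, which must therefore be readable.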
offsets = __select(mask, offsets, __setzero_i32()); - constOffset = __select(mask, constOffset, __setzero_i32()); - int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0); + int offset = scale * _mm_extract_epi32(offsets.v, 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; - offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1); + offset = scale * _mm_extract_epi32(offsets.v, 1); ptr = (RetScalar *)(p + offset); r[1] = *ptr; - offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2); + offset = scale * _mm_extract_epi32(offsets.v, 2); ptr = (RetScalar *)(p + offset); r[2] = *ptr; - offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3); + offset = scale * _mm_extract_epi32(offsets.v, 3); ptr = (RetScalar *)(p + offset); r[3] = *ptr; #else uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 0) + _mm_extract_epi32(constOffset.v, 0); + int offset = scale * _mm_extract_epi32(offsets.v, 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; } m = _mm_extract_ps(mask.v, 1); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 1) + _mm_extract_epi32(constOffset.v, 1); + int offset = scale * _mm_extract_epi32(offsets.v, 1); RetScalar *ptr = (RetScalar *)(p + offset); r[1] = *ptr; } m = _mm_extract_ps(mask.v, 2); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 2) + _mm_extract_epi32(constOffset.v, 2); + int offset = scale * _mm_extract_epi32(offsets.v, 2); RetScalar *ptr = (RetScalar *)(p + offset); r[2] = *ptr; } m = _mm_extract_ps(mask.v, 3); if (m != 0) { - int offset = scale * _mm_extract_epi32(offsets.v, 3) + _mm_extract_epi32(constOffset.v, 3); + int offset = scale * _mm_extract_epi32(offsets.v, 3); RetScalar *ptr = (RetScalar *)(p + offset); r[3] = *ptr; } @@ -2950,54 +2949,53 @@ lGatherBaseOffsets32(RetVec, RetScalar, unsigned char *p, __vec4_i32 offsets, template static FORCEINLINE RetVec -lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { +lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, uint32_t scale, + __vec4_i64 offsets, __vec4_i1 mask) { RetScalar r[4]; #if 1 // "Fast gather" trick... 
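+    // Same branch-free trick as the 32-bit variant above: force inactive
+    // lanes' offsets to zero, then load all four lanes unconditionally.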
offsets = __select(mask, offsets, __setzero_i64()); - constOffset = __select(mask, constOffset, __setzero_i64()); - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0); + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; - offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1); + offset = scale * _mm_extract_epi64(offsets.v[0], 1); ptr = (RetScalar *)(p + offset); r[1] = *ptr; - offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0); + offset = scale * _mm_extract_epi64(offsets.v[1], 0); ptr = (RetScalar *)(p + offset); r[2] = *ptr; - offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1); + offset = scale * _mm_extract_epi64(offsets.v[1], 1); ptr = (RetScalar *)(p + offset); r[3] = *ptr; #else uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + _mm_extract_epi64(constOffset.v[0], 0); + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); RetScalar *ptr = (RetScalar *)(p + offset); r[0] = *ptr; } m = _mm_extract_ps(mask.v, 1); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + _mm_extract_epi64(constOffset.v[0], 1); + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); RetScalar *ptr = (RetScalar *)(p + offset); r[1] = *ptr; } m = _mm_extract_ps(mask.v, 2); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + _mm_extract_epi64(constOffset.v[1], 0); + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); RetScalar *ptr = (RetScalar *)(p + offset); r[2] = *ptr; } m = _mm_extract_ps(mask.v, 3); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + _mm_extract_epi64(constOffset.v[1], 1); + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); RetScalar *ptr = (RetScalar *)(p + offset); r[3] = *ptr; } @@ -3007,87 +3005,75 @@ lGatherBaseOffsets64(RetVec, RetScalar, unsigned char *p, __vec4_i64 offsets, } static FORCEINLINE __vec4_i8 -__gather_base_offsets32_i8(unsigned char *b, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, offsets, scale, - constOffset, mask); +__gather_base_offsets32_i8(unsigned char *b, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_i8(), uint8_t(), b, scale, offsets, mask); } static FORCEINLINE __vec4_i8 -__gather_base_offsets64_i8(unsigned char *b, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, offsets, scale, - constOffset, mask); +__gather_base_offsets64_i8(unsigned char *b, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_i8(), uint8_t(), b, scale, offsets, mask); } static FORCEINLINE __vec4_i16 -__gather_base_offsets32_i16(unsigned char *b, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, offsets, scale, - constOffset, mask); +__gather_base_offsets32_i16(unsigned char *b, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_i16(), uint16_t(), b, scale, offsets, mask); } static FORCEINLINE __vec4_i16 - __gather_base_offsets64_i16(unsigned char *b, __vec4_i64 offsets, - uint32_t 
scale, __vec4_i64 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, offsets, scale, - constOffset, mask); +__gather_base_offsets64_i16(unsigned char *b, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_i16(), uint16_t(), b, scale, offsets, mask); } static FORCEINLINE __vec4_i32 -__gather_base_offsets32_i32(uint8_t *p, __vec4_i32 offsets, uint32_t scale, - __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_i32(), uint32_t(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets32_i32(uint8_t *p, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_i32(), uint32_t(), p, scale, offsets, mask); } static FORCEINLINE __vec4_i32 -__gather_base_offsets64_i32(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets64_i32(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_i32(), uint32_t(), p, scale, offsets, mask); } static FORCEINLINE __vec4_f -__gather_base_offsets32_float(uint8_t *p, __vec4_i32 offsets, uint32_t scale, - __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_f(), float(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets32_float(uint8_t *p, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_f(), float(), p, scale, offsets, mask); } static FORCEINLINE __vec4_f -__gather_base_offsets64_float(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets64(__vec4_f(), float(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets64_float(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_f(), float(), p, scale, offsets, mask); } static FORCEINLINE __vec4_i64 -__gather_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets32_i64(unsigned char *p, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_i64(), uint64_t(), p, scale, offsets, mask); } static FORCEINLINE __vec4_i64 -__gather_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets64_i64(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_i64(), uint64_t(), p, scale, offsets, mask); } static FORCEINLINE __vec4_d -__gather_base_offsets32_double(unsigned char *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i1 mask) { - return lGatherBaseOffsets32(__vec4_d(), double(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets32_double(unsigned char *p, uint32_t scale, __vec4_i32 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets32(__vec4_d(), double(), p, scale, offsets, mask); } static FORCEINLINE __vec4_d -__gather_base_offsets64_double(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_i1 mask) { - return 
lGatherBaseOffsets64(__vec4_d(), double(), p, offsets, scale, - constOffset, mask); +__gather_base_offsets64_double(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_i1 mask) { + return lGatherBaseOffsets64(__vec4_d(), double(), p, scale, offsets, mask); } template @@ -3252,63 +3238,55 @@ static FORCEINLINE __vec4_d __gather64_double(__vec4_i64 ptrs, __vec4_i1 mask) { #define SCATTER32_64(SUFFIX, VEC_SUFFIX, TYPE, EXTRACT) \ static FORCEINLINE void \ -__scatter_base_offsets32_##SUFFIX (unsigned char *b, __vec4_i32 offsets, \ - uint32_t scale, __vec4_i32 constOffset, \ +__scatter_base_offsets32_##SUFFIX (unsigned char *b, uint32_t scale, \ + __vec4_i32 offsets, \ __vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \ uint32_t m = _mm_extract_ps(mask.v, 0); \ if (m != 0) { \ - TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 0) + \ - _mm_extract_epi32(constOffset.v, 0)); \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 0)); \ *ptr = EXTRACT(val.v, 0); \ } \ m = _mm_extract_ps(mask.v, 1); \ if (m != 0) { \ - TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 1) + \ - _mm_extract_epi32(constOffset.v, 1)); \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 1)); \ *ptr = EXTRACT(val.v, 1); \ } \ m = _mm_extract_ps(mask.v, 2); \ if (m != 0) { \ - TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 2) + \ - _mm_extract_epi32(constOffset.v, 2)); \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 2)); \ *ptr = EXTRACT(val.v, 2); \ } \ m = _mm_extract_ps(mask.v, 3); \ if (m != 0) { \ - TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 3) + \ - _mm_extract_epi32(constOffset.v, 3)); \ + TYPE *ptr = (TYPE *)(b + scale * _mm_extract_epi32(offsets.v, 3)); \ *ptr = EXTRACT(val.v, 3); \ } \ } \ -static FORCEINLINE void \ -__scatter_base_offsets64_##SUFFIX(unsigned char *p, __vec4_i64 offsets, \ - uint32_t scale, __vec4_i64 constOffset, \ +static FORCEINLINE void \ +__scatter_base_offsets64_##SUFFIX(unsigned char *p, uint32_t scale, \ + __vec4_i64 offsets, \ __vec4_##VEC_SUFFIX val, __vec4_i1 mask) { \ uint32_t m = _mm_extract_ps(mask.v, 0); \ if (m != 0) { \ - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + \ - _mm_extract_epi64(constOffset.v[0], 0); \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); \ TYPE *ptr = (TYPE *)(p + offset); \ *ptr = EXTRACT(val.v, 0); \ } \ m = _mm_extract_ps(mask.v, 1); \ if (m != 0) { \ - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + \ - _mm_extract_epi64(constOffset.v[0], 1); \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); \ TYPE *ptr = (TYPE *)(p + offset); \ *ptr = EXTRACT(val.v, 1); \ } \ m = _mm_extract_ps(mask.v, 2); \ if (m != 0) { \ - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + \ - _mm_extract_epi64(constOffset.v[1], 0); \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); \ TYPE *ptr = (TYPE *)(p + offset); \ *ptr = EXTRACT(val.v, 2); \ } \ m = _mm_extract_ps(mask.v, 3); \ if (m != 0) { \ - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + \ - _mm_extract_epi64(constOffset.v[1], 1); \ + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); \ TYPE *ptr = (TYPE *)(p + offset); \ *ptr = EXTRACT(val.v, 3); \ } \ @@ -3322,91 +3300,79 @@ SCATTER32_64(float, f, float, _mm_extract_ps_as_float) static FORCEINLINE void -__scatter_base_offsets32_i64(unsigned char *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_i64 val, - __vec4_i1 mask) { - uint32_t m 
= _mm_extract_ps(mask.v, 0); - if (m != 0) { - int32_t offset = scale * _mm_extract_epi32(offsets.v, 0) + - _mm_extract_epi32(constOffset.v, 0); - uint64_t *ptr = (uint64_t *)(p + offset); - *ptr = _mm_extract_epi64(val.v[0], 0); - } - - m = _mm_extract_ps(mask.v, 1); - if (m != 0) { - int32_t offset = scale * _mm_extract_epi32(offsets.v, 1) + - _mm_extract_epi32(constOffset.v, 1); - uint64_t *ptr = (uint64_t *)(p + offset); - *ptr = _mm_extract_epi64(val.v[0], 1); - } - - m = _mm_extract_ps(mask.v, 2); - if (m != 0) { - int32_t offset = scale * _mm_extract_epi32(offsets.v, 2) + - _mm_extract_epi32(constOffset.v, 2); - uint64_t *ptr = (uint64_t *)(p + offset); - *ptr = _mm_extract_epi64(val.v[1], 0); - } - - m = _mm_extract_ps(mask.v, 3); - if (m != 0) { - int32_t offset = scale * _mm_extract_epi32(offsets.v, 3) + - _mm_extract_epi32(constOffset.v, 3); - uint64_t *ptr = (uint64_t *)(p + offset); - *ptr = _mm_extract_epi64(val.v[1], 1); - } -} - -static FORCEINLINE void -__scatter_base_offsets64_i64(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, +__scatter_base_offsets32_i64(unsigned char *p, uint32_t scale, __vec4_i32 offsets, __vec4_i64 val, __vec4_i1 mask) { uint32_t m = _mm_extract_ps(mask.v, 0); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0) + - _mm_extract_epi64(constOffset.v[0], 0); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 0); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[0], 0); } m = _mm_extract_ps(mask.v, 1); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1) + - _mm_extract_epi64(constOffset.v[0], 1); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 1); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[0], 1); } m = _mm_extract_ps(mask.v, 2); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0) + - _mm_extract_epi64(constOffset.v[1], 0); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 2); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[1], 0); } m = _mm_extract_ps(mask.v, 3); if (m != 0) { - int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1) + - _mm_extract_epi64(constOffset.v[1], 1); + int32_t offset = scale * _mm_extract_epi32(offsets.v, 3); uint64_t *ptr = (uint64_t *)(p + offset); *ptr = _mm_extract_epi64(val.v[1], 1); } } static FORCEINLINE void -__scatter_base_offsets32_double(unsigned char *p, __vec4_i32 offsets, - uint32_t scale, __vec4_i32 constOffset, __vec4_d val, - __vec4_i1 mask) { - __scatter_base_offsets32_i64(p, offsets, scale, constOffset, val, mask); +__scatter_base_offsets64_i64(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_i64 val, __vec4_i1 mask) { + uint32_t m = _mm_extract_ps(mask.v, 0); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 0); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = _mm_extract_epi64(val.v[0], 0); + } + + m = _mm_extract_ps(mask.v, 1); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[0], 1); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = _mm_extract_epi64(val.v[0], 1); + } + + m = _mm_extract_ps(mask.v, 2); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 0); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = _mm_extract_epi64(val.v[1], 0); + } + + m = _mm_extract_ps(mask.v, 3); + if (m != 0) { + int64_t offset = scale * _mm_extract_epi64(offsets.v[1], 1); + uint64_t *ptr = (uint64_t *)(p + offset); + *ptr = 
_mm_extract_epi64(val.v[1], 1); + } } static FORCEINLINE void -__scatter_base_offsets64_double(unsigned char *p, __vec4_i64 offsets, - uint32_t scale, __vec4_i64 constOffset, __vec4_d val, - __vec4_i1 mask) { - __scatter_base_offsets64_i64(p, offsets, scale, constOffset, val, mask); +__scatter_base_offsets32_double(unsigned char *p, uint32_t scale, __vec4_i32 offsets, + __vec4_d val, __vec4_i1 mask) { + __scatter_base_offsets32_i64(p, scale, offsets, val, mask); +} + +static FORCEINLINE void +__scatter_base_offsets64_double(unsigned char *p, uint32_t scale, __vec4_i64 offsets, + __vec4_d val, __vec4_i1 mask) { + __scatter_base_offsets64_i64(p, scale, offsets, val, mask); } diff --git a/ispc.cpp b/ispc.cpp index 2b98de86..fac83bbe 100644 --- a/ispc.cpp +++ b/ispc.cpp @@ -94,20 +94,22 @@ lGetSystemISA() { int info[4]; __cpuid(info, 1); - if ((info[2] & (1 << 28)) != 0) { - // AVX1 for sure. Do we have AVX2? - // Call cpuid with eax=7, ecx=0 - __cpuidex(info, 7, 0); - if ((info[1] & (1 << 5)) != 0) - return "avx2"; - else { - // ivybridge? - if ((info[2] & (1 << 29)) != 0 && // F16C - (info[2] & (1 << 30)) != 0) // RDRAND - return "avx1.1"; + if ((info[2] & (1 << 28)) != 0) { // AVX + // AVX1 for sure.... + // Ivy Bridge? + if ((info[2] & (1 << 29)) != 0 && // F16C + (info[2] & (1 << 30)) != 0) { // RDRAND + // So far, so good. AVX2? + // Call cpuid with eax=7, ecx=0 + int info2[4]; + __cpuidex(info2, 7, 0); + if ((info2[1] & (1 << 5)) != 0) + return "avx2"; else - return "avx"; + return "avx1.1"; } + // Regular AVX + return "avx"; } else if ((info[2] & (1 << 19)) != 0) return "sse4"; @@ -212,6 +214,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, // This is the case for most of them t->hasHalf = t->hasRand = t->hasTranscendentals = false; + t->hasGather = t->hasScatter = false; if (!strcasecmp(isa, "sse2")) { t->isa = Target::SSE2; @@ -253,6 +256,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskBitCount = 1; t->hasHalf = true; t->hasTranscendentals = true; + t->hasGather = t->hasScatter = true; } else if (!strcasecmp(isa, "generic-8")) { t->isa = Target::GENERIC; @@ -262,6 +266,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskBitCount = 1; t->hasHalf = true; t->hasTranscendentals = true; + t->hasGather = t->hasScatter = true; } else if (!strcasecmp(isa, "generic-16")) { t->isa = Target::GENERIC; @@ -271,6 +276,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskBitCount = 1; t->hasHalf = true; t->hasTranscendentals = true; + t->hasGather = t->hasScatter = true; } else if (!strcasecmp(isa, "generic-32")) { t->isa = Target::GENERIC; @@ -280,6 +286,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskBitCount = 1; t->hasHalf = true; t->hasTranscendentals = true; + t->hasGather = t->hasScatter = true; } else if (!strcasecmp(isa, "generic-64")) { t->isa = Target::GENERIC; @@ -289,6 +296,7 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskBitCount = 1; t->hasHalf = true; t->hasTranscendentals = true; + t->hasGather = t->hasScatter = true; } else if (!strcasecmp(isa, "generic-1")) { t->isa = Target::GENERIC; @@ -320,8 +328,14 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand"; t->maskingIsFree = false; t->maskBitCount = 32; +#if !defined(LLVM_3_0) + // LLVM 3.1+ only t->hasHalf = true; + #if !defined(LLVM_3_1) + // LLVM 3.2+ only t->hasRand = true; 
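+    // (rdrand support relies on LLVM's llvm.x86.rdrand.* intrinsics,
+    // which are only available in LLVM 3.2 and later, hence the guard)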
+ #endif +#endif } else if (!strcasecmp(isa, "avx1.1-x2")) { t->isa = Target::AVX11; @@ -330,8 +344,14 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->attributes = "+avx,+popcnt,+cmov,+f16c,+rdrand"; t->maskingIsFree = false; t->maskBitCount = 32; +#if !defined(LLVM_3_0) + // LLVM 3.1+ only t->hasHalf = true; + #if !defined(LLVM_3_1) + // LLVM 3.2+ only t->hasRand = true; + #endif +#endif } #ifndef LLVM_3_0 else if (!strcasecmp(isa, "avx2")) { @@ -342,7 +362,11 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskingIsFree = false; t->maskBitCount = 32; t->hasHalf = true; +#if !defined(LLVM_3_1) + // LLVM 3.2+ only t->hasRand = true; + t->hasGather = true; +#endif } else if (!strcasecmp(isa, "avx2-x2")) { t->isa = Target::AVX2; @@ -352,7 +376,11 @@ Target::GetTarget(const char *arch, const char *cpu, const char *isa, t->maskingIsFree = false; t->maskBitCount = 32; t->hasHalf = true; +#if !defined(LLVM_3_1) + // LLVM 3.2+ only t->hasRand = true; + t->hasGather = true; +#endif } #endif // !LLVM_3_0 else { diff --git a/ispc.h b/ispc.h index 9632f514..66191844 100644 --- a/ispc.h +++ b/ispc.h @@ -252,9 +252,15 @@ struct Target { conversions. */ bool hasHalf; - /** Indicates whether there is an ISA random number instruciton. */ + /** Indicates whether there is an ISA random number instruction. */ bool hasRand; + /** Indicates whether the target has a native gather instruction */ + bool hasGather; + + /** Indicates whether the target has a native scatter instruction */ + bool hasScatter; + /** Indicates whether the target has support for transcendentals (beyond sqrt, which we assume that all of them handle). */ bool hasTranscendentals; diff --git a/opt.cpp b/opt.cpp index 1456dfd7..a623466b 100644 --- a/opt.cpp +++ b/opt.cpp @@ -225,7 +225,6 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, } -#if 0 static llvm::Instruction * lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::Value *arg2, llvm::Value *arg3, const char *name, @@ -234,7 +233,6 @@ lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, llvm::ArrayRef newArgArray(&args[0], &args[4]); return llvm::CallInst::Create(func, newArgArray, name, insertBefore); } -#endif static llvm::Instruction * lCallInst(llvm::Function *func, llvm::Value *arg0, llvm::Value *arg1, @@ -449,6 +447,7 @@ Optimize(llvm::Module *module, int optLevel) { if (g->opt.disableGatherScatterOptimizations == false && g->target.vectorWidth > 1) { + optPM.add(llvm::createInstructionCombiningPass()); optPM.add(CreateImproveMemoryOpsPass()); } if (!g->opt.disableMaskAllOnOptimizations) { @@ -491,6 +490,7 @@ Optimize(llvm::Module *module, int optLevel) { if (g->opt.disableGatherScatterOptimizations == false && g->target.vectorWidth > 1) { + optPM.add(llvm::createInstructionCombiningPass()); optPM.add(CreateImproveMemoryOpsPass()); if (g->opt.disableCoalescing == false && @@ -509,6 +509,7 @@ Optimize(llvm::Module *module, int optLevel) { if (g->opt.disableGatherScatterOptimizations == false && g->target.vectorWidth > 1) { + optPM.add(llvm::createInstructionCombiningPass()); optPM.add(CreateImproveMemoryOpsPass()); } @@ -1673,6 +1674,39 @@ lOffsets32BitSafe(llvm::Value **variableOffsetPtr, } +/** Check to see if the single offset vector can safely be represented with + 32-bit values. If so, return true and update the pointed-to + llvm::Value * to be the 32-bit equivalent. 
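+
+    For example, an offset vector produced by "sext <N x i32> %v to
+    <N x i64>" can be replaced with %v directly, and a constant vector
+    whose elements all fit in 32 bits can simply be truncated.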
*/ +static bool +lOffsets32BitSafe(llvm::Value **offsetPtr, + llvm::Instruction *insertBefore) { + llvm::Value *offset = *offsetPtr; + + if (offset->getType() == LLVMTypes::Int32VectorType) + return true; + + llvm::SExtInst *sext = llvm::dyn_cast(offset); + if (sext != NULL && + sext->getOperand(0)->getType() == LLVMTypes::Int32VectorType) { + // sext of a 32-bit vector -> the 32-bit vector is good + *offsetPtr = sext->getOperand(0); + return true; + } + else if (lVectorIs32BitInts(offset)) { + // The only constant vector we should have here is a vector of + // all zeros (i.e. a ConstantAggregateZero, but just in case, + // do the more general check with lVectorIs32BitInts(). + *offsetPtr = + new llvm::TruncInst(offset, LLVMTypes::Int32VectorType, + LLVMGetName(offset, "_trunc"), + insertBefore); + return true; + } + else + return false; +} + + static bool lGSToGSBaseOffsets(llvm::CallInst *callInst) { struct GSInfo { @@ -1689,57 +1723,153 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { }; GSInfo gsFuncs[] = { - GSInfo("__pseudo_gather32_i8", "__pseudo_gather_base_offsets32_i8", - "__pseudo_gather_base_offsets32_i8", true), - GSInfo("__pseudo_gather32_i16", "__pseudo_gather_base_offsets32_i16", - "__pseudo_gather_base_offsets32_i16", true), - GSInfo("__pseudo_gather32_i32", "__pseudo_gather_base_offsets32_i32", - "__pseudo_gather_base_offsets32_i32", true), - GSInfo("__pseudo_gather32_float", "__pseudo_gather_base_offsets32_float", - "__pseudo_gather_base_offsets32_float", true), - GSInfo("__pseudo_gather32_i64", "__pseudo_gather_base_offsets32_i64", - "__pseudo_gather_base_offsets32_i64", true), - GSInfo("__pseudo_gather32_double", "__pseudo_gather_base_offsets32_double", - "__pseudo_gather_base_offsets32_double", true), + GSInfo("__pseudo_gather32_i8", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + true), + GSInfo("__pseudo_gather32_i16", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + true), + GSInfo("__pseudo_gather32_i32", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + true), + GSInfo("__pseudo_gather32_float", + g->target.hasGather ? "__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + g->target.hasGather ? "__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + true), + GSInfo("__pseudo_gather32_i64", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + true), + GSInfo("__pseudo_gather32_double", + g->target.hasGather ? "__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + g->target.hasGather ? 
"__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + true), - GSInfo("__pseudo_scatter32_i8", "__pseudo_scatter_base_offsets32_i8", - "__pseudo_scatter_base_offsets32_i8", false), - GSInfo("__pseudo_scatter32_i16", "__pseudo_scatter_base_offsets32_i16", - "__pseudo_scatter_base_offsets32_i16", false), - GSInfo("__pseudo_scatter32_i32", "__pseudo_scatter_base_offsets32_i32", - "__pseudo_scatter_base_offsets32_i32", false), - GSInfo("__pseudo_scatter32_float", "__pseudo_scatter_base_offsets32_float", - "__pseudo_scatter_base_offsets32_float", false), - GSInfo("__pseudo_scatter32_i64", "__pseudo_scatter_base_offsets32_i64", - "__pseudo_scatter_base_offsets32_i64", false), - GSInfo("__pseudo_scatter32_double", "__pseudo_scatter_base_offsets32_double", - "__pseudo_scatter_base_offsets32_double", false), + GSInfo("__pseudo_scatter32_i8", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + false), + GSInfo("__pseudo_scatter32_i16", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + false), + GSInfo("__pseudo_scatter32_i32", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + false), + GSInfo("__pseudo_scatter32_float", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + false), + GSInfo("__pseudo_scatter32_i64", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + false), + GSInfo("__pseudo_scatter32_double", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + false), - GSInfo("__pseudo_gather64_i8", "__pseudo_gather_base_offsets64_i8", - "__pseudo_gather_base_offsets32_i8", true), - GSInfo("__pseudo_gather64_i16", "__pseudo_gather_base_offsets64_i16", - "__pseudo_gather_base_offsets32_i16", true), - GSInfo("__pseudo_gather64_i32", "__pseudo_gather_base_offsets64_i32", - "__pseudo_gather_base_offsets32_i32", true), - GSInfo("__pseudo_gather64_float", "__pseudo_gather_base_offsets64_float", - "__pseudo_gather_base_offsets32_float", true), - GSInfo("__pseudo_gather64_i64", "__pseudo_gather_base_offsets64_i64", - "__pseudo_gather_base_offsets32_i64", true), - GSInfo("__pseudo_gather64_double", "__pseudo_gather_base_offsets64_double", - "__pseudo_gather_base_offsets32_double", true), + GSInfo("__pseudo_gather64_i8", + g->target.hasGather ? "__pseudo_gather_base_offsets64_i8" : + "__pseudo_gather_factored_base_offsets64_i8", + g->target.hasGather ? 
"__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + true), + GSInfo("__pseudo_gather64_i16", + g->target.hasGather ? "__pseudo_gather_base_offsets64_i16" : + "__pseudo_gather_factored_base_offsets64_i16", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + true), + GSInfo("__pseudo_gather64_i32", + g->target.hasGather ? "__pseudo_gather_base_offsets64_i32" : + "__pseudo_gather_factored_base_offsets64_i32", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + true), + GSInfo("__pseudo_gather64_float", + g->target.hasGather ? "__pseudo_gather_base_offsets64_float" : + "__pseudo_gather_factored_base_offsets64_float", + g->target.hasGather ? "__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + true), + GSInfo("__pseudo_gather64_i64", + g->target.hasGather ? "__pseudo_gather_base_offsets64_i64" : + "__pseudo_gather_factored_base_offsets64_i64", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + true), + GSInfo("__pseudo_gather64_double", + g->target.hasGather ? "__pseudo_gather_base_offsets64_double" : + "__pseudo_gather_factored_base_offsets64_double", + g->target.hasGather ? "__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + true), - GSInfo("__pseudo_scatter64_i8", "__pseudo_scatter_base_offsets64_i8", - "__pseudo_scatter_base_offsets32_i8", false), - GSInfo("__pseudo_scatter64_i16", "__pseudo_scatter_base_offsets64_i16", - "__pseudo_scatter_base_offsets32_i16", false), - GSInfo("__pseudo_scatter64_i32", "__pseudo_scatter_base_offsets64_i32", - "__pseudo_scatter_base_offsets32_i32", false), - GSInfo("__pseudo_scatter64_float", "__pseudo_scatter_base_offsets64_float", - "__pseudo_scatter_base_offsets32_float", false), - GSInfo("__pseudo_scatter64_i64", "__pseudo_scatter_base_offsets64_i64", - "__pseudo_scatter_base_offsets32_i64", false), - GSInfo("__pseudo_scatter64_double", "__pseudo_scatter_base_offsets64_double", - "__pseudo_scatter_base_offsets32_double", false), + GSInfo("__pseudo_scatter64_i8", + g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i8" : + "__pseudo_scatter_factored_base_offsets64_i8", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + false), + GSInfo("__pseudo_scatter64_i16", + g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i16" : + "__pseudo_scatter_factored_base_offsets64_i16", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + false), + GSInfo("__pseudo_scatter64_i32", + g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i32" : + "__pseudo_scatter_factored_base_offsets64_i32", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + false), + GSInfo("__pseudo_scatter64_float", + g->target.hasScatter ? "__pseudo_scatter_base_offsets64_float" : + "__pseudo_scatter_factored_base_offsets64_float", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + false), + GSInfo("__pseudo_scatter64_i64", + g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i64" : + "__pseudo_scatter_factored_base_offsets64_i64", + g->target.hasScatter ? 
"__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + false), + GSInfo("__pseudo_scatter64_double", + g->target.hasScatter ? "__pseudo_scatter_base_offsets64_double" : + "__pseudo_scatter_factored_base_offsets64_double", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + false), }; int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]); @@ -1771,25 +1901,6 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { // to the next instruction... return false; - // Try to decompose the offset vector into a compile time constant - // component and a varying component. The constant component is - // passed as a separate parameter to the gather/scatter functions, - // which in turn allows their implementations to end up emitting - // x86 instructions with constant offsets encoded in them. - llvm::Value *constOffset, *variableOffset; - lExtractConstantOffset(offsetVector, &constOffset, &variableOffset, - callInst); - if (constOffset == NULL) - constOffset = LLVMIntAsType(0, offsetVector->getType()); - if (variableOffset == NULL) - variableOffset = LLVMIntAsType(0, offsetVector->getType()); - - // See if the varying component is scaled by 2, 4, or 8. If so, - // extract that scale factor and rewrite variableOffset to remove - // it. (This also is pulled out so that we can match the scales by - // 2/4/8 offered by x86 addressing operators.) - llvm::Value *offsetScale = lExtractOffsetVector248Scale(&variableOffset); - // Cast the base pointer to a void *, since that's what the // __pseudo_*_base_offsets_* functions want. basePtr = new llvm::IntToPtrInst(basePtr, LLVMTypes::VoidPointerType, @@ -1798,43 +1909,107 @@ lGSToGSBaseOffsets(llvm::CallInst *callInst) { llvm::Function *gatherScatterFunc = info->baseOffsetsFunc; - // If we're doing 32-bit addressing on a 64-bit target, here we - // will see if we can call one of the 32-bit variants of the pseudo - // gather/scatter functions. - if (g->opt.force32BitAddressing && - lOffsets32BitSafe(&variableOffset, &constOffset, callInst)) { - gatherScatterFunc = info->baseOffsets32Func; - } + if ((info->isGather == true && g->target.hasGather) || + (info->isGather == false && g->target.hasScatter)) { + // See if the offsets are scaled by 2, 4, or 8. If so, + // extract that scale factor and rewrite the offsets to remove + // it. + llvm::Value *offsetScale = lExtractOffsetVector248Scale(&offsetVector); - if (info->isGather) { - llvm::Value *mask = callInst->getArgOperand(1); + // If we're doing 32-bit addressing on a 64-bit target, here we + // will see if we can call one of the 32-bit variants of the pseudo + // gather/scatter functions. + if (g->opt.force32BitAddressing && + lOffsets32BitSafe(&offsetVector, callInst)) { + gatherScatterFunc = info->baseOffsets32Func; + } - // Generate a new function call to the next pseudo gather - // base+offsets instruction. Note that we're passing a NULL - // llvm::Instruction to llvm::CallInst::Create; this means that - // the instruction isn't inserted into a basic block and that - // way we can then call ReplaceInstWithInst(). 
- llvm::Instruction *newCall = - lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, - constOffset, mask, callInst->getName().str().c_str(), - NULL); - lCopyMetadata(newCall, callInst); - llvm::ReplaceInstWithInst(callInst, newCall); + if (info->isGather) { + llvm::Value *mask = callInst->getArgOperand(1); + + // Generate a new function call to the next pseudo gather + // base+offsets instruction. Note that we're passing a NULL + // llvm::Instruction to llvm::CallInst::Create; this means that + // the instruction isn't inserted into a basic block and that + // way we can then call ReplaceInstWithInst(). + llvm::Instruction *newCall = + lCallInst(gatherScatterFunc, basePtr, offsetScale, offsetVector, + mask, callInst->getName().str().c_str(), + NULL); + lCopyMetadata(newCall, callInst); + llvm::ReplaceInstWithInst(callInst, newCall); + } + else { + llvm::Value *storeValue = callInst->getArgOperand(1); + llvm::Value *mask = callInst->getArgOperand(2); + + // Generate a new function call to the next pseudo scatter + // base+offsets instruction. See above for why passing NULL + // for the Instruction * is intended. + llvm::Instruction *newCall = + lCallInst(gatherScatterFunc, basePtr, offsetScale, + offsetVector, storeValue, mask, "", NULL); + lCopyMetadata(newCall, callInst); + llvm::ReplaceInstWithInst(callInst, newCall); + } } else { - llvm::Value *storeValue = callInst->getArgOperand(1); - llvm::Value *mask = callInst->getArgOperand(2); + // Try to decompose the offset vector into a compile time constant + // component and a varying component. The constant component is + // passed as a separate parameter to the gather/scatter functions, + // which in turn allows their implementations to end up emitting + // x86 instructions with constant offsets encoded in them. + llvm::Value *constOffset, *variableOffset; + lExtractConstantOffset(offsetVector, &constOffset, &variableOffset, + callInst); + if (constOffset == NULL) + constOffset = LLVMIntAsType(0, offsetVector->getType()); + if (variableOffset == NULL) + variableOffset = LLVMIntAsType(0, offsetVector->getType()); - // Generate a new function call to the next pseudo scatter - // base+offsets instruction. See above for why passing NULL - // for the Instruction * is intended. - llvm::Instruction *newCall = - lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, - constOffset, storeValue, mask, "", NULL); - lCopyMetadata(newCall, callInst); - llvm::ReplaceInstWithInst(callInst, newCall); + // See if the varying component is scaled by 2, 4, or 8. If so, + // extract that scale factor and rewrite variableOffset to remove + // it. (This also is pulled out so that we can match the scales by + // 2/4/8 offered by x86 addressing operators.) + llvm::Value *offsetScale = lExtractOffsetVector248Scale(&variableOffset); + + // If we're doing 32-bit addressing on a 64-bit target, here we + // will see if we can call one of the 32-bit variants of the pseudo + // gather/scatter functions. + if (g->opt.force32BitAddressing && + lOffsets32BitSafe(&variableOffset, &constOffset, callInst)) { + gatherScatterFunc = info->baseOffsets32Func; + } + + if (info->isGather) { + llvm::Value *mask = callInst->getArgOperand(1); + + // Generate a new function call to the next pseudo gather + // base+offsets instruction. Note that we're passing a NULL + // llvm::Instruction to llvm::CallInst::Create; this means that + // the instruction isn't inserted into a basic block and that + // way we can then call ReplaceInstWithInst(). 
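+                // As a concrete (illustrative) example: a gather of
+                // a[index].y, where y lives at byte offset 4 of an 8-byte
+                // struct, reaches this point decomposed as variableOffset
+                // = index, offsetScale = 8, and constOffset = 4, matching
+                // the base + 8*index + 4 shape of an x86 addressing mode.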
+ llvm::Instruction *newCall = + lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, + constOffset, mask, callInst->getName().str().c_str(), + NULL); + lCopyMetadata(newCall, callInst); + llvm::ReplaceInstWithInst(callInst, newCall); + } + else { + llvm::Value *storeValue = callInst->getArgOperand(1); + llvm::Value *mask = callInst->getArgOperand(2); + + // Generate a new function call to the next pseudo scatter + // base+offsets instruction. See above for why passing NULL + // for the Instruction * is intended. + llvm::Instruction *newCall = + lCallInst(gatherScatterFunc, basePtr, variableOffset, offsetScale, + constOffset, storeValue, mask, "", NULL); + lCopyMetadata(newCall, callInst); + llvm::ReplaceInstWithInst(callInst, newCall); + } } - return true; } @@ -1858,57 +2033,67 @@ lGSBaseOffsetsGetMoreConst(llvm::CallInst *callInst) { }; GSBOInfo gsFuncs[] = { - GSBOInfo("__pseudo_gather_base_offsets32_i8", - "__pseudo_gather_base_offsets32_i8", true), - GSBOInfo("__pseudo_gather_base_offsets32_i16", - "__pseudo_gather_base_offsets32_i16", true), - GSBOInfo("__pseudo_gather_base_offsets32_i32", - "__pseudo_gather_base_offsets32_i32", true), - GSBOInfo("__pseudo_gather_base_offsets32_float", - "__pseudo_gather_base_offsets32_float", true), - GSBOInfo("__pseudo_gather_base_offsets32_i64", - "__pseudo_gather_base_offsets32_i64", true), - GSBOInfo("__pseudo_gather_base_offsets32_double", - "__pseudo_gather_base_offsets32_double", true), + GSBOInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + true), + GSBOInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + true), + GSBOInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + true), + GSBOInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + g->target.hasGather ? "__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + true), + GSBOInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + g->target.hasGather ? "__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + true), + GSBOInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + g->target.hasGather ? 
"__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + true), - GSBOInfo( "__pseudo_scatter_base_offsets32_i8", - "__pseudo_scatter_base_offsets32_i8", false), - GSBOInfo("__pseudo_scatter_base_offsets32_i16", - "__pseudo_scatter_base_offsets32_i16", false), - GSBOInfo("__pseudo_scatter_base_offsets32_i32", - "__pseudo_scatter_base_offsets32_i32", false), - GSBOInfo("__pseudo_scatter_base_offsets32_float", - "__pseudo_scatter_base_offsets32_float", false), - GSBOInfo("__pseudo_scatter_base_offsets32_i64", - "__pseudo_scatter_base_offsets32_i64", false), - GSBOInfo("__pseudo_scatter_base_offsets32_double", - "__pseudo_scatter_base_offsets32_double", false), - - GSBOInfo( "__pseudo_gather_base_offsets64_i8", - "__pseudo_gather_base_offsets32_i8", true), - GSBOInfo("__pseudo_gather_base_offsets64_i16", - "__pseudo_gather_base_offsets32_i16", true), - GSBOInfo("__pseudo_gather_base_offsets64_i32", - "__pseudo_gather_base_offsets32_i32", true), - GSBOInfo("__pseudo_gather_base_offsets64_float", - "__pseudo_gather_base_offsets32_float", true), - GSBOInfo("__pseudo_gather_base_offsets64_i64", - "__pseudo_gather_base_offsets32_i64", true), - GSBOInfo("__pseudo_gather_base_offsets64_double", - "__pseudo_gather_base_offsets32_double", true), - - GSBOInfo( "__pseudo_scatter_base_offsets64_i8", - "__pseudo_scatter_base_offsets32_i8", false), - GSBOInfo("__pseudo_scatter_base_offsets64_i16", - "__pseudo_scatter_base_offsets32_i16", false), - GSBOInfo("__pseudo_scatter_base_offsets64_i32", - "__pseudo_scatter_base_offsets32_i32", false), - GSBOInfo("__pseudo_scatter_base_offsets64_float", - "__pseudo_scatter_base_offsets32_float", false), - GSBOInfo("__pseudo_scatter_base_offsets64_i64", - "__pseudo_scatter_base_offsets32_i64", false), - GSBOInfo("__pseudo_scatter_base_offsets64_double", - "__pseudo_scatter_base_offsets32_double", false), + GSBOInfo( g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + false), + GSBOInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + false), + GSBOInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + false), + GSBOInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + false), + GSBOInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + false), + GSBOInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + g->target.hasScatter ? 
"__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + false), }; int numGSFuncs = sizeof(gsFuncs) / sizeof(gsFuncs[0]); @@ -1991,6 +2176,26 @@ lComputeCommonPointer(llvm::Value *base, llvm::Value *offsets, } +static llvm::Constant * +lGetOffsetScaleVec(llvm::Value *offsetScale, llvm::Type *vecType) { + llvm::ConstantInt *offsetScaleInt = + llvm::dyn_cast(offsetScale); + Assert(offsetScaleInt != NULL); + uint64_t scaleValue = offsetScaleInt->getZExtValue(); + + std::vector scales; + for (int i = 0; i < g->target.vectorWidth; ++i) { + if (vecType == LLVMTypes::Int64VectorType) + scales.push_back(LLVMInt64(scaleValue)); + else { + Assert(vecType == LLVMTypes::Int32VectorType); + scales.push_back(LLVMInt32((int32_t)scaleValue)); + } + } + return llvm::ConstantVector::get(scales); +} + + /** After earlier optimization passes have run, we are sometimes able to determine that gathers/scatters are actually accessing memory in a more regular fashion and then change the operation to something simpler and @@ -2011,7 +2216,7 @@ lGSToLoadStore(llvm::CallInst *callInst) { struct GatherImpInfo { GatherImpInfo(const char *pName, const char *lmName, llvm::Type *st, int a) - : align(a) { + : align(a), isFactored(!g->target.hasGather) { pseudoFunc = m->module->getFunction(pName); loadMaskedFunc = m->module->getFunction(lmName); Assert(pseudoFunc != NULL && loadMaskedFunc != NULL); @@ -2022,39 +2227,52 @@ lGSToLoadStore(llvm::CallInst *callInst) { llvm::Function *loadMaskedFunc; llvm::Type *scalarType; const int align; + const bool isFactored; }; GatherImpInfo gInfo[] = { - GatherImpInfo("__pseudo_gather_base_offsets32_i8", "__masked_load_i8", - LLVMTypes::Int8Type, 1), - GatherImpInfo("__pseudo_gather_base_offsets32_i16", "__masked_load_i16", - LLVMTypes::Int16Type, 2), - GatherImpInfo("__pseudo_gather_base_offsets32_i32", "__masked_load_i32", - LLVMTypes::Int32Type, 4), - GatherImpInfo("__pseudo_gather_base_offsets32_float", "__masked_load_float", - LLVMTypes::FloatType, 4), - GatherImpInfo("__pseudo_gather_base_offsets32_i64", "__masked_load_i64", - LLVMTypes::Int64Type, 8), - GatherImpInfo("__pseudo_gather_base_offsets32_double", "__masked_load_double", - LLVMTypes::DoubleType, 8), - GatherImpInfo("__pseudo_gather_base_offsets64_i8", "__masked_load_i8", - LLVMTypes::Int8Type, 1), - GatherImpInfo("__pseudo_gather_base_offsets64_i16", "__masked_load_i16", - LLVMTypes::Int16Type, 2), - GatherImpInfo("__pseudo_gather_base_offsets64_i32", "__masked_load_i32", - LLVMTypes::Int32Type, 4), - GatherImpInfo("__pseudo_gather_base_offsets64_float", "__masked_load_float", - LLVMTypes::FloatType, 4), - GatherImpInfo("__pseudo_gather_base_offsets64_i64", "__masked_load_i64", - LLVMTypes::Int64Type, 8), - GatherImpInfo("__pseudo_gather_base_offsets64_double", "__masked_load_double", - LLVMTypes::DoubleType, 8) + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i8" : + "__pseudo_gather_factored_base_offsets32_i8", + "__masked_load_i8", LLVMTypes::Int8Type, 1), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i16" : + "__pseudo_gather_factored_base_offsets32_i16", + "__masked_load_i16", LLVMTypes::Int16Type, 2), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i32" : + "__pseudo_gather_factored_base_offsets32_i32", + "__masked_load_i32", LLVMTypes::Int32Type, 4), + GatherImpInfo(g->target.hasGather ? 
"__pseudo_gather_base_offsets32_float" : + "__pseudo_gather_factored_base_offsets32_float", + "__masked_load_float", LLVMTypes::FloatType, 4), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_i64" : + "__pseudo_gather_factored_base_offsets32_i64", + "__masked_load_i64", LLVMTypes::Int64Type, 8), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets32_double" : + "__pseudo_gather_factored_base_offsets32_double", + "__masked_load_double", LLVMTypes::DoubleType, 8), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets64_i8" : + "__pseudo_gather_factored_base_offsets64_i8", + "__masked_load_i8", LLVMTypes::Int8Type, 1), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets64_i16" : + "__pseudo_gather_factored_base_offsets64_i16", + "__masked_load_i16", LLVMTypes::Int16Type, 2), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets64_i32" : + "__pseudo_gather_factored_base_offsets64_i32", + "__masked_load_i32", LLVMTypes::Int32Type, 4), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets64_float" : + "__pseudo_gather_factored_base_offsets64_float", + "__masked_load_float", LLVMTypes::FloatType, 4), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets64_i64" : + "__pseudo_gather_factored_base_offsets64_i64", + "__masked_load_i64", LLVMTypes::Int64Type, 8), + GatherImpInfo(g->target.hasGather ? "__pseudo_gather_base_offsets64_double" : + "__pseudo_gather_factored_base_offsets64_double", + "__masked_load_double", LLVMTypes::DoubleType, 8), }; struct ScatterImpInfo { ScatterImpInfo(const char *pName, const char *msName, llvm::Type *vpt, int a) - : align(a) { + : align(a), isFactored(!g->target.hasScatter) { pseudoFunc = m->module->getFunction(pName); maskedStoreFunc = m->module->getFunction(msName); vecPtrType = vpt; @@ -2064,33 +2282,46 @@ lGSToLoadStore(llvm::CallInst *callInst) { llvm::Function *maskedStoreFunc; llvm::Type *vecPtrType; const int align; + const bool isFactored; }; ScatterImpInfo sInfo[] = { - ScatterImpInfo("__pseudo_scatter_base_offsets32_i8", "__pseudo_masked_store_i8", - LLVMTypes::Int8VectorPointerType, 1), - ScatterImpInfo("__pseudo_scatter_base_offsets32_i16", "__pseudo_masked_store_i16", - LLVMTypes::Int16VectorPointerType, 2), - ScatterImpInfo("__pseudo_scatter_base_offsets32_i32", "__pseudo_masked_store_i32", - LLVMTypes::Int32VectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_base_offsets32_float", "__pseudo_masked_store_float", - LLVMTypes::FloatVectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_base_offsets32_i64", "__pseudo_masked_store_i64", - LLVMTypes::Int64VectorPointerType, 8), - ScatterImpInfo("__pseudo_scatter_base_offsets32_double", "__pseudo_masked_store_double", - LLVMTypes::DoubleVectorPointerType, 8), - ScatterImpInfo("__pseudo_scatter_base_offsets64_i8", "__pseudo_masked_store_i8", - LLVMTypes::Int8VectorPointerType, 1), - ScatterImpInfo("__pseudo_scatter_base_offsets64_i16", "__pseudo_masked_store_i16", - LLVMTypes::Int16VectorPointerType, 2), - ScatterImpInfo("__pseudo_scatter_base_offsets64_i32", "__pseudo_masked_store_i32", - LLVMTypes::Int32VectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_base_offsets64_float", "__pseudo_masked_store_float", - LLVMTypes::FloatVectorPointerType, 4), - ScatterImpInfo("__pseudo_scatter_base_offsets64_i64", "__pseudo_masked_store_i64", - LLVMTypes::Int64VectorPointerType, 8), - ScatterImpInfo("__pseudo_scatter_base_offsets64_double", "__pseudo_masked_store_double", - 
LLVMTypes::DoubleVectorPointerType, 8) + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i8" : + "__pseudo_scatter_factored_base_offsets32_i8", + "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i16" : + "__pseudo_scatter_factored_base_offsets32_i16", + "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i32" : + "__pseudo_scatter_factored_base_offsets32_i32", + "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_float" : + "__pseudo_scatter_factored_base_offsets32_float", + "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_i64" : + "__pseudo_scatter_factored_base_offsets32_i64", + "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets32_double" : + "__pseudo_scatter_factored_base_offsets32_double", + "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i8" : + "__pseudo_scatter_factored_base_offsets64_i8", + "__pseudo_masked_store_i8", LLVMTypes::Int8VectorPointerType, 1), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i16" : + "__pseudo_scatter_factored_base_offsets64_i16", + "__pseudo_masked_store_i16", LLVMTypes::Int16VectorPointerType, 2), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i32" : + "__pseudo_scatter_factored_base_offsets64_i32", + "__pseudo_masked_store_i32", LLVMTypes::Int32VectorPointerType, 4), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets64_float" : + "__pseudo_scatter_factored_base_offsets64_float", + "__pseudo_masked_store_float", LLVMTypes::FloatVectorPointerType, 4), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets64_i64" : + "__pseudo_scatter_factored_base_offsets64_i64", + "__pseudo_masked_store_i64", LLVMTypes::Int64VectorPointerType, 8), + ScatterImpInfo(g->target.hasScatter ? "__pseudo_scatter_base_offsets64_double" : + "__pseudo_scatter_factored_base_offsets64_double", + "__pseudo_masked_store_double", LLVMTypes::DoubleVectorPointerType, 8), }; llvm::Function *calledFunc = callInst->getCalledFunction(); @@ -2118,34 +2349,45 @@ lGSToLoadStore(llvm::CallInst *callInst) { lGetSourcePosFromMetadata(callInst, &pos); llvm::Value *base = callInst->getArgOperand(0); - llvm::Value *varyingOffsets = callInst->getArgOperand(1); - llvm::Value *offsetScale = callInst->getArgOperand(2); - llvm::Value *constOffsets = callInst->getArgOperand(3); - llvm::Value *storeValue = (scatterInfo != NULL) ? callInst->getArgOperand(4) : NULL; - llvm::Value *mask = callInst->getArgOperand((gatherInfo != NULL) ? 
4 : 5); + llvm::Value *fullOffsets = NULL; + llvm::Value *storeValue = NULL; + llvm::Value *mask = NULL; - // Compute the full offset vector: offsetScale * varyingOffsets + constOffsets - llvm::ConstantInt *offsetScaleInt = - llvm::dyn_cast(offsetScale); - Assert(offsetScaleInt != NULL); - uint64_t scaleValue = offsetScaleInt->getZExtValue(); + if ((gatherInfo != NULL && gatherInfo->isFactored) || + (scatterInfo != NULL && scatterInfo->isFactored)) { + llvm::Value *varyingOffsets = callInst->getArgOperand(1); + llvm::Value *offsetScale = callInst->getArgOperand(2); + llvm::Value *constOffsets = callInst->getArgOperand(3); + if (scatterInfo) + storeValue = callInst->getArgOperand(4); + mask = callInst->getArgOperand((gatherInfo != NULL) ? 4 : 5); - std::vector scales; - for (int i = 0; i < g->target.vectorWidth; ++i) { - if (varyingOffsets->getType() == LLVMTypes::Int64VectorType) - scales.push_back(LLVMInt64(scaleValue)); - else - scales.push_back(LLVMInt32((int32_t)scaleValue)); + // Compute the full offset vector: offsetScale * varyingOffsets + constOffsets + llvm::Constant *offsetScaleVec = + lGetOffsetScaleVec(offsetScale, varyingOffsets->getType()); + + llvm::Value *scaledVarying = + llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec, + varyingOffsets, "scaled_varying", callInst); + fullOffsets = + llvm::BinaryOperator::Create(llvm::Instruction::Add, scaledVarying, + constOffsets, "varying+const_offsets", + callInst); } - llvm::Constant *offsetScaleVec = llvm::ConstantVector::get(scales); + else { + if (scatterInfo) + storeValue = callInst->getArgOperand(3); + mask = callInst->getArgOperand((gatherInfo != NULL) ? 3 : 4); - llvm::Value *scaledVarying = - llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec, - varyingOffsets, "scaled_varying", callInst); - llvm::Value *fullOffsets = - llvm::BinaryOperator::Create(llvm::Instruction::Add, scaledVarying, - constOffsets, "varying+const_offsets", - callInst); + llvm::Value *offsetScale = callInst->getArgOperand(1); + llvm::Value *offsets = callInst->getArgOperand(2); + llvm::Value *offsetScaleVec = + lGetOffsetScaleVec(offsetScale, offsets->getType()); + + fullOffsets = + llvm::BinaryOperator::Create(llvm::Instruction::Mul, offsetScaleVec, + offsets, "scaled_offsets", callInst); + } Debug(SourcePos(), "GSToLoadStore: %s.", fullOffsets->getName().str().c_str()); @@ -3354,10 +3596,10 @@ GatherCoalescePass::runOnBasicBlock(llvm::BasicBlock &bb) { DEBUG_START_PASS("GatherCoalescePass"); llvm::Function *gatherFuncs[] = { - m->module->getFunction("__pseudo_gather_base_offsets32_i32"), - m->module->getFunction("__pseudo_gather_base_offsets32_float"), - m->module->getFunction("__pseudo_gather_base_offsets64_i32"), - m->module->getFunction("__pseudo_gather_base_offsets64_float"), + m->module->getFunction("__pseudo_gather_factored_base_offsets32_i32"), + m->module->getFunction("__pseudo_gather_factored_base_offsets32_float"), + m->module->getFunction("__pseudo_gather_factored_base_offsets64_i32"), + m->module->getFunction("__pseudo_gather_factored_base_offsets64_float"), }; int nGatherFuncs = sizeof(gatherFuncs) / sizeof(gatherFuncs[0]); @@ -3367,7 +3609,7 @@ GatherCoalescePass::runOnBasicBlock(llvm::BasicBlock &bb) { for (llvm::BasicBlock::iterator iter = bb.begin(), e = bb.end(); iter != e; ++iter) { // Iterate over all of the instructions and look for calls to - // __pseudo_gather_base_offsets{32,64}_{i32,float} calls. + // __pseudo_gather_factored_base_offsets{32,64}_{i32,float} calls. 
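+        // Note that only the factored forms are matched here: targets
+        // with a native gather never emit them (lGSToGSBaseOffsets
+        // selects the unfactored pseudo functions instead), so this
+        // coalescing pass is effectively disabled when
+        // g->target.hasGather is set.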
llvm::CallInst *callInst = llvm::dyn_cast(&*iter); if (callInst == NULL) continue; @@ -3631,7 +3873,6 @@ lReplacePseudoGS(llvm::CallInst *callInst) { : isGather(ig) { pseudoFunc = m->module->getFunction(pName); actualFunc = m->module->getFunction(aName); - Assert(pseudoFunc != NULL && actualFunc != NULL); } llvm::Function *pseudoFunc; llvm::Function *actualFunc; @@ -3639,20 +3880,6 @@ lReplacePseudoGS(llvm::CallInst *callInst) { }; LowerGSInfo lgsInfo[] = { - LowerGSInfo("__pseudo_gather_base_offsets32_i8", "__gather_base_offsets32_i8", true), - LowerGSInfo("__pseudo_gather_base_offsets32_i16", "__gather_base_offsets32_i16", true), - LowerGSInfo("__pseudo_gather_base_offsets32_i32", "__gather_base_offsets32_i32", true), - LowerGSInfo("__pseudo_gather_base_offsets32_float", "__gather_base_offsets32_float", true), - LowerGSInfo("__pseudo_gather_base_offsets32_i64", "__gather_base_offsets32_i64", true), - LowerGSInfo("__pseudo_gather_base_offsets32_double", "__gather_base_offsets32_double", true), - - LowerGSInfo("__pseudo_gather_base_offsets64_i8", "__gather_base_offsets64_i8", true), - LowerGSInfo("__pseudo_gather_base_offsets64_i16", "__gather_base_offsets64_i16", true), - LowerGSInfo("__pseudo_gather_base_offsets64_i32", "__gather_base_offsets64_i32", true), - LowerGSInfo("__pseudo_gather_base_offsets64_float", "__gather_base_offsets64_float", true), - LowerGSInfo("__pseudo_gather_base_offsets64_i64", "__gather_base_offsets64_i64", true), - LowerGSInfo("__pseudo_gather_base_offsets64_double", "__gather_base_offsets64_double", true), - LowerGSInfo("__pseudo_gather32_i8", "__gather32_i8", true), LowerGSInfo("__pseudo_gather32_i16", "__gather32_i16", true), LowerGSInfo("__pseudo_gather32_i32", "__gather32_i32", true), @@ -3667,19 +3894,57 @@ lReplacePseudoGS(llvm::CallInst *callInst) { LowerGSInfo("__pseudo_gather64_i64", "__gather64_i64", true), LowerGSInfo("__pseudo_gather64_double", "__gather64_double", true), - LowerGSInfo("__pseudo_scatter_base_offsets32_i8", "__scatter_base_offsets32_i8", false), - LowerGSInfo("__pseudo_scatter_base_offsets32_i16", "__scatter_base_offsets32_i16", false), - LowerGSInfo("__pseudo_scatter_base_offsets32_i32", "__scatter_base_offsets32_i32", false), - LowerGSInfo("__pseudo_scatter_base_offsets32_float", "__scatter_base_offsets32_float", false), - LowerGSInfo("__pseudo_scatter_base_offsets32_i64", "__scatter_base_offsets32_i64", false), - LowerGSInfo("__pseudo_scatter_base_offsets32_double", "__scatter_base_offsets32_double", false), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_i8", + "__gather_factored_base_offsets32_i8", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_i16", + "__gather_factored_base_offsets32_i16", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_i32", + "__gather_factored_base_offsets32_i32", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_float", + "__gather_factored_base_offsets32_float", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_i64", + "__gather_factored_base_offsets32_i64", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets32_double", + "__gather_factored_base_offsets32_double", true), - LowerGSInfo("__pseudo_scatter_base_offsets64_i8", "__scatter_base_offsets64_i8", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_i16", "__scatter_base_offsets64_i16", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_i32", "__scatter_base_offsets64_i32", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_float", "__scatter_base_offsets64_float", false), - 
LowerGSInfo("__pseudo_scatter_base_offsets64_i64", "__scatter_base_offsets64_i64", false), - LowerGSInfo("__pseudo_scatter_base_offsets64_double", "__scatter_base_offsets64_double", false), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i8", + "__gather_factored_base_offsets64_i8", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i16", + "__gather_factored_base_offsets64_i16", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i32", + "__gather_factored_base_offsets64_i32", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_float", + "__gather_factored_base_offsets64_float", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_i64", + "__gather_factored_base_offsets64_i64", true), + LowerGSInfo("__pseudo_gather_factored_base_offsets64_double", + "__gather_factored_base_offsets64_double", true), + + LowerGSInfo("__pseudo_gather_base_offsets32_i8", + "__gather_base_offsets32_i8", true), + LowerGSInfo("__pseudo_gather_base_offsets32_i16", + "__gather_base_offsets32_i16", true), + LowerGSInfo("__pseudo_gather_base_offsets32_i32", + "__gather_base_offsets32_i32", true), + LowerGSInfo("__pseudo_gather_base_offsets32_float", + "__gather_base_offsets32_float", true), + LowerGSInfo("__pseudo_gather_base_offsets32_i64", + "__gather_base_offsets32_i64", true), + LowerGSInfo("__pseudo_gather_base_offsets32_double", + "__gather_base_offsets32_double", true), + + LowerGSInfo("__pseudo_gather_base_offsets64_i8", + "__gather_base_offsets64_i8", true), + LowerGSInfo("__pseudo_gather_base_offsets64_i16", + "__gather_base_offsets64_i16", true), + LowerGSInfo("__pseudo_gather_base_offsets64_i32", + "__gather_base_offsets64_i32", true), + LowerGSInfo("__pseudo_gather_base_offsets64_float", + "__gather_base_offsets64_float", true), + LowerGSInfo("__pseudo_gather_base_offsets64_i64", + "__gather_base_offsets64_i64", true), + LowerGSInfo("__pseudo_gather_base_offsets64_double", + "__gather_base_offsets64_double", true), LowerGSInfo("__pseudo_scatter32_i8", "__scatter32_i8", false), LowerGSInfo("__pseudo_scatter32_i16", "__scatter32_i16", false), @@ -3694,6 +3959,59 @@ lReplacePseudoGS(llvm::CallInst *callInst) { LowerGSInfo("__pseudo_scatter64_float", "__scatter64_float", false), LowerGSInfo("__pseudo_scatter64_i64", "__scatter64_i64", false), LowerGSInfo("__pseudo_scatter64_double", "__scatter64_double", false), + + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i8", + "__scatter_factored_base_offsets32_i8", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i16", + "__scatter_factored_base_offsets32_i16", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i32", + "__scatter_factored_base_offsets32_i32", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_float", + "__scatter_factored_base_offsets32_float", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_i64", + "__scatter_factored_base_offsets32_i64", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets32_double", + "__scatter_factored_base_offsets32_double", false), + + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i8", + "__scatter_factored_base_offsets64_i8", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i16", + "__scatter_factored_base_offsets64_i16", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i32", + "__scatter_factored_base_offsets64_i32", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_float", + "__scatter_factored_base_offsets64_float", false), + 
LowerGSInfo("__pseudo_scatter_factored_base_offsets64_i64", + "__scatter_factored_base_offsets64_i64", false), + LowerGSInfo("__pseudo_scatter_factored_base_offsets64_double", + "__scatter_factored_base_offsets64_double", false), + + + LowerGSInfo("__pseudo_scatter_base_offsets32_i8", + "__scatter_base_offsets32_i8", false), + LowerGSInfo("__pseudo_scatter_base_offsets32_i16", + "__scatter_base_offsets32_i16", false), + LowerGSInfo("__pseudo_scatter_base_offsets32_i32", + "__scatter_base_offsets32_i32", false), + LowerGSInfo("__pseudo_scatter_base_offsets32_float", + "__scatter_base_offsets32_float", false), + LowerGSInfo("__pseudo_scatter_base_offsets32_i64", + "__scatter_base_offsets32_i64", false), + LowerGSInfo("__pseudo_scatter_base_offsets32_double", + "__scatter_base_offsets32_double", false), + + LowerGSInfo("__pseudo_scatter_base_offsets64_i8", + "__scatter_base_offsets64_i8", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_i16", + "__scatter_base_offsets64_i16", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_i32", + "__scatter_base_offsets64_i32", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_float", + "__scatter_base_offsets64_float", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_i64", + "__scatter_base_offsets64_i64", false), + LowerGSInfo("__pseudo_scatter_base_offsets64_double", + "__scatter_base_offsets64_double", false), }; llvm::Function *calledFunc = callInst->getCalledFunction(); @@ -3709,6 +4027,7 @@ lReplacePseudoGS(llvm::CallInst *callInst) { if (info == NULL) return false; + Assert(info->actualFunc != NULL); // Get the source position from the metadata attached to the call // instruction so that we can issue PerformanceWarning()s below. @@ -3899,6 +4218,12 @@ bool MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { const char *names[] = { "__fast_masked_vload", + "__gather_factored_base_offsets32_i8", "__gather_factored_base_offsets32_i16", + "__gather_factored_base_offsets32_i32", "__gather_factored_base_offsets32_i64", + "__gather_factored_base_offsets32_float", "__gather_factored_base_offsets32_double", + "__gather_factored_base_offsets64_i8", "__gather_factored_base_offsets64_i16", + "__gather_factored_base_offsets64_i32", "__gather_factored_base_offsets64_i64", + "__gather_factored_base_offsets64_float", "__gather_factored_base_offsets64_double", "__gather_base_offsets32_i8", "__gather_base_offsets32_i16", "__gather_base_offsets32_i32", "__gather_base_offsets32_i64", "__gather_base_offsets32_float", "__gather_base_offsets32_double", @@ -3926,6 +4251,12 @@ MakeInternalFuncsStaticPass::runOnModule(llvm::Module &module) { "__masked_store_blend_i8", "__masked_store_blend_i16", "__masked_store_blend_i32", "__masked_store_blend_i64", "__masked_store_blend_float", "__masked_store_blend_double", + "__scatter_factored_base_offsets32_i8", "__scatter_factored_base_offsets32_i16", + "__scatter_factored_base_offsets32_i32", "__scatter_factored_base_offsets32_i64", + "__scatter_factored_base_offsets32_float", "__scatter_factored_base_offsets32_double", + "__scatter_factored_base_offsets64_i8", "__scatter_factored_base_offsets64_i16", + "__scatter_factored_base_offsets64_i32", "__scatter_factored_base_offsets64_i64", + "__scatter_factored_base_offsets64_float", "__scatter_factored_base_offsets64_double", "__scatter_base_offsets32_i8", "__scatter_base_offsets32_i16", "__scatter_base_offsets32_i32", "__scatter_base_offsets32_i64", "__scatter_base_offsets32_float", "__scatter_base_offsets32_double", diff --git a/run_tests.py 
b/run_tests.py index 0cb78ed6..ea53b432 100755 --- a/run_tests.py +++ b/run_tests.py @@ -265,11 +265,9 @@ def run_test(filename): gcc_arch = '-m64' gcc_isa="" - if options.target == 'sse2' or options.target == 'sse2-x2': - gcc_isa = '-msse3' - if options.target == 'sse4' or options.target == 'sse4-x2' or options.target == 'generic-4': + if options.target == 'generic-4': gcc_isa = '-msse4.2' - if options.target == 'avx' or options.target == 'avx-x2' or options.target == 'generic-8': + if options.target == 'generic-8': gcc_isa = '-mavx' if (options.target == 'generic-16' or options.target == 'generic-32' or options.target == 'generic-64') \ and (options.include_file.find("knc.h")!=-1 or options.include_file.find("knc2x.h")!=-1): diff --git a/stdlib.ispc b/stdlib.ispc index a7499930..3774c4a4 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -4068,3 +4068,188 @@ static inline void seed_rng(uniform RNGState * uniform state, static inline void fastmath() { __fastmath(); } + +/////////////////////////////////////////////////////////////////////////// +// rdrand + +static inline uniform bool rdrand(float * uniform ptr) { + if (__have_native_rand == false) + return false; + else { + uniform int32 irand; + uniform bool success = __rdrand_i32(&irand); + if (success) { + irand &= (1<<23)-1; + *ptr = floatbits(0x3F800000 | irand)-1.0f; + } + return success; + } +} + +static inline bool rdrand(varying float * uniform ptr) { + if (__have_native_rand == false) + return false; + else { + bool success = false; + foreach_active (index) { + uniform int32 irand; + if (__rdrand_i32(&irand)) { + // FIXME: it probably would be preferable, here and in the + // following rdrand() function, to do the int->float stuff + // in vector form. However, we need to be careful to not + // clobber any existing already-set values in *ptr with + // inactive lanes here... 
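+                // The bit trick below: 0x3F800000 is the bit pattern of
+                // 1.0f, so OR'ing in 23 random mantissa bits produces a
+                // float uniformly distributed in [1,2); subtracting 1.0
+                // then maps it to [0,1).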
+                // Keep 23 random bits as the mantissa and OR in an exponent
+                // of zero (0x3F800000 == 1.0f), giving a float in [1,2);
+                // subtracting one maps it to [0,1).
+                irand &= (1<<23)-1;
+                *ptr = floatbits(0x3F800000 | irand)-1.0f;
+                success = true;
+            }
+        }
+        return success;
+    }
+}
+
+static inline bool rdrand(float * ptr) {
+    if (__have_native_rand == false)
+        return false;
+    else {
+        float * uniform ptrs[programCount];
+        ptrs[programIndex] = ptr;
+
+        bool success = false;
+        foreach_active (index) {
+            uniform int32 irand;
+            if (__rdrand_i32(&irand)) {
+                irand &= (1<<23)-1;
+                *ptrs[index] = floatbits(0x3F800000 | irand)-1.0f;
+                success = true;
+            }
+        }
+        return success;
+    }
+}
+
+static inline uniform bool rdrand(int16 * uniform ptr) {
+    if (__have_native_rand == false)
+        return false;
+    else
+        return __rdrand_i16(ptr);
+}
+
+static inline bool rdrand(varying int16 * uniform ptr) {
+    if (__have_native_rand == false)
+        return false;
+    else {
+        bool success = false;
+        foreach_active (index) {
+            uniform int16 irand;
+            if (__rdrand_i16(&irand)) {
+                *ptr = irand;
+                success = true;
+            }
+        }
+        return success;
+    }
+}
+
+static inline bool rdrand(int16 * ptr) {
+    if (__have_native_rand == false)
+        return false;
+    else {
+        int16 * uniform ptrs[programCount];
+        ptrs[programIndex] = ptr;
+        bool success = false;
+
+        foreach_active (index) {
+            uniform int16 irand;
+            if (__rdrand_i16(&irand)) {
+                *ptrs[index] = irand;
+                success = true;
+            }
+        }
+        return success;
+    }
+}
+
+static inline uniform bool rdrand(int32 * uniform ptr) {
+    if (__have_native_rand == false)
+        return false;
+    else
+        return __rdrand_i32(ptr);
+}
+
+static inline bool rdrand(varying int32 * uniform ptr) {
+    if (__have_native_rand == false)
+        return false;
+    else {
+        bool success = false;
+        foreach_active (index) {
+            uniform int32 irand;
+            if (__rdrand_i32(&irand)) {
+                *ptr = irand;
+                success = true;
+            }
+        }
+        return success;
+    }
+}
+
+static inline bool rdrand(int32 * ptr) {
+    if (__have_native_rand == false)
+        return false;
+    else {
+        int32 * uniform ptrs[programCount];
+        ptrs[programIndex] = ptr;
+        bool success = false;
+
+        foreach_active (index) {
+            uniform int32 irand;
+            if (__rdrand_i32(&irand)) {
+                *ptrs[index] = irand;
+                success = true;
+            }
+        }
+        return success;
+    }
+}
+
+static inline uniform bool rdrand(int64 * uniform ptr) {
+    if (__have_native_rand == false)
+        return false;
+    else
+        return __rdrand_i64(ptr);
+}
+
+static inline bool rdrand(varying int64 * uniform ptr) {
+    if (__have_native_rand == false)
+        return false;
+    else {
+        bool success = false;
+        foreach_active (index) {
+            uniform int64 irand;
+            if (__rdrand_i64(&irand)) {
+                *ptr = irand;
+                success = true;
+            }
+        }
+        return success;
+    }
+}
+
+static inline bool rdrand(int64 * ptr) {
+    if (__have_native_rand == false)
+        return false;
+    else {
+        int64 * uniform ptrs[programCount];
+        ptrs[programIndex] = ptr;
+        bool success = false;
+
+        foreach_active (index) {
+            uniform int64 irand;
+            if (__rdrand_i64(&irand)) {
+                *ptrs[index] = irand;
+                success = true;
+            }
+        }
+        return success;
+    }
+}
diff --git a/tests/gather-double-1.ispc b/tests/gather-double-1.ispc
new file mode 100644
index 00000000..64575545
--- /dev/null
+++ b/tests/gather-double-1.ispc
@@ -0,0 +1,17 @@
+
+export uniform int width() { return programCount; }
+
+int zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform double a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = a[programIndex+zero];
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/gather-double-2.ispc b/tests/gather-double-2.ispc
new file mode 100644
index 00000000..78b9423a
--- /dev/null
+++ b/tests/gather-double-2.ispc
@@ -0,0 +1,17 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform double a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = a[programIndex+zero];
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/gather-double-3.ispc b/tests/gather-double-3.ispc
new file mode 100644
index 00000000..cfa32f21
--- /dev/null
+++ b/tests/gather-double-3.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+int zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform double a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    if (programIndex < 2)
+        g = a[programIndex+zero];
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/gather-double-4.ispc b/tests/gather-double-4.ispc
new file mode 100644
index 00000000..d7ad2f5e
--- /dev/null
+++ b/tests/gather-double-4.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform double a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    if (programIndex < 2)
+        g = a[programIndex+zero];
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/gather-double-5.ispc b/tests/gather-double-5.ispc
new file mode 100644
index 00000000..3b97816a
--- /dev/null
+++ b/tests/gather-double-5.ispc
@@ -0,0 +1,19 @@
+
+export uniform int width() { return programCount; }
+
+int zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform double a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    double *ptr = (aFOO[0] == 1234) ? (double * varying)gptr : (a + programIndex);
+    int g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/gather-double-6.ispc b/tests/gather-double-6.ispc
new file mode 100644
index 00000000..1c464bd5
--- /dev/null
+++ b/tests/gather-double-6.ispc
@@ -0,0 +1,19 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform double a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    double *ptr = (aFOO[0] == 1234) ? (double * varying)gptr : (a + programIndex);
+    int g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/gather-double-7.ispc b/tests/gather-double-7.ispc
new file mode 100644
index 00000000..c73f3b4e
--- /dev/null
+++ b/tests/gather-double-7.ispc
@@ -0,0 +1,23 @@
+
+export uniform int width() { return programCount; }
+
+int zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform double a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    double *ptr = (aFOO[0] == 1234) ? (double * varying)gptr : (a + programIndex);
+    if (programIndex < 2)
+        g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/gather-double-8.ispc b/tests/gather-double-8.ispc
new file mode 100644
index 00000000..52da874d
--- /dev/null
+++ b/tests/gather-double-8.ispc
@@ -0,0 +1,23 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform double a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    double *ptr = (aFOO[0] == 1234) ? (double * varying)gptr : (a + programIndex);
+    if (programIndex < 2)
+        g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/gather-float-1.ispc b/tests/gather-float-1.ispc
new file mode 100644
index 00000000..18b3fd98
--- /dev/null
+++ b/tests/gather-float-1.ispc
@@ -0,0 +1,17 @@
+
+export uniform int width() { return programCount; }
+
+int zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform float a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = a[programIndex+zero];
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/gather-float-2.ispc b/tests/gather-float-2.ispc
new file mode 100644
index 00000000..4f680814
--- /dev/null
+++ b/tests/gather-float-2.ispc
@@ -0,0 +1,17 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform float a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = a[programIndex+zero];
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/gather-float-3.ispc b/tests/gather-float-3.ispc
new file mode 100644
index 00000000..9e81cd06
--- /dev/null
+++ b/tests/gather-float-3.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+int zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform float a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    if (programIndex < 2)
+        g = a[programIndex+zero];
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/gather-float-4.ispc b/tests/gather-float-4.ispc
new file mode 100644
index 00000000..4f114fee
--- /dev/null
+++ b/tests/gather-float-4.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform float a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    if (programIndex < 2)
+        g = a[programIndex+zero];
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/gather-float-5.ispc b/tests/gather-float-5.ispc
new file mode 100644
index 00000000..16f0e81e
--- /dev/null
+++ b/tests/gather-float-5.ispc
@@ -0,0 +1,19 @@
+
+export uniform int width() { return programCount; }
+
+int zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform float a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    float *ptr = (aFOO[0] == 1234) ? (float * varying)gptr : (a + programIndex);
+    int g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/gather-float-6.ispc b/tests/gather-float-6.ispc
new file mode 100644
index 00000000..d1136f9a
--- /dev/null
+++ b/tests/gather-float-6.ispc
@@ -0,0 +1,19 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform float a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    float *ptr = (aFOO[0] == 1234) ? (float * varying)gptr : (a + programIndex);
+    int g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/gather-float-7.ispc b/tests/gather-float-7.ispc
new file mode 100644
index 00000000..f5b09dc4
--- /dev/null
+++ b/tests/gather-float-7.ispc
@@ -0,0 +1,23 @@
+
+export uniform int width() { return programCount; }
+
+int zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform float a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    float *ptr = (aFOO[0] == 1234) ? (float * varying)gptr : (a + programIndex);
+    if (programIndex < 2)
+        g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/gather-float-8.ispc b/tests/gather-float-8.ispc
new file mode 100644
index 00000000..3708f063
--- /dev/null
+++ b/tests/gather-float-8.ispc
@@ -0,0 +1,23 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform float a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    float *ptr = (aFOO[0] == 1234) ? (float * varying)gptr : (a + programIndex);
+    if (programIndex < 2)
+        g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/gather-int16-1.ispc b/tests/gather-int16-1.ispc
index e6bedd7f..89675185 100644
--- a/tests/gather-int16-1.ispc
+++ b/tests/gather-int16-1.ispc
@@ -1,19 +1,17 @@
+
 export uniform int width() { return programCount; }
 
-export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    uniform int16 x[programCount];
-    x[programIndex] = programIndex;
-    int a = aFOO[programIndex]-1;
-    unsigned int16 v;
-    if (programIndex < 2)
-        v = x[a];
-    else
-        v = 2;
-    RET[programIndex] = v;
+int zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int16 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = a[programIndex+zero];
+    RET[programIndex] = g;
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 2;
-    RET[0] = 0;
-    RET[1] = 1;
+    RET[programIndex] = 1 + programIndex;
 }
diff --git a/tests/gather-int16-2.ispc b/tests/gather-int16-2.ispc
new file mode 100644
index 00000000..74fdab8c
--- /dev/null
+++ b/tests/gather-int16-2.ispc
@@ -0,0 +1,17 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int16 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = a[programIndex+zero];
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/gather-int16-3.ispc b/tests/gather-int16-3.ispc
new file mode 100644
index 00000000..a197f754
--- /dev/null
+++ b/tests/gather-int16-3.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+int zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int16 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    if (programIndex < 2)
+        g = a[programIndex+zero];
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/gather-int16-4.ispc b/tests/gather-int16-4.ispc
new file mode 100644
index 00000000..db9a7217
--- /dev/null
+++ b/tests/gather-int16-4.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int16 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    if (programIndex < 2)
+        g = a[programIndex+zero];
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/gather-int16-5.ispc b/tests/gather-int16-5.ispc
new file mode 100644
index 00000000..8d6ced77
--- /dev/null
+++ b/tests/gather-int16-5.ispc
@@ -0,0 +1,19 @@
+
+export uniform int width() { return programCount; }
+
+int zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int16 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int16 *ptr = (aFOO[0] == 1234) ? (int16 * varying)gptr : (a + programIndex);
+    int g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/gather-int16-6.ispc b/tests/gather-int16-6.ispc
new file mode 100644
index 00000000..8d740856
--- /dev/null
+++ b/tests/gather-int16-6.ispc
@@ -0,0 +1,19 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int16 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int16 *ptr = (aFOO[0] == 1234) ? (int16 * varying)gptr : (a + programIndex);
+    int g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/gather-int16-7.ispc b/tests/gather-int16-7.ispc
new file mode 100644
index 00000000..a6236af5
--- /dev/null
+++ b/tests/gather-int16-7.ispc
@@ -0,0 +1,23 @@
+
+export uniform int width() { return programCount; }
+
+int zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int16 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    int16 *ptr = (aFOO[0] == 1234) ? (int16 * varying)gptr : (a + programIndex);
+    if (programIndex < 2)
+        g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/gather-int16-8.ispc b/tests/gather-int16-8.ispc
new file mode 100644
index 00000000..66bc8e89
--- /dev/null
+++ b/tests/gather-int16-8.ispc
@@ -0,0 +1,23 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int16 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    int16 *ptr = (aFOO[0] == 1234) ? (int16 * varying)gptr : (a + programIndex);
+    if (programIndex < 2)
+        g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/gather-int32-1.ispc b/tests/gather-int32-1.ispc
new file mode 100644
index 00000000..2df1dd7e
--- /dev/null
+++ b/tests/gather-int32-1.ispc
@@ -0,0 +1,17 @@
+
+export uniform int width() { return programCount; }
+
+int zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = a[programIndex+zero];
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/gather-int32-2.ispc b/tests/gather-int32-2.ispc
new file mode 100644
index 00000000..61f5a024
--- /dev/null
+++ b/tests/gather-int32-2.ispc
@@ -0,0 +1,17 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = a[programIndex+zero];
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/gather-int32-3.ispc b/tests/gather-int32-3.ispc
new file mode 100644
index 00000000..e87eab33
--- /dev/null
+++ b/tests/gather-int32-3.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+int zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    if (programIndex < 2)
+        g = a[programIndex+zero];
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/gather-int32-4.ispc b/tests/gather-int32-4.ispc
new file mode 100644
index 00000000..8a6d7bb6
--- /dev/null
+++ b/tests/gather-int32-4.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    if (programIndex < 2)
+        g = a[programIndex+zero];
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/gather-int32-5.ispc b/tests/gather-int32-5.ispc
new file mode 100644
index 00000000..573666c7
--- /dev/null
+++ b/tests/gather-int32-5.ispc
@@ -0,0 +1,19 @@
+
+export uniform int width() { return programCount; }
+
+int zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int *ptr = (aFOO[0] == 1234) ? (int * varying)gptr : (a + programIndex);
+    int g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/gather-int32-6.ispc b/tests/gather-int32-6.ispc
new file mode 100644
index 00000000..0d59a8fc
--- /dev/null
+++ b/tests/gather-int32-6.ispc
@@ -0,0 +1,19 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int *ptr = (aFOO[0] == 1234) ? (int * varying)gptr : (a + programIndex);
+    int g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/gather-int32-7.ispc b/tests/gather-int32-7.ispc
new file mode 100644
index 00000000..ebc724e5
--- /dev/null
+++ b/tests/gather-int32-7.ispc
@@ -0,0 +1,23 @@
+
+export uniform int width() { return programCount; }
+
+int zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    int *ptr = (aFOO[0] == 1234) ? (int * varying)gptr : (a + programIndex);
+    if (programIndex < 2)
+        g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/gather-int32-8.ispc b/tests/gather-int32-8.ispc
new file mode 100644
index 00000000..03cd7c8b
--- /dev/null
+++ b/tests/gather-int32-8.ispc
@@ -0,0 +1,23 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    int *ptr = (aFOO[0] == 1234) ? (int * varying)gptr : (a + programIndex);
+    if (programIndex < 2)
+        g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/gather-int64-1.ispc b/tests/gather-int64-1.ispc
new file mode 100644
index 00000000..fe3d171b
--- /dev/null
+++ b/tests/gather-int64-1.ispc
@@ -0,0 +1,17 @@
+
+export uniform int width() { return programCount; }
+
+int zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int64 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = a[programIndex+zero];
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/gather-int64-2.ispc b/tests/gather-int64-2.ispc
new file mode 100644
index 00000000..7a00439b
--- /dev/null
+++ b/tests/gather-int64-2.ispc
@@ -0,0 +1,17 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int64 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = a[programIndex+zero];
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/gather-int64-3.ispc b/tests/gather-int64-3.ispc
new file mode 100644
index 00000000..7ddd559c
--- /dev/null
+++ b/tests/gather-int64-3.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+int zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int64 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    if (programIndex < 2)
+        g = a[programIndex+zero];
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/gather-int64-4.ispc b/tests/gather-int64-4.ispc
new file mode 100644
index 00000000..92e004e3
--- /dev/null
+++ b/tests/gather-int64-4.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int64 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    if (programIndex < 2)
+        g = a[programIndex+zero];
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/gather-int64-5.ispc b/tests/gather-int64-5.ispc
new file mode 100644
index 00000000..76d95f2d
--- /dev/null
+++ b/tests/gather-int64-5.ispc
@@ -0,0 +1,19 @@
+
+export uniform int width() { return programCount; }
+
+int zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int64 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int64 *ptr = (aFOO[0] == 1234) ? (int64 * varying)gptr : (a + programIndex);
+    int g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/gather-int64-6.ispc b/tests/gather-int64-6.ispc
new file mode 100644
index 00000000..9deaaa80
--- /dev/null
+++ b/tests/gather-int64-6.ispc
@@ -0,0 +1,19 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int64 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int64 *ptr = (aFOO[0] == 1234) ? (int64 * varying)gptr : (a + programIndex);
+    int g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/gather-int64-7.ispc b/tests/gather-int64-7.ispc
new file mode 100644
index 00000000..52df9d19
--- /dev/null
+++ b/tests/gather-int64-7.ispc
@@ -0,0 +1,23 @@
+
+export uniform int width() { return programCount; }
+
+int zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int64 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    int64 *ptr = (aFOO[0] == 1234) ? (int64 * varying)gptr : (a + programIndex);
+    if (programIndex < 2)
+        g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/gather-int64-8.ispc b/tests/gather-int64-8.ispc
new file mode 100644
index 00000000..5cfa621b
--- /dev/null
+++ b/tests/gather-int64-8.ispc
@@ -0,0 +1,23 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int64 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    int64 *ptr = (aFOO[0] == 1234) ? (int64 * varying)gptr : (a + programIndex);
+    if (programIndex < 2)
+        g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/gather-int8-1.ispc b/tests/gather-int8-1.ispc
index 305b12ca..43961ff2 100644
--- a/tests/gather-int8-1.ispc
+++ b/tests/gather-int8-1.ispc
@@ -1,19 +1,17 @@
+
 export uniform int width() { return programCount; }
 
-export void f_fu(uniform float RET[], uniform float aFOO[], uniform float b) {
-    uniform int8 x[programCount];
-    x[programIndex] = programIndex;
-    int a = aFOO[programIndex]-1;
-    unsigned int8 v;
-    if (programIndex < 2)
-        v = x[a];
-    else
-        v = 2;
-    RET[programIndex] = v;
+int zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int8 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = a[programIndex+zero];
+    RET[programIndex] = g;
 }
 
 export void result(uniform float RET[]) {
-    RET[programIndex] = 2;
-    RET[0] = 0;
-    RET[1] = 1;
+    RET[programIndex] = 1 + programIndex;
 }
diff --git a/tests/gather-int8-2.ispc b/tests/gather-int8-2.ispc
new file mode 100644
index 00000000..8e853d0e
--- /dev/null
+++ b/tests/gather-int8-2.ispc
@@ -0,0 +1,17 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int8 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = a[programIndex+zero];
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/gather-int8-3.ispc b/tests/gather-int8-3.ispc
new file mode 100644
index 00000000..5650ab7f
--- /dev/null
+++ b/tests/gather-int8-3.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+int zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int8 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    if (programIndex < 2)
+        g = a[programIndex+zero];
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/gather-int8-4.ispc b/tests/gather-int8-4.ispc
new file mode 100644
index 00000000..92386d5a
--- /dev/null
+++ b/tests/gather-int8-4.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int8 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    if (programIndex < 2)
+        g = a[programIndex+zero];
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/gather-int8-5.ispc b/tests/gather-int8-5.ispc
new file mode 100644
index 00000000..d0440d77
--- /dev/null
+++ b/tests/gather-int8-5.ispc
@@ -0,0 +1,19 @@
+
+export uniform int width() { return programCount; }
+
+int zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int8 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int8 *ptr = (aFOO[0] == 1234) ? (int8 * varying)gptr : (a + programIndex);
+    int g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/gather-int8-6.ispc b/tests/gather-int8-6.ispc
new file mode 100644
index 00000000..840b309c
--- /dev/null
+++ b/tests/gather-int8-6.ispc
@@ -0,0 +1,19 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int8 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int8 *ptr = (aFOO[0] == 1234) ? (int8 * varying)gptr : (a + programIndex);
+    int g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1 + programIndex;
+}
diff --git a/tests/gather-int8-7.ispc b/tests/gather-int8-7.ispc
new file mode 100644
index 00000000..c0190db0
--- /dev/null
+++ b/tests/gather-int8-7.ispc
@@ -0,0 +1,23 @@
+
+export uniform int width() { return programCount; }
+
+int zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int8 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    int8 *ptr = (aFOO[0] == 1234) ? (int8 * varying)gptr : (a + programIndex);
+    if (programIndex < 2)
+        g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/gather-int8-8.ispc b/tests/gather-int8-8.ispc
new file mode 100644
index 00000000..3c5cd41e
--- /dev/null
+++ b/tests/gather-int8-8.ispc
@@ -0,0 +1,23 @@
+
+export uniform int width() { return programCount; }
+
+int64 zero = 0;
+void *gptr;
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+    uniform int8 a[programCount];
+    for (uniform int i = 0; i < programCount; ++i)
+        a[i] = aFOO[i];
+
+    int g = 0;
+    int8 *ptr = (aFOO[0] == 1234) ? (int8 * varying)gptr : (a + programIndex);
+    if (programIndex < 2)
+        g = *ptr;
+    RET[programIndex] = g;
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+    RET[0] = 1;
+    RET[1] = 2;
+}
diff --git a/tests/rdrand-1.ispc b/tests/rdrand-1.ispc
new file mode 100644
index 00000000..53ca6121
--- /dev/null
+++ b/tests/rdrand-1.ispc
@@ -0,0 +1,21 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
+    RET[programIndex] = 1;
+#else
+
+    uniform float r = -1;
+    uniform int count = 0;
+    while (!rdrand(&r)) {
+        ++count;
+    }
+    RET[programIndex] = (r >= 0 && r < 1);
+
+#endif
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1;
+}
diff --git a/tests/rdrand-2.ispc b/tests/rdrand-2.ispc
new file mode 100644
index 00000000..7021a271
--- /dev/null
+++ b/tests/rdrand-2.ispc
@@ -0,0 +1,19 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
+    RET[programIndex] = 1;
+#else
+
+    float r = -1;
+    while (!rdrand(&r))
+        ;
+    RET[programIndex] = (r >= 0 && r < 1);
+
+#endif
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1;
+}
diff --git a/tests/rdrand-3.ispc b/tests/rdrand-3.ispc
new file mode 100644
index 00000000..a9fc93a3
--- /dev/null
+++ b/tests/rdrand-3.ispc
@@ -0,0 +1,25 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
+    RET[programIndex] = 1;
+#else
+
+    int lessHalf = 0, moreHalf = 0;
+    for (uniform int i = 0; i < 1024*1024; ++i) {
+        float r = -1;
+        while (!rdrand(&r))
+            ;
+        if (r < 0.5) ++lessHalf;
+        else ++moreHalf;
+    }
+
+    float r = (double)lessHalf / (double)(lessHalf + moreHalf);
+    RET[programIndex] = (r >= .49 && r < .51);
+#endif
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 1;
+}
diff --git a/tests/rdrand-4.ispc b/tests/rdrand-4.ispc
new file mode 100644
index 00000000..3b38b7b1
--- /dev/null
+++ b/tests/rdrand-4.ispc
@@ -0,0 +1,33 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
+    RET[programIndex] = 0;
+#else
+
+    uniform int set[64] = { 0 };
+    uniform int count = 1024*1024;
+    for (uniform int i = 0; i < count; ++i) {
+        uniform int64 r;
+        while (!rdrand(&r))
+            ;
+        for (uniform int b = 0; b < 64; ++b)
+            if (((unsigned int64)r >> b) & 1)
+                ++set[b];
+    }
+
+    RET[programIndex] = 0;
+    for (uniform int b = 0; b < 64; ++b) {
+        float r = (double)set[b] / (double)(count);
+        if (!(r >= .49 && r < .51)) {
+            print("% % - %\n", b, r, set[b]);
+            ++RET[programIndex];
+        }
+    }
+#endif
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
diff --git a/tests/rdrand-5.ispc b/tests/rdrand-5.ispc
new file mode 100644
index 00000000..cbf59a97
--- /dev/null
+++ b/tests/rdrand-5.ispc
@@ -0,0 +1,33 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
+    RET[programIndex] = 0;
+#else
+
+    int set[32] = { 0 };
+    uniform int count = 1024*1024;
+    for (uniform int i = 0; i < count; ++i) {
+        int32 r;
+        while (!rdrand(&r))
+            ;
+        for (uniform int b = 0; b < 32; ++b)
+            if (((unsigned int32)r >> b) & 1)
+                ++set[b];
+    }
+
+    RET[programIndex] = 0;
+    for (uniform int b = 0; b < 32; ++b) {
+        float r = (double)set[b] / (double)(count);
+        if (!(r >= .49 && r < .51)) {
+            print("% % - %\n", b, r, set[b]);
+            ++RET[programIndex];
+        }
+    }
+#endif
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
diff --git a/tests/rdrand-6.ispc b/tests/rdrand-6.ispc
new file mode 100644
index 00000000..93137625
--- /dev/null
+++ b/tests/rdrand-6.ispc
@@ -0,0 +1,35 @@
+
+export uniform int width() { return programCount; }
+
+export void f_f(uniform float RET[], uniform float aFOO[]) {
+#if !defined(ISPC_TARGET_AVX11) && !defined(ISPC_TARGET_AVX2)
+    RET[programIndex] = 0;
+#else
+
+    int set[32] = { 0 };
+    uniform int count = 1024*1024;
+    for (uniform int i = 0; i < count; ++i) {
+        uniform int32 rr[programCount];
+        int * ptr = rr + programIndex;
+        while (!rdrand(ptr))
+            ;
+        int32 r = rr[programIndex];
+        for (uniform int b = 0; b < 32; ++b)
+            if (((unsigned int32)r >> b) & 1)
+                ++set[b];
+    }
+
+    RET[programIndex] = 0;
+    for (uniform int b = 0; b < 32; ++b) {
+        float r = (double)set[b] / (double)(count);
+        if (!(r >= .49 && r < .51)) {
+            print("% % - %\n", b, r, set[b]);
+            ++RET[programIndex];
+        }
+    }
+#endif
+}
+
+export void result(uniform float RET[]) {
+    RET[programIndex] = 0;
+}
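
Aside for reviewers: a minimal sketch of how the rdrand() overloads added above
might be used from application ispc code. next_unit_random() is a hypothetical
helper name, not part of the patch. Note that rdrand() returns false both when
the hardware RNG is momentarily out of entropy and when the target has no
RDRAND at all (__have_native_rand false), so callers likely want a bounded
retry plus a software fallback rather than the tests' unbounded spin loops.

// Sketch only, assuming an AVX1.1/AVX2 target built with this patch.
// Draws one float in [0,1) per program instance via the new
// rdrand(varying float * uniform) overload.
static inline uniform bool next_unit_random(varying float * uniform result) {
    for (uniform int tries = 0; tries < 16; ++tries) {
        // all() is the stdlib reduction; succeed once every lane has a value.
        if (all(rdrand(result)))
            return true;
    }
    return false;  // no entropy (or no RDRAND): fall back to a software RNG
}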
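A note on the gather tests' shape: every variant indexes with a global `zero`
(int in the odd-numbered tests, int64 in the even-numbered ones), presumably so
the compiler cannot prove that programIndex + zero is a simple linear sequence;
that should force a real per-lane gather rather than a plain vector load, and
the two widths of `zero` exercise both 32- and 64-bit offset paths. A condensed
sketch of the idiom (gather_one() is a hypothetical name for illustration):

// Because `zero` is a global that the compiler must assume can change,
// the varying index below cannot be folded away, so this load should
// compile to a gather.
int zero = 0;   // int64 in the even-numbered test variants

static inline float gather_one(uniform float a[]) {
    return a[programIndex + zero];
}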