From f75c94a8f1db5d7ce612328d15bd3a9d55663a46 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 4 Jan 2012 12:16:41 -0800 Subject: [PATCH] Have aos/soa and broadcast/shuffle/rotate functions provided by the target. The SSE/AVX targets use the old versions from util.m4, but these functions are now passed through to the generic targets. --- builtins.cpp | 32 +- builtins/target-avx-common.ll | 2 + builtins/target-generic-common.ll | 73 +++- builtins/target-sse2-common.ll | 2 + builtins/target-sse4-common.ll | 2 + builtins/util.m4 | 591 ++++++++++++++---------------- stdlib.ispc | 44 ++- 7 files changed, 376 insertions(+), 370 deletions(-) diff --git a/builtins.cpp b/builtins.cpp index 9bd41e8f..dce7c9fa 100644 --- a/builtins.cpp +++ b/builtins.cpp @@ -378,10 +378,10 @@ lSetInternalFunctions(llvm::Module *module) { "__atomic_xor_uniform_int64_global", "__broadcast_double", "__broadcast_float", - "__broadcast_int16", - "__broadcast_int32", - "__broadcast_int64", - "__broadcast_int8", + "__broadcast_i16", + "__broadcast_i32", + "__broadcast_i64", + "__broadcast_i8", "__ceil_uniform_double", "__ceil_uniform_float", "__ceil_varying_double", @@ -483,10 +483,10 @@ lSetInternalFunctions(llvm::Module *module) { "__reduce_min_uint64", "__rotate_double", "__rotate_float", - "__rotate_int16", - "__rotate_int32", - "__rotate_int64", - "__rotate_int8", + "__rotate_i16", + "__rotate_i32", + "__rotate_i64", + "__rotate_i8", "__round_uniform_double", "__round_uniform_float", "__round_varying_double", @@ -497,16 +497,16 @@ lSetInternalFunctions(llvm::Module *module) { "__sext_varying_bool", "__shuffle2_double", "__shuffle2_float", - "__shuffle2_int16", - "__shuffle2_int32", - "__shuffle2_int64", - "__shuffle2_int8", + "__shuffle2_i16", + "__shuffle2_i32", + "__shuffle2_i64", + "__shuffle2_i8", "__shuffle_double", "__shuffle_float", - "__shuffle_int16", - "__shuffle_int32", - "__shuffle_int64", - "__shuffle_int8", + "__shuffle_i16", + "__shuffle_i32", + "__shuffle_i64", + "__shuffle_i8", "__soa_to_aos3_float", "__soa_to_aos3_float16", "__soa_to_aos3_float4", diff --git a/builtins/target-avx-common.ll b/builtins/target-avx-common.ll index 07fb12b4..c7790bec 100644 --- a/builtins/target-avx-common.ll +++ b/builtins/target-avx-common.ll @@ -34,6 +34,8 @@ ctlztz() define_prefetches() +define_shuffles() +aossoa() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 3d123fcf..81127897 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -34,6 +34,69 @@ include(`util.m4') stdlib_core() scans() +reduce_equal(WIDTH) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; broadcast/rotate/shuffle + +declare @__smear_float(float) nounwind readnone +declare @__smear_double(double) nounwind readnone +declare @__smear_i8(i8) nounwind readnone +declare @__smear_i16(i16) nounwind readnone +declare @__smear_i32(i32) nounwind readnone +declare @__smear_i64(i64) nounwind readnone + +declare @__broadcast_float(, i32) nounwind readnone +declare @__broadcast_double(, i32) nounwind readnone +declare @__broadcast_i8(, i32) nounwind readnone +declare @__broadcast_i16(, i32) nounwind readnone +declare @__broadcast_i32(, i32) nounwind readnone +declare @__broadcast_i64(, i32) nounwind readnone + +declare @__rotate_i8(, i32) nounwind readnone +declare @__rotate_i16(, i32) nounwind readnone +declare @__rotate_float(, i32) nounwind readnone +declare 
@__rotate_i32(, i32) nounwind readnone +declare @__rotate_double(, i32) nounwind readnone +declare @__rotate_i64(, i32) nounwind readnone + +declare @__shuffle_i8(, ) nounwind readnone +declare @__shuffle2_i8(, , + ) nounwind readnone +declare @__shuffle_i16(, ) nounwind readnone +declare @__shuffle2_i16(, , + ) nounwind readnone +declare @__shuffle_float(, + ) nounwind readnone +declare @__shuffle2_float(, , + ) nounwind readnone +declare @__shuffle_i32(, + ) nounwind readnone +declare @__shuffle2_i32(, , + ) nounwind readnone +declare @__shuffle_double(, + ) nounwind readnone +declare @__shuffle2_double(, + , ) nounwind readnone +declare @__shuffle_i64(, + ) nounwind readnone +declare @__shuffle2_i64(, , + ) nounwind readnone + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; aos/soa + +declare void @__soa_to_aos3_float( %v0, %v1, + %v2, float * noalias %p) nounwind +declare void @__aos_to_soa3_float(float * noalias %p, * %out0, + * %out1, * %out2) nounwind +declare void @__soa_to_aos4_float( %v0, %v1, + %v2, %v3, + float * noalias %p) nounwind +declare void @__aos_to_soa4_float(float * noalias %p, * noalias %out0, + * noalias %out1, + * noalias %out2, + * noalias %out3) nounwind ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; math @@ -128,7 +191,6 @@ declare @__svml_log() declare @__svml_pow(, ) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; reductions declare i32 @__movmsk() nounwind readnone @@ -157,15 +219,6 @@ declare i64 @__reduce_add_uint64() nounwind readnone declare i64 @__reduce_min_uint64() nounwind readnone declare i64 @__reduce_max_uint64() nounwind readnone -declare i1 @__reduce_equal_int32( %v, i32 * nocapture %samevalue, - %mask) nounwind -declare i1 @__reduce_equal_float( %v, float * nocapture %samevalue, - %mask) nounwind -declare i1 @__reduce_equal_int64( %v, i64 * nocapture %samevalue, - %mask) nounwind -declare i1 @__reduce_equal_double( %v, double * nocapture %samevalue, - %mask) nounwind - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; unaligned loads/loads+broadcasts diff --git a/builtins/target-sse2-common.ll b/builtins/target-sse2-common.ll index 80c34afb..e0b7f40c 100644 --- a/builtins/target-sse2-common.ll +++ b/builtins/target-sse2-common.ll @@ -31,6 +31,8 @@ ctlztz() define_prefetches() +define_shuffles() +aossoa() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rcp diff --git a/builtins/target-sse4-common.ll b/builtins/target-sse4-common.ll index 19d31ce4..69461fcd 100644 --- a/builtins/target-sse4-common.ll +++ b/builtins/target-sse4-common.ll @@ -31,6 +31,8 @@ ctlztz() define_prefetches() +define_shuffles() +aossoa() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rounding floats diff --git a/builtins/util.m4 b/builtins/util.m4 index 15f3df11..c6d81228 100644 --- a/builtins/util.m4 +++ b/builtins/util.m4 @@ -555,7 +555,7 @@ divert`'dnl ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; define(`shuffles', ` -define @__broadcast_$2(, i32) nounwind readnone alwaysinline { +define @__broadcast_$1(, i32) nounwind readnone alwaysinline { %v = extractelement %0, i32 %1 %r_0 = insertelement undef, $1 %v, i32 0 forloop(i, 1, eval(WIDTH-1), ` %r_`'i = insertelement %r_`'eval(i-1), $1 %v, i32 i @@ -563,7 +563,7 @@ forloop(i, 1, eval(WIDTH-1), ` %r_`'i = insertelement %r_`'eval(i- ret %r_`'eval(WIDTH-1) } -define @__rotate_$2(, i32) nounwind readnone 
alwaysinline { +define @__rotate_$1(, i32) nounwind readnone alwaysinline { %isc = call i1 @__is_compile_time_constant_uniform_int32(i32 %1) br i1 %isc, label %is_const, label %not_const @@ -592,11 +592,11 @@ not_const: %ptr_as_elt_array = bitcast * %ptr to [eval(2*WIDTH) x $1] * %load_ptr = getelementptr [eval(2*WIDTH) x $1] * %ptr_as_elt_array, i32 0, i32 %offset %load_ptr_vec = bitcast $1 * %load_ptr to * - %result = load * %load_ptr_vec, align $3 + %result = load * %load_ptr_vec, align $2 ret %result } -define @__shuffle_$2(, ) nounwind readnone alwaysinline { +define @__shuffle_$1(, ) nounwind readnone alwaysinline { forloop(i, 0, eval(WIDTH-1), ` %index_`'i = extractelement %1, i32 i') forloop(i, 0, eval(WIDTH-1), ` @@ -608,7 +608,7 @@ forloop(i, 1, eval(WIDTH-1), ` %ret_`'i = insertelement %ret_`'eva ret %ret_`'eval(WIDTH-1) } -define @__shuffle2_$2(, , ) nounwind readnone alwaysinline { +define @__shuffle2_$1(, , ) nounwind readnone alwaysinline { %v2 = shufflevector %0, %1, < forloop(i, 0, eval(2*WIDTH-2), `i32 i, ') i32 eval(2*WIDTH-1) > @@ -650,6 +650,14 @@ forloop(i, 1, eval(WIDTH-1), ` } ') +define(`define_shuffles',` +shuffles(i8, 1) +shuffles(i16, 2) +shuffles(float, 4) +shuffles(i32, 4) +shuffles(double, 8) +shuffles(i64, 8) +') ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; global_atomic_associative @@ -923,275 +931,13 @@ define void @__prefetch_read_uniform_nt(i8 *) alwaysinline { ') ') -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - -define(`stdlib_core', ` - -declare i32 @__fast_masked_vload() - -declare i8* @ISPCAlloc(i8**, i64, i32) nounwind -declare void @ISPCLaunch(i8**, i8*, i8*, i32) nounwind -declare void @ISPCSync(i8*) nounwind -declare void @ISPCInstrument(i8*, i8*, i32, i32) nounwind - -declare i1 @__is_compile_time_constant_mask( %mask) -declare i1 @__is_compile_time_constant_varying_int32() - -; This function declares placeholder masked store functions for the -; front-end to use. -; -; void __pseudo_masked_store_8 (uniform int8 *ptr, varying int8 values, mask) -; void __pseudo_masked_store_16(uniform int16 *ptr, varying int16 values, mask) -; void __pseudo_masked_store_32(uniform int32 *ptr, varying int32 values, mask) -; void __pseudo_masked_store_64(uniform int64 *ptr, varying int64 values, mask) -; -; These in turn are converted to native masked stores or to regular -; stores (if the mask is all on) by the MaskedStoreOptPass optimization -; pass. - -declare void @__pseudo_masked_store_8( * nocapture, , ) -declare void @__pseudo_masked_store_16( * nocapture, , ) -declare void @__pseudo_masked_store_32( * nocapture, , ) -declare void @__pseudo_masked_store_64( * nocapture, , ) - -; Declare the pseudo-gather functions. When the ispc front-end needs -; to perform a gather, it generates a call to one of these functions, -; which have signatures: -; -; varying int8 __pseudo_gather(varying int8 *, mask) -; varying int16 __pseudo_gather(varying int16 *, mask) -; varying int32 __pseudo_gather(varying int32 *, mask) -; varying int64 __pseudo_gather(varying int64 *, mask) -; -; The GatherScatterFlattenOpt optimization pass finds these calls and then -; converts them to make calls to the following functions (when appropriate); -; these represent gathers from a common base pointer with offsets. The -; offset_scale factor scales the offsets before they are added to the base -; pointer--it should have the value 1, 2, 4, or 8. (It can always just be 1.) 
-; The 2, 4, 8 cases are used to match LLVM patterns that use the free 2/4/8 scaling -; available in x86 addressing calculations... -; -; varying int8 __pseudo_gather_base_offsets{32,64}_8(uniform int8 *base, -; int{32,64} offsets, int32 offset_scale, mask) -; varying int16 __pseudo_gather_base_offsets{32,64}_16(uniform int16 *base, -; int{32,64} offsets, int32 offset_scale, mask) -; varying int32 __pseudo_gather_base_offsets{32,64}_32(uniform int32 *base, -; int{32,64} offsets, int32 offset_scale, mask) -; varying int64 __pseudo_gather_base_offsets{32,64}_64(uniform int64 *base, -; int{32,64} offsets, int32 offset_scale, mask) -; -; Then, the GSImprovementsPass optimizations finds these and either -; converts them to native gather functions or converts them to vector -; loads, if equivalent. - -declare @__pseudo_gather32_8(, ) nounwind readonly -declare @__pseudo_gather32_16(, ) nounwind readonly -declare @__pseudo_gather32_32(, ) nounwind readonly -declare @__pseudo_gather32_64(, ) nounwind readonly - -declare @__pseudo_gather64_8(, ) nounwind readonly -declare @__pseudo_gather64_16(, ) nounwind readonly -declare @__pseudo_gather64_32(, ) nounwind readonly -declare @__pseudo_gather64_64(, ) nounwind readonly - -declare @__pseudo_gather_base_offsets32_8(i8 *, , i32, - ) nounwind readonly -declare @__pseudo_gather_base_offsets32_16(i8 *, , i32, - ) nounwind readonly -declare @__pseudo_gather_base_offsets32_32(i8 *, , i32, - ) nounwind readonly -declare @__pseudo_gather_base_offsets32_64(i8 *, , i32, - ) nounwind readonly - -declare @__pseudo_gather_base_offsets64_8(i8 *, , i32, - ) nounwind readonly -declare @__pseudo_gather_base_offsets64_16(i8 *, , i32, - ) nounwind readonly -declare @__pseudo_gather_base_offsets64_32(i8 *, , i32, - ) nounwind readonly -declare @__pseudo_gather_base_offsets64_64(i8 *, , i32, - ) nounwind readonly - -; Similarly to the pseudo-gathers defined above, we also declare undefined -; pseudo-scatter instructions with signatures: -; -; void __pseudo_scatter_8 (varying int8 *, varying int8 values, mask) -; void __pseudo_scatter_16(varying int16 *, varying int16 values, mask) -; void __pseudo_scatter_32(varying int32 *, varying int32 values, mask) -; void __pseudo_scatter_64(varying int64 *, varying int64 values, mask) -; -; The GatherScatterFlattenOpt optimization pass also finds these and -; transforms them to scatters like: -; -; void __pseudo_scatter_base_offsets{32,64}_8(uniform int8 *base, -; varying int32 offsets, int32 offset_scale, varying int8 values, mask) -; void __pseudo_scatter_base_offsets{32,64}_16(uniform int16 *base, -; varying int32 offsets, int32 offset_scale, varying int16 values, mask) -; void __pseudo_scatter_base_offsets{32,64}_32(uniform int32 *base, -; varying int32 offsets, int32 offset_scale, varying int32 values, mask) -; void __pseudo_scatter_base_offsets{32,64}_64(uniform int64 *base, -; varying int32 offsets, int32 offset_scale, varying int64 values, mask) -; -; And the GSImprovementsPass in turn converts these to actual native -; scatters or masked stores. 
- -declare void @__pseudo_scatter32_8(, , ) nounwind -declare void @__pseudo_scatter32_16(, , ) nounwind -declare void @__pseudo_scatter32_32(, , ) nounwind -declare void @__pseudo_scatter32_64(, , ) nounwind - -declare void @__pseudo_scatter64_8(, , ) nounwind -declare void @__pseudo_scatter64_16(, , ) nounwind -declare void @__pseudo_scatter64_32(, , ) nounwind -declare void @__pseudo_scatter64_64(, , ) nounwind - -declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, , i32, - , ) nounwind -declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, , i32, - , ) nounwind -declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, , i32, - , ) nounwind -declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, , i32, - , ) nounwind - -declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, , i32, - , ) nounwind -declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, , i32, - , ) nounwind -declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, , i32, - , ) nounwind -declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, , i32, - , ) nounwind - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; vector ops - -define i8 @__extract_int8(, i32) nounwind readnone alwaysinline { - %extract = extractelement %0, i32 %1 - ret i8 %extract -} - -define @__insert_int8(, i32, - i8) nounwind readnone alwaysinline { - %insert = insertelement %0, i8 %2, i32 %1 - ret %insert -} - -define i16 @__extract_int16(, i32) nounwind readnone alwaysinline { - %extract = extractelement %0, i32 %1 - ret i16 %extract -} - -define @__insert_int16(, i32, - i16) nounwind readnone alwaysinline { - %insert = insertelement %0, i16 %2, i32 %1 - ret %insert -} - -define i32 @__extract_int32(, i32) nounwind readnone alwaysinline { - %extract = extractelement %0, i32 %1 - ret i32 %extract -} - -define @__insert_int32(, i32, - i32) nounwind readnone alwaysinline { - %insert = insertelement %0, i32 %2, i32 %1 - ret %insert -} - -define i64 @__extract_int64(, i32) nounwind readnone alwaysinline { - %extract = extractelement %0, i32 %1 - ret i64 %extract -} - -define @__insert_int64(, i32, - i64) nounwind readnone alwaysinline { - %insert = insertelement %0, i64 %2, i32 %1 - ret %insert -} - -shuffles(i8, int8, 1) -shuffles(i16, int16, 2) -shuffles(float, float, 4) -shuffles(i32, int32, 4) -shuffles(double, double, 8) -shuffles(i64, int64, 8) - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; various bitcasts from one type to another - -define @__intbits_varying_float() nounwind readnone alwaysinline { - %float_to_int_bitcast = bitcast %0 to - ret %float_to_int_bitcast -} - -define i32 @__intbits_uniform_float(float) nounwind readnone alwaysinline { - %float_to_int_bitcast = bitcast float %0 to i32 - ret i32 %float_to_int_bitcast -} - -define @__intbits_varying_double() nounwind readnone alwaysinline { - %double_to_int_bitcast = bitcast %0 to - ret %double_to_int_bitcast -} - -define i64 @__intbits_uniform_double(double) nounwind readnone alwaysinline { - %double_to_int_bitcast = bitcast double %0 to i64 - ret i64 %double_to_int_bitcast -} - -define @__floatbits_varying_int32() nounwind readnone alwaysinline { - %int_to_float_bitcast = bitcast %0 to - ret %int_to_float_bitcast -} - -define float @__floatbits_uniform_int32(i32) nounwind readnone alwaysinline { - %int_to_float_bitcast = bitcast i32 %0 to float - ret float %int_to_float_bitcast -} - -define @__doublebits_varying_int64() nounwind readnone 
alwaysinline { - %int_to_double_bitcast = bitcast %0 to - ret %int_to_double_bitcast -} - -define double @__doublebits_uniform_int64(i64) nounwind readnone alwaysinline { - %int_to_double_bitcast = bitcast i64 %0 to double - ret double %int_to_double_bitcast -} - -define @__undef_varying() nounwind readnone alwaysinline { - ret undef -} - -define float @__undef_uniform() nounwind readnone alwaysinline { - ret float undef -} - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; sign extension - -define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline { - %r = sext i1 %0 to i32 - ret i32 %r -} - -define @__sext_varying_bool() nounwind readnone alwaysinline { - ifelse(MASK,i1, ` - %se = sext %0 to - ret %se - ', ` - ret %0') -} - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; AOS/SOA conversion primitives ;; take 4 4-wide vectors laid out like ... ;; and reorder them to ... -define void +define(`aossoa', `define void @__aos_to_soa4_float4(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, <4 x float> %v3, <4 x float> * noalias %out0, <4 x float> * noalias %out1, <4 x float> * noalias %out2, @@ -1750,23 +1496,6 @@ define void } -define void -@__aos_to_soa4_int32(i32 * noalias %ptr, - * noalias %out0, * noalias %out1, - * noalias %out2, * noalias %out3) - nounwind alwaysinline { - %fptr = bitcast i32 * %ptr to float * - %fout0 = bitcast * %out0 to * - %fout1 = bitcast * %out1 to * - %fout2 = bitcast * %out2 to * - %fout3 = bitcast * %out3 to * - call void @__aos_to_soa4_float(float * %fptr, - * %fout0, * %fout1, * %fout2, - * %fout3) - ret void -} - - define void @__soa_to_aos4_float( %v0, %v1, %v2, %v3, float * noalias %p) nounwind alwaysinline { @@ -1781,20 +1510,6 @@ define void } -define void -@__soa_to_aos4_int32( %v0, %v1, %v2, - %v3, i32 * noalias %base) nounwind alwaysinline { - %fv0 = bitcast %v0 to - %fv1 = bitcast %v1 to - %fv2 = bitcast %v2 to - %fv3 = bitcast %v3 to - %fbase = bitcast i32 * %base to float * - call void @__soa_to_aos4_float( %fv0, %fv1, - %fv2, %fv3, float * %fbase) - ret void -} - - define void @__aos_to_soa3_float(float * noalias %p, * %out0, * %out1, @@ -1812,20 +1527,6 @@ define void } -define void -@__aos_to_soa3_int32(i32 * noalias %base, - * noalias %out0, * noalias %out1, - * noalias %out2) nounwind alwaysinline { - %fbase = bitcast i32 * %base to float * - %fout0 = bitcast * %out0 to * - %fout1 = bitcast * %out1 to * - %fout2 = bitcast * %out2 to * - call void @__aos_to_soa3_float(float * %fbase, - * %fout0, * %fout1, * %fout2) - ret void -} - - define void @__soa_to_aos3_float( %v0, %v1, %v2, float * noalias %p) nounwind alwaysinline { @@ -1837,20 +1538,262 @@ define void * %out2) ret void } +') + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -define void -@__soa_to_aos3_int32( %v0, %v1, %v2, - i32 * noalias %base) nounwind alwaysinline { - %fv0 = bitcast %v0 to - %fv1 = bitcast %v1 to - %fv2 = bitcast %v2 to - %fbase = bitcast i32 * %base to float * - call void @__soa_to_aos3_float( %fv0, %fv1, - %fv2, float * %fbase) - ret void +define(`stdlib_core', ` + +declare i32 @__fast_masked_vload() + +declare i8* @ISPCAlloc(i8**, i64, i32) nounwind +declare void @ISPCLaunch(i8**, i8*, i8*, i32) nounwind +declare void @ISPCSync(i8*) nounwind +declare void @ISPCInstrument(i8*, i8*, i32, i32) nounwind + +declare i1 @__is_compile_time_constant_mask( %mask) +declare i1 @__is_compile_time_constant_varying_int32() + +; This function declares placeholder masked store 
functions for the +; front-end to use. +; +; void __pseudo_masked_store_8 (uniform int8 *ptr, varying int8 values, mask) +; void __pseudo_masked_store_16(uniform int16 *ptr, varying int16 values, mask) +; void __pseudo_masked_store_32(uniform int32 *ptr, varying int32 values, mask) +; void __pseudo_masked_store_64(uniform int64 *ptr, varying int64 values, mask) +; +; These in turn are converted to native masked stores or to regular +; stores (if the mask is all on) by the MaskedStoreOptPass optimization +; pass. + +declare void @__pseudo_masked_store_8( * nocapture, , ) +declare void @__pseudo_masked_store_16( * nocapture, , ) +declare void @__pseudo_masked_store_32( * nocapture, , ) +declare void @__pseudo_masked_store_64( * nocapture, , ) + +; Declare the pseudo-gather functions. When the ispc front-end needs +; to perform a gather, it generates a call to one of these functions, +; which have signatures: +; +; varying int8 __pseudo_gather(varying int8 *, mask) +; varying int16 __pseudo_gather(varying int16 *, mask) +; varying int32 __pseudo_gather(varying int32 *, mask) +; varying int64 __pseudo_gather(varying int64 *, mask) +; +; The GatherScatterFlattenOpt optimization pass finds these calls and then +; converts them to make calls to the following functions (when appropriate); +; these represent gathers from a common base pointer with offsets. The +; offset_scale factor scales the offsets before they are added to the base +; pointer--it should have the value 1, 2, 4, or 8. (It can always just be 1.) +; The 2, 4, 8 cases are used to match LLVM patterns that use the free 2/4/8 scaling +; available in x86 addressing calculations... +; +; varying int8 __pseudo_gather_base_offsets{32,64}_8(uniform int8 *base, +; int{32,64} offsets, int32 offset_scale, mask) +; varying int16 __pseudo_gather_base_offsets{32,64}_16(uniform int16 *base, +; int{32,64} offsets, int32 offset_scale, mask) +; varying int32 __pseudo_gather_base_offsets{32,64}_32(uniform int32 *base, +; int{32,64} offsets, int32 offset_scale, mask) +; varying int64 __pseudo_gather_base_offsets{32,64}_64(uniform int64 *base, +; int{32,64} offsets, int32 offset_scale, mask) +; +; Then, the GSImprovementsPass optimizations finds these and either +; converts them to native gather functions or converts them to vector +; loads, if equivalent. 
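; As a concrete illustration (an editorial sketch with assumed 8-wide vectors,
; i32 masks, and made-up value names -- none of this is code from the patch):
; a gather such as "x = ptr[index]" is first emitted by the front-end as
;
;     %x = call <8 x i32> @__pseudo_gather32_32(<8 x i32> %ptrs, <8 x i32> %mask)
;
; and, when %ptrs can be decomposed into a common base plus scaled offsets
; (here base + 4*index), GatherScatterFlattenOpt rewrites it as
;
;     %x = call <8 x i32> @__pseudo_gather_base_offsets32_32(i8 * %base,
;                              <8 x i32> %index, i32 4, <8 x i32> %mask)
;
; so that the offset_scale of 4 can later be matched to x86 scaled addressing.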
+ +declare @__pseudo_gather32_8(, ) nounwind readonly +declare @__pseudo_gather32_16(, ) nounwind readonly +declare @__pseudo_gather32_32(, ) nounwind readonly +declare @__pseudo_gather32_64(, ) nounwind readonly + +declare @__pseudo_gather64_8(, ) nounwind readonly +declare @__pseudo_gather64_16(, ) nounwind readonly +declare @__pseudo_gather64_32(, ) nounwind readonly +declare @__pseudo_gather64_64(, ) nounwind readonly + +declare @__pseudo_gather_base_offsets32_8(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets32_16(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets32_32(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets32_64(i8 *, , i32, + ) nounwind readonly + +declare @__pseudo_gather_base_offsets64_8(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets64_16(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets64_32(i8 *, , i32, + ) nounwind readonly +declare @__pseudo_gather_base_offsets64_64(i8 *, , i32, + ) nounwind readonly + +; Similarly to the pseudo-gathers defined above, we also declare undefined +; pseudo-scatter instructions with signatures: +; +; void __pseudo_scatter_8 (varying int8 *, varying int8 values, mask) +; void __pseudo_scatter_16(varying int16 *, varying int16 values, mask) +; void __pseudo_scatter_32(varying int32 *, varying int32 values, mask) +; void __pseudo_scatter_64(varying int64 *, varying int64 values, mask) +; +; The GatherScatterFlattenOpt optimization pass also finds these and +; transforms them to scatters like: +; +; void __pseudo_scatter_base_offsets{32,64}_8(uniform int8 *base, +; varying int32 offsets, int32 offset_scale, varying int8 values, mask) +; void __pseudo_scatter_base_offsets{32,64}_16(uniform int16 *base, +; varying int32 offsets, int32 offset_scale, varying int16 values, mask) +; void __pseudo_scatter_base_offsets{32,64}_32(uniform int32 *base, +; varying int32 offsets, int32 offset_scale, varying int32 values, mask) +; void __pseudo_scatter_base_offsets{32,64}_64(uniform int64 *base, +; varying int32 offsets, int32 offset_scale, varying int64 values, mask) +; +; And the GSImprovementsPass in turn converts these to actual native +; scatters or masked stores. 
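; The store-side counterpart (again an illustrative sketch with assumed 8-wide
; types and value names, not code taken from this patch): "ptr[index] = x"
; starts out as
;
;     call void @__pseudo_scatter32_32(<8 x i32> %ptrs, <8 x i32> %x, <8 x i32> %mask)
;
; and, when ptrs = base + 4*index, is flattened into
;
;     call void @__pseudo_scatter_base_offsets32_32(i8 * %base, <8 x i32> %index,
;                              i32 4, <8 x i32> %x, <8 x i32> %mask)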
+ +declare void @__pseudo_scatter32_8(, , ) nounwind +declare void @__pseudo_scatter32_16(, , ) nounwind +declare void @__pseudo_scatter32_32(, , ) nounwind +declare void @__pseudo_scatter32_64(, , ) nounwind + +declare void @__pseudo_scatter64_8(, , ) nounwind +declare void @__pseudo_scatter64_16(, , ) nounwind +declare void @__pseudo_scatter64_32(, , ) nounwind +declare void @__pseudo_scatter64_64(, , ) nounwind + +declare void @__pseudo_scatter_base_offsets32_8(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets32_16(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets32_32(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets32_64(i8 * nocapture, , i32, + , ) nounwind + +declare void @__pseudo_scatter_base_offsets64_8(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets64_16(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets64_32(i8 * nocapture, , i32, + , ) nounwind +declare void @__pseudo_scatter_base_offsets64_64(i8 * nocapture, , i32, + , ) nounwind + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; vector ops + +define i8 @__extract_int8(, i32) nounwind readnone alwaysinline { + %extract = extractelement %0, i32 %1 + ret i8 %extract } +define @__insert_int8(, i32, + i8) nounwind readnone alwaysinline { + %insert = insertelement %0, i8 %2, i32 %1 + ret %insert +} + +define i16 @__extract_int16(, i32) nounwind readnone alwaysinline { + %extract = extractelement %0, i32 %1 + ret i16 %extract +} + +define @__insert_int16(, i32, + i16) nounwind readnone alwaysinline { + %insert = insertelement %0, i16 %2, i32 %1 + ret %insert +} + +define i32 @__extract_int32(, i32) nounwind readnone alwaysinline { + %extract = extractelement %0, i32 %1 + ret i32 %extract +} + +define @__insert_int32(, i32, + i32) nounwind readnone alwaysinline { + %insert = insertelement %0, i32 %2, i32 %1 + ret %insert +} + +define i64 @__extract_int64(, i32) nounwind readnone alwaysinline { + %extract = extractelement %0, i32 %1 + ret i64 %extract +} + +define @__insert_int64(, i32, + i64) nounwind readnone alwaysinline { + %insert = insertelement %0, i64 %2, i32 %1 + ret %insert +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; various bitcasts from one type to another + +define @__intbits_varying_float() nounwind readnone alwaysinline { + %float_to_int_bitcast = bitcast %0 to + ret %float_to_int_bitcast +} + +define i32 @__intbits_uniform_float(float) nounwind readnone alwaysinline { + %float_to_int_bitcast = bitcast float %0 to i32 + ret i32 %float_to_int_bitcast +} + +define @__intbits_varying_double() nounwind readnone alwaysinline { + %double_to_int_bitcast = bitcast %0 to + ret %double_to_int_bitcast +} + +define i64 @__intbits_uniform_double(double) nounwind readnone alwaysinline { + %double_to_int_bitcast = bitcast double %0 to i64 + ret i64 %double_to_int_bitcast +} + +define @__floatbits_varying_int32() nounwind readnone alwaysinline { + %int_to_float_bitcast = bitcast %0 to + ret %int_to_float_bitcast +} + +define float @__floatbits_uniform_int32(i32) nounwind readnone alwaysinline { + %int_to_float_bitcast = bitcast i32 %0 to float + ret float %int_to_float_bitcast +} + +define @__doublebits_varying_int64() nounwind readnone alwaysinline { + %int_to_double_bitcast = bitcast %0 to + ret %int_to_double_bitcast +} + +define double @__doublebits_uniform_int64(i64) nounwind readnone 
alwaysinline { + %int_to_double_bitcast = bitcast i64 %0 to double + ret double %int_to_double_bitcast +} + +define @__undef_varying() nounwind readnone alwaysinline { + ret undef +} + +define float @__undef_uniform() nounwind readnone alwaysinline { + ret float undef +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; sign extension + +define i32 @__sext_uniform_bool(i1) nounwind readnone alwaysinline { + %r = sext i1 %0 to i32 + ret i32 %r +} + +define @__sext_varying_bool() nounwind readnone alwaysinline { + ifelse(MASK,i1, ` + %se = sext %0 to + ret %se + ', ` + ret %0') +} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; assert @@ -2570,7 +2513,7 @@ check_neighbors: ; up comparing each element to its neighbor on the right. Then see if ; all of those values are true; if so, then all of the elements are equal.. %castvec = bitcast <$1 x $2> %vec to <$1 x $4> - %castvr = call <$1 x $4> @__rotate_int$6(<$1 x $4> %castvec, i32 1) + %castvr = call <$1 x $4> @__rotate_i$6(<$1 x $4> %castvec, i32 1) %vr = bitcast <$1 x $4> %castvr to <$1 x $2> %eq = $5 eq <$1 x $2> %vec, %vr ifelse(MASK,i32, ` diff --git a/stdlib.ispc b/stdlib.ispc index 667c2e0e..532eb49b 100644 --- a/stdlib.ispc +++ b/stdlib.ispc @@ -94,15 +94,15 @@ static inline float broadcast(float v, uniform int i) { } static inline int8 broadcast(int8 v, uniform int i) { - return __broadcast_int8(v, i); + return __broadcast_i8(v, i); } static inline int16 broadcast(int16 v, uniform int i) { - return __broadcast_int16(v, i); + return __broadcast_i16(v, i); } static inline int32 broadcast(int32 v, uniform int i) { - return __broadcast_int32(v, i); + return __broadcast_i32(v, i); } static inline double broadcast(double v, uniform int i) { @@ -110,7 +110,7 @@ static inline double broadcast(double v, uniform int i) { } static inline int64 broadcast(int64 v, uniform int i) { - return __broadcast_int64(v, i); + return __broadcast_i64(v, i); } static inline float rotate(float v, uniform int i) { @@ -118,15 +118,15 @@ static inline float rotate(float v, uniform int i) { } static inline int8 rotate(int8 v, uniform int i) { - return __rotate_int8(v, i); + return __rotate_i8(v, i); } static inline int16 rotate(int16 v, uniform int i) { - return __rotate_int16(v, i); + return __rotate_i16(v, i); } static inline int32 rotate(int32 v, uniform int i) { - return __rotate_int32(v, i); + return __rotate_i32(v, i); } static inline double rotate(double v, uniform int i) { @@ -134,7 +134,7 @@ static inline double rotate(double v, uniform int i) { } static inline int64 rotate(int64 v, uniform int i) { - return __rotate_int64(v, i); + return __rotate_i64(v, i); } static inline float shuffle(float v, int i) { @@ -142,15 +142,15 @@ static inline float shuffle(float v, int i) { } static inline int8 shuffle(int8 v, int i) { - return __shuffle_int8(v, i); + return __shuffle_i8(v, i); } static inline int16 shuffle(int16 v, int i) { - return __shuffle_int16(v, i); + return __shuffle_i16(v, i); } static inline int32 shuffle(int32 v, int i) { - return __shuffle_int32(v, i); + return __shuffle_i32(v, i); } static inline double shuffle(double v, int i) { @@ -158,7 +158,7 @@ static inline double shuffle(double v, int i) { } static inline int64 shuffle(int64 v, int i) { - return __shuffle_int64(v, i); + return __shuffle_i64(v, i); } static inline float shuffle(float v0, float v1, int i) { @@ -166,15 +166,15 @@ static inline float shuffle(float v0, float v1, int i) { } static inline int8 shuffle(int8 v0, int8 
v1, int i) { - return __shuffle2_int8(v0, v1, i); + return __shuffle2_i8(v0, v1, i); } static inline int16 shuffle(int16 v0, int16 v1, int i) { - return __shuffle2_int16(v0, v1, i); + return __shuffle2_i16(v0, v1, i); } static inline int32 shuffle(int32 v0, int32 v1, int i) { - return __shuffle2_int32(v0, v1, i); + return __shuffle2_i32(v0, v1, i); } static inline double shuffle(double v0, double v1, int i) { @@ -182,7 +182,7 @@ static inline double shuffle(double v0, double v1, int i) { } static inline int64 shuffle(int64 v0, int64 v1, int i) { - return __shuffle2_int64(v0, v1, i); + return __shuffle2_i64(v0, v1, i); } // x[i] @@ -470,23 +470,27 @@ soa_to_aos4(float v0, float v1, float v2, float v3, uniform float a[]) { static inline void aos_to_soa3(uniform int32 a[], int32 * uniform v0, int32 * uniform v1, int32 * uniform v2) { - __aos_to_soa3_int32(a, v0, v1, v2); + aos_to_soa3((uniform float * uniform)a, (float * uniform)v0, + (float * uniform)v1, (float * uniform)v2); } static inline void soa_to_aos3(int32 v0, int32 v1, int32 v2, uniform int32 a[]) { - __soa_to_aos3_int32(v0, v1, v2, a); + soa_to_aos3(floatbits(v0), floatbits(v1), floatbits(v2), + (uniform float * uniform)a); } static inline void aos_to_soa4(uniform int32 a[], int32 * uniform v0, int32 * uniform v1, int32 * uniform v2, int32 * uniform v3) { - __aos_to_soa4_int32(a, v0, v1, v2, v3); + aos_to_soa4((uniform float * uniform)a, (float * uniform )v0, + (float * uniform)v1, (float * uniform)v2, (float * uniform)v3); } static inline void soa_to_aos4(int32 v0, int32 v1, int32 v2, int32 v3, uniform int32 a[]) { - __soa_to_aos4_int32(v0, v1, v2, v3, a); + soa_to_aos4(floatbits(v0), floatbits(v1), floatbits(v2), floatbits(v3), + (uniform float * uniform)a); } ///////////////////////////////////////////////////////////////////////////
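For reference, here is a minimal usage sketch of the reworked int32 AoS/SoA
wrappers (illustrative only; the kernel name, export signature, and output
arrays are assumptions and not part of the patch). It deinterleaves an
xyz-packed int32 array through aos_to_soa3(), which now reaches
__aos_to_soa3_float via the bit-preserving casts above rather than a separate
__aos_to_soa3_int32 builtin:

export void deinterleave_xyz(uniform int32 a[],
                             uniform int32 x[], uniform int32 y[],
                             uniform int32 z[]) {
    // a[] holds 3*programCount values laid out as x0 y0 z0 x1 y1 z1 ...
    int32 vx, vy, vz;
    aos_to_soa3(a, &vx, &vy, &vz);
    x[programIndex] = vx;
    y[programIndex] = vy;
    z[programIndex] = vz;
}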