From 15816eb07e6a8701fc27b078e411d191be972602 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Thu, 19 Dec 2013 14:13:55 +0400 Subject: [PATCH] adding __packed_store_active2 to generic targets --- builtins/target-generic-1.ll | 3 ++- builtins/target-generic-common.ll | 2 ++ examples/intrinsics/generic-16.h | 39 ++++++++++++++++++------------- examples/intrinsics/generic-32.h | 39 ++++++++++++++++++------------- examples/intrinsics/generic-64.h | 39 ++++++++++++++++++------------- examples/intrinsics/knc-i1x16.h | 20 +++++++++------- examples/intrinsics/knc-i1x8.h | 19 ++++++++------- examples/intrinsics/knc.h | 5 ++++ examples/intrinsics/sse4.h | 24 +++++++++++++++++++ 9 files changed, 125 insertions(+), 65 deletions(-) diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 910565dd..c43a12a7 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -3,6 +3,7 @@ define(`MASK',`i32') define(`WIDTH',`1') include(`util.m4') +rdrand_decls() ; Define some basics for a 1-wide target stdlib_core() packed_load_and_store() @@ -655,7 +656,7 @@ define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw declare <1 x float> @__svml_sind(<1 x float>) nounwind readnone alwaysinline declare <1 x float> @__svml_asind(<1 x float>) nounwind readnone alwaysinline declare <1 x float> @__svml_cosd(<1 x float>) nounwind readnone alwaysinline -declare void @__svml_sincosd(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline +declare void @__svml_sincosd(<1 x float>, <1 x double> *, <1 x double> *) nounwind readnone alwaysinline declare <1 x float> @__svml_tand(<1 x float>) nounwind readnone alwaysinline declare <1 x float> @__svml_atand(<1 x float>) nounwind readnone alwaysinline declare <1 x float> @__svml_atan2d(<1 x float>, <1 x float>) nounwind readnone alwaysinline diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 92b7a18e..2b2b21c9 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -371,6 +371,8 @@ declare i32 @__packed_load_active(i32 * nocapture, * nocapture, ) nounwind declare i32 @__packed_store_active(i32 * nocapture, %vals, ) nounwind +declare i32 @__packed_store_active2(i32 * nocapture, %vals, + ) nounwind ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index fa794276..0aa8a3f6 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1472,31 +1472,38 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec16_i32 val, return count; } + +static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec16_i32 val, + __vec16_i1 mask) { + int count = 0; + int32_t *ptr_ = ptr; + for (int i = 0; i < 16; ++i) { + *ptr = val.v[i]; + ptr += mask.v & 1; + mask.v = mask.v >> 1; + } + return ptr - ptr_; +} + + static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, __vec16_i32 *val, __vec16_i1 mask) { - int count = 0; - for (int i = 0; i < 16; ++i) { - if ((mask.v & (1 << i)) != 0) { - val->v[i] = *ptr++; - ++count; - } - } - return count; + return __packed_load_active((int32_t *)ptr, val, mask); } static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, __vec16_i32 val, __vec16_i1 mask) { - int count = 0; - for (int i = 0; i < 16; ++i) { - if ((mask.v & (1 << i)) != 0) { - *ptr++ = val.v[i]; - ++count; - } - } - return count; + return __packed_store_active((int32_t *)ptr, val, mask); +} + + +static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, + __vec16_i32 val, + __vec16_i1 mask) { + return __packed_store_active2((int32_t *)ptr, val, mask); } diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 531ed215..924b049d 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -1523,31 +1523,38 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec32_i32 val, return count; } + +static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec32_i32 val, + __vec32_i1 mask) { + int count = 0; + int32_t *ptr_ = ptr; + for (int i = 0; i < 32; ++i) { + *ptr = val.v[i]; + ptr += mask.v & 1; + mask.v = mask.v >> 1; + } + return ptr - ptr_; +} + + static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, __vec32_i32 *val, __vec32_i1 mask) { - int count = 0; - for (int i = 0; i < 32; ++i) { - if ((mask.v & (1 << i)) != 0) { - val->v[i] = *ptr++; - ++count; - } - } - return count; + return __packed_load_active((int32_t *)ptr, val, mask); } static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, __vec32_i32 val, __vec32_i1 mask) { - int count = 0; - for (int i = 0; i < 32; ++i) { - if ((mask.v & (1 << i)) != 0) { - *ptr++ = val.v[i]; - ++count; - } - } - return count; + return __packed_store_active((int32_t *)ptr, val, mask); +} + + +static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, + __vec32_i32 val, + __vec32_i1 mask) { + return __packed_store_active2((int32_t *)ptr, val, mask); } diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index bbeb007a..b1451c96 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -1656,31 +1656,38 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec64_i32 val, return count; } + +static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec64_i32 val, + __vec64_i1 mask) { + int count = 0; + int32_t *ptr_ = ptr; + for (int i = 0; i < 64; ++i) { + *ptr = val.v[i]; + ptr += mask.v & 1; + mask.v = mask.v >> 1; + } + return ptr - ptr_; +} + + static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, __vec64_i32 *val, __vec64_i1 mask) { - int count = 0; - for (int i = 0; i < 64; ++i) { - if ((mask.v & (1ull << i)) != 0) { - val->v[i] = *ptr++; - ++count; - } - } - return count; + return __packed_load_active((int32_t *) ptr, val, mask); } static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, __vec64_i32 val, __vec64_i1 mask) { - int count = 0; - for (int i = 0; i < 64; ++i) { - if ((mask.v & (1ull << i)) != 0) { - *ptr++ = val.v[i]; - ++count; - } - } - return count; + return __packed_store_active((int32_t *) ptr, val, mask); +} + + +static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, + __vec64_i32 val, + __vec64_i1 mask) { + return __packed_store_active2((int32_t *) ptr, val, mask); } diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index ef14d26e..141c47bb 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -2451,20 +2451,24 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, _ return _mm_countbits_32(uint32_t(mask)); } +static FORCEINLINE int32_t __packed_store_active2(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + return __packed_store_active(p, val, mask); +} + static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec16_i32 *val, __vec16_i1 mask) { - __vec16_i32 v = __load<64>(val); - v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - __store<64>(val, v); - return _mm_countbits_32(uint32_t(mask)); + return __packed_load_active((uint32_t *)p, val, mask); } static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec16_i32 val, __vec16_i1 mask) { - _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - return _mm_countbits_32(uint32_t(mask)); + return __packed_store_active((uint32_t *)p, val, mask); +} + +static FORCEINLINE int32_t __packed_store_active2(int32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + return __packed_store_active(p, val, mask); } /////////////////////////////////////////////////////////////////////////// diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h index d7696117..32f39c4a 100644 --- a/examples/intrinsics/knc-i1x8.h +++ b/examples/intrinsics/knc-i1x8.h @@ -2496,20 +2496,23 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec8_i32 val, _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); return _mm_countbits_32(uint32_t(0xFF & mask)); } +static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec4_i32 val, + __vec4_i1 mask) { + return __packed_store_active(ptr, val, mask); +} static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec8_i32 *val, __vec8_i1 mask) { - __vec8_i32 v = __load<64>(val); - v = _mm512_mask_extloadunpacklo_epi32(v, 0xFF & mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v = _mm512_mask_extloadunpackhi_epi32(v, 0xFF & mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - __store<64>(val, v); - return _mm_countbits_32(uint32_t(0xFF & mask)); + return __packed_load_active((uint32_t *)p, val, mask); } static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec8_i32 val, __vec8_i1 mask) { - _mm512_mask_extpackstorelo_epi32(p, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - return _mm_countbits_32(uint32_t(0xFF & mask)); + return __packed_store_active((uint32_t *)p, val, mask); } +static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec4_i32 val, + __vec4_i1 mask) { + return __packed_store_active(ptr, val, mask); +} + #endif /////////////////////////////////////////////////////////////////////////// diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index 8baef8cb..b0782b6e 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1878,6 +1878,11 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, return _mm_countbits_32(uint32_t(mask)); } +static FORCEINLINE int32_t __packed_store_active2(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + return __packed_store_active(p, val, mask); +} + /////////////////////////////////////////////////////////////////////////// // prefetch /////////////////////////////////////////////////////////////////////////// diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 919716be..5dd424d9 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -3798,6 +3798,25 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec4_i32 val, return count; } +static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec4_i32 val, + __vec4_i1 mask) { + int count = 0; + + ptr[count] = _mm_extract_epi32(val.v, 0); + count -= _mm_extract_ps(mask.v, 0); + + ptr[count] = _mm_extract_epi32(val.v, 1); + count -= _mm_extract_ps(mask.v, 1); + + ptr[count] = _mm_extract_epi32(val.v, 2); + count -= _mm_extract_ps(mask.v, 2); + + ptr[count] = _mm_extract_epi32(val.v, 3); + count -= _mm_extract_ps(mask.v, 3); + + return count; +} + static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, __vec4_i32 *val, __vec4_i1 mask) { return __packed_load_active((int32_t *)ptr, val, mask); @@ -3808,6 +3827,11 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, __vec4_i32 val, return __packed_store_active((int32_t *)ptr, val, mask); } +static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec4_i32 val, + __vec4_i1 mask) { + return __packed_store_active2((int32_t *)ptr, val, mask); +} + /////////////////////////////////////////////////////////////////////////// // aos/soa