From 5d1cda986947fa276745b63ac9e3dfc5c5dfb1ce Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 19 Dec 2013 13:05:46 +0400 Subject: [PATCH 01/14] Bumping LLVM 3.4 from rc2 to rc3 in alloy.py --- alloy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alloy.py b/alloy.py index 0055842c..01ce4453 100755 --- a/alloy.py +++ b/alloy.py @@ -89,7 +89,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, if version_LLVM == "trunk": SVN_PATH="trunk" if version_LLVM == "3.4": - SVN_PATH="tags/RELEASE_34/rc2" + SVN_PATH="tags/RELEASE_34/rc3" version_LLVM = "3_4" if version_LLVM == "3.3": SVN_PATH="tags/RELEASE_33/final" From bdeaf7e88cce683ea97ed53e44d446ca18fed233 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 19 Dec 2013 16:53:21 +0400 Subject: [PATCH 02/14] Documentation update for overloaded operators and packed_store_active2() --- docs/ispc.rst | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/docs/ispc.rst b/docs/ispc.rst index 7e76f433..22e7637b 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -48,6 +48,8 @@ Contents: + `Updating ISPC Programs For Changes In ISPC 1.1`_ + `Updating ISPC Programs For Changes In ISPC 1.2`_ + `Updating ISPC Programs For Changes In ISPC 1.3`_ + + `Updating ISPC Programs For Changes In ISPC 1.5.0`_ + + `Updating ISPC Programs For Changes In ISPC 1.5.1`_ * `Getting Started with ISPC`_ @@ -97,6 +99,9 @@ Contents: * `Short Vector Types`_ * `Array Types`_ * `Struct Types`_ + + + `Operators Overloading`_ + * `Structure of Array Types`_ + `Declarations and Initializers`_ @@ -279,6 +284,15 @@ Double precision floating point constants are floating point number with 31.4d-1, 1.d, 1.0d, 1d-2. Note that floating point number without suffix is treated as single precision constant. +Updating ISPC Programs For Changes In ISPC 1.5.1 +------------------------------------------------ + +This release adds support for `Operators Overloading`_, so a word ``operator`` +becomes a keyword and it potentially creates a conflict with existing user +function. Also a new library function packed_store_active2() was introduced, +which also may create a conflict with existing user functions. + + Getting Started with ISPC ========================= @@ -2122,7 +2136,35 @@ above code, the value of ``f[index]`` needs to be able to store a different value of ``Foo::a`` for each program instance. However, a ``varying Foo`` still has only a single ``a`` member, since ``a`` was declared with ``uniform`` variability in the declaration of ``Foo``. Therefore, the -indexing operation in the last line results in an error. +indexing operation in the last line results in an error. + + +Operators Overloading +--------------------- + +ISPC has limited support for overloaded operators for ``struct`` types. Only +binary operators are supported currently, namely they are: ``*, /, %, +, -, >> +and <<``. Operators overloading support is similar to the one in C++ language. +To overload an operator for ``struct S``, you need to declare and implement a +function using keyword ``operator``, which accepts two parameters of type +``struct S`` or ``struct S&`` and returns either of these types. For example: + +:: + + struct S { float re, im;}; + struct S operator*(struct S a, struct S b) { + struct S result; + result.re = a.re * b.re - a.im * b.im; + result.im = a.re * b.im + a.im * b.re; + return result; + } + + void foo(struct S a, struct S b) { + struct S mul = a*b; + print("a.re: %\na.im: %\n", a.re, a.im); + print("b.re: %\nb.im: %\n", b.re, b.im); + print("mul.re: %\nmul.im: %\n", mul.re, mul.im); + } Structure of Array Types @@ -4050,6 +4092,14 @@ They return the total number of values stored. unsigned int val) +There are also ``packed_store_active2()`` functions with exactly the same +signatures and the same semantic except that they may write one extra +element to the output array (but still returning the same value as +``packed_store_active()``). These functions suggest different branch free +implementation on most of supported targets, which usuarly (but not always) +performs better than ``packed_store_active()``. It's advised to test function +performance on user's scenarios on particular target hardware before using it. + As an example of how these functions can be used, the following code shows the use of ``packed_store_active()``. From ca6b3dfa1c513063807133a9ad19f737aeeb6ced Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 19 Dec 2013 16:53:41 +0400 Subject: [PATCH 03/14] Vim syntax support for operators --- contrib/ispc.vim | 2 ++ 1 file changed, 2 insertions(+) diff --git a/contrib/ispc.vim b/contrib/ispc.vim index f3cb413b..11808658 100644 --- a/contrib/ispc.vim +++ b/contrib/ispc.vim @@ -18,6 +18,7 @@ syn keyword ispcConditional cif syn keyword ispcRepeat cdo cfor cwhile syn keyword ispcBuiltin programCount programIndex syn keyword ispcType export uniform varying int8 int16 int32 int64 +syn keyword ispcOperator operator "double precision floating point number, with dot, optional exponent syn match cFloat display contained "\d\+\.\d*d[-+]\=\d*\>" @@ -33,6 +34,7 @@ HiLink ispcConditional Conditional HiLink ispcRepeat Repeat HiLink ispcBuiltin Statement HiLink ispcType Type +HiLink ispcOperator Operator delcommand HiLink let b:current_syntax = "ispc" From 15816eb07e6a8701fc27b078e411d191be972602 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Thu, 19 Dec 2013 14:13:55 +0400 Subject: [PATCH 04/14] adding __packed_store_active2 to generic targets --- builtins/target-generic-1.ll | 3 ++- builtins/target-generic-common.ll | 2 ++ examples/intrinsics/generic-16.h | 39 ++++++++++++++++++------------- examples/intrinsics/generic-32.h | 39 ++++++++++++++++++------------- examples/intrinsics/generic-64.h | 39 ++++++++++++++++++------------- examples/intrinsics/knc-i1x16.h | 20 +++++++++------- examples/intrinsics/knc-i1x8.h | 19 ++++++++------- examples/intrinsics/knc.h | 5 ++++ examples/intrinsics/sse4.h | 24 +++++++++++++++++++ 9 files changed, 125 insertions(+), 65 deletions(-) diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 910565dd..c43a12a7 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -3,6 +3,7 @@ define(`MASK',`i32') define(`WIDTH',`1') include(`util.m4') +rdrand_decls() ; Define some basics for a 1-wide target stdlib_core() packed_load_and_store() @@ -655,7 +656,7 @@ define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw declare <1 x float> @__svml_sind(<1 x float>) nounwind readnone alwaysinline declare <1 x float> @__svml_asind(<1 x float>) nounwind readnone alwaysinline declare <1 x float> @__svml_cosd(<1 x float>) nounwind readnone alwaysinline -declare void @__svml_sincosd(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline +declare void @__svml_sincosd(<1 x float>, <1 x double> *, <1 x double> *) nounwind readnone alwaysinline declare <1 x float> @__svml_tand(<1 x float>) nounwind readnone alwaysinline declare <1 x float> @__svml_atand(<1 x float>) nounwind readnone alwaysinline declare <1 x float> @__svml_atan2d(<1 x float>, <1 x float>) nounwind readnone alwaysinline diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index 92b7a18e..2b2b21c9 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -371,6 +371,8 @@ declare i32 @__packed_load_active(i32 * nocapture, * nocapture, ) nounwind declare i32 @__packed_store_active(i32 * nocapture, %vals, ) nounwind +declare i32 @__packed_store_active2(i32 * nocapture, %vals, + ) nounwind ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index fa794276..0aa8a3f6 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1472,31 +1472,38 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec16_i32 val, return count; } + +static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec16_i32 val, + __vec16_i1 mask) { + int count = 0; + int32_t *ptr_ = ptr; + for (int i = 0; i < 16; ++i) { + *ptr = val.v[i]; + ptr += mask.v & 1; + mask.v = mask.v >> 1; + } + return ptr - ptr_; +} + + static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, __vec16_i32 *val, __vec16_i1 mask) { - int count = 0; - for (int i = 0; i < 16; ++i) { - if ((mask.v & (1 << i)) != 0) { - val->v[i] = *ptr++; - ++count; - } - } - return count; + return __packed_load_active((int32_t *)ptr, val, mask); } static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, __vec16_i32 val, __vec16_i1 mask) { - int count = 0; - for (int i = 0; i < 16; ++i) { - if ((mask.v & (1 << i)) != 0) { - *ptr++ = val.v[i]; - ++count; - } - } - return count; + return __packed_store_active((int32_t *)ptr, val, mask); +} + + +static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, + __vec16_i32 val, + __vec16_i1 mask) { + return __packed_store_active2((int32_t *)ptr, val, mask); } diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 531ed215..924b049d 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -1523,31 +1523,38 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec32_i32 val, return count; } + +static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec32_i32 val, + __vec32_i1 mask) { + int count = 0; + int32_t *ptr_ = ptr; + for (int i = 0; i < 32; ++i) { + *ptr = val.v[i]; + ptr += mask.v & 1; + mask.v = mask.v >> 1; + } + return ptr - ptr_; +} + + static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, __vec32_i32 *val, __vec32_i1 mask) { - int count = 0; - for (int i = 0; i < 32; ++i) { - if ((mask.v & (1 << i)) != 0) { - val->v[i] = *ptr++; - ++count; - } - } - return count; + return __packed_load_active((int32_t *)ptr, val, mask); } static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, __vec32_i32 val, __vec32_i1 mask) { - int count = 0; - for (int i = 0; i < 32; ++i) { - if ((mask.v & (1 << i)) != 0) { - *ptr++ = val.v[i]; - ++count; - } - } - return count; + return __packed_store_active((int32_t *)ptr, val, mask); +} + + +static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, + __vec32_i32 val, + __vec32_i1 mask) { + return __packed_store_active2((int32_t *)ptr, val, mask); } diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index bbeb007a..b1451c96 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -1656,31 +1656,38 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec64_i32 val, return count; } + +static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec64_i32 val, + __vec64_i1 mask) { + int count = 0; + int32_t *ptr_ = ptr; + for (int i = 0; i < 64; ++i) { + *ptr = val.v[i]; + ptr += mask.v & 1; + mask.v = mask.v >> 1; + } + return ptr - ptr_; +} + + static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, __vec64_i32 *val, __vec64_i1 mask) { - int count = 0; - for (int i = 0; i < 64; ++i) { - if ((mask.v & (1ull << i)) != 0) { - val->v[i] = *ptr++; - ++count; - } - } - return count; + return __packed_load_active((int32_t *) ptr, val, mask); } static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, __vec64_i32 val, __vec64_i1 mask) { - int count = 0; - for (int i = 0; i < 64; ++i) { - if ((mask.v & (1ull << i)) != 0) { - *ptr++ = val.v[i]; - ++count; - } - } - return count; + return __packed_store_active((int32_t *) ptr, val, mask); +} + + +static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, + __vec64_i32 val, + __vec64_i1 mask) { + return __packed_store_active2((int32_t *) ptr, val, mask); } diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index ef14d26e..141c47bb 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -2451,20 +2451,24 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, _ return _mm_countbits_32(uint32_t(mask)); } +static FORCEINLINE int32_t __packed_store_active2(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + return __packed_store_active(p, val, mask); +} + static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec16_i32 *val, __vec16_i1 mask) { - __vec16_i32 v = __load<64>(val); - v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - __store<64>(val, v); - return _mm_countbits_32(uint32_t(mask)); + return __packed_load_active((uint32_t *)p, val, mask); } static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec16_i32 val, __vec16_i1 mask) { - _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - return _mm_countbits_32(uint32_t(mask)); + return __packed_store_active((uint32_t *)p, val, mask); +} + +static FORCEINLINE int32_t __packed_store_active2(int32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + return __packed_store_active(p, val, mask); } /////////////////////////////////////////////////////////////////////////// diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h index d7696117..32f39c4a 100644 --- a/examples/intrinsics/knc-i1x8.h +++ b/examples/intrinsics/knc-i1x8.h @@ -2496,20 +2496,23 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec8_i32 val, _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); return _mm_countbits_32(uint32_t(0xFF & mask)); } +static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec4_i32 val, + __vec4_i1 mask) { + return __packed_store_active(ptr, val, mask); +} static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec8_i32 *val, __vec8_i1 mask) { - __vec8_i32 v = __load<64>(val); - v = _mm512_mask_extloadunpacklo_epi32(v, 0xFF & mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v = _mm512_mask_extloadunpackhi_epi32(v, 0xFF & mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - __store<64>(val, v); - return _mm_countbits_32(uint32_t(0xFF & mask)); + return __packed_load_active((uint32_t *)p, val, mask); } static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec8_i32 val, __vec8_i1 mask) { - _mm512_mask_extpackstorelo_epi32(p, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - return _mm_countbits_32(uint32_t(0xFF & mask)); + return __packed_store_active((uint32_t *)p, val, mask); } +static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec4_i32 val, + __vec4_i1 mask) { + return __packed_store_active(ptr, val, mask); +} + #endif /////////////////////////////////////////////////////////////////////////// diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index 8baef8cb..b0782b6e 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1878,6 +1878,11 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, return _mm_countbits_32(uint32_t(mask)); } +static FORCEINLINE int32_t __packed_store_active2(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + return __packed_store_active(p, val, mask); +} + /////////////////////////////////////////////////////////////////////////// // prefetch /////////////////////////////////////////////////////////////////////////// diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 919716be..5dd424d9 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -3798,6 +3798,25 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec4_i32 val, return count; } +static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec4_i32 val, + __vec4_i1 mask) { + int count = 0; + + ptr[count] = _mm_extract_epi32(val.v, 0); + count -= _mm_extract_ps(mask.v, 0); + + ptr[count] = _mm_extract_epi32(val.v, 1); + count -= _mm_extract_ps(mask.v, 1); + + ptr[count] = _mm_extract_epi32(val.v, 2); + count -= _mm_extract_ps(mask.v, 2); + + ptr[count] = _mm_extract_epi32(val.v, 3); + count -= _mm_extract_ps(mask.v, 3); + + return count; +} + static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, __vec4_i32 *val, __vec4_i1 mask) { return __packed_load_active((int32_t *)ptr, val, mask); @@ -3808,6 +3827,11 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, __vec4_i32 val, return __packed_store_active((int32_t *)ptr, val, mask); } +static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec4_i32 val, + __vec4_i1 mask) { + return __packed_store_active2((int32_t *)ptr, val, mask); +} + /////////////////////////////////////////////////////////////////////////// // aos/soa From f802164ccedbb3108affce5494893dbe12133407 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 19 Dec 2013 18:18:30 +0400 Subject: [PATCH 05/14] Fixing some typos in docs and adding operators to language description --- docs/ispc.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/ispc.rst b/docs/ispc.rst index 22e7637b..9464dcde 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -49,7 +49,7 @@ Contents: + `Updating ISPC Programs For Changes In ISPC 1.2`_ + `Updating ISPC Programs For Changes In ISPC 1.3`_ + `Updating ISPC Programs For Changes In ISPC 1.5.0`_ - + `Updating ISPC Programs For Changes In ISPC 1.5.1`_ + + `Updating ISPC Programs For Changes In ISPC 1.6.0`_ * `Getting Started with ISPC`_ @@ -284,7 +284,7 @@ Double precision floating point constants are floating point number with 31.4d-1, 1.d, 1.0d, 1d-2. Note that floating point number without suffix is treated as single precision constant. -Updating ISPC Programs For Changes In ISPC 1.5.1 +Updating ISPC Programs For Changes In ISPC 1.6.0 ------------------------------------------------ This release adds support for `Operators Overloading`_, so a word ``operator`` @@ -1339,6 +1339,7 @@ in C: * Function overloading by parameter type * Hexadecimal floating-point constants * Dynamic memory allocation with ``new`` and ``delete``. +* Limited support for overloaded operators (`Operators Overloading`_). ``ispc`` also adds a number of new features that aren't in C89, C99, or C++: @@ -4096,7 +4097,7 @@ There are also ``packed_store_active2()`` functions with exactly the same signatures and the same semantic except that they may write one extra element to the output array (but still returning the same value as ``packed_store_active()``). These functions suggest different branch free -implementation on most of supported targets, which usuarly (but not always) +implementation on most of supported targets, which usually (but not always) performs better than ``packed_store_active()``. It's advised to test function performance on user's scenarios on particular target hardware before using it. From 5d51f8c7a7380a7ea868dcd4bd31c6a7a3bd44ec Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 19 Dec 2013 21:05:47 +0400 Subject: [PATCH 06/14] Adding release notes for 1.6.0 --- docs/ReleaseNotes.txt | 44 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/docs/ReleaseNotes.txt b/docs/ReleaseNotes.txt index a8575ea0..b7d0bb17 100644 --- a/docs/ReleaseNotes.txt +++ b/docs/ReleaseNotes.txt @@ -1,3 +1,47 @@ +=== v1.6.0 === (19 December 2013) + +A major new version of ISPC with major improvements in performance and +stability. Linux and MacOS binaries are based on patched version of LLVM 3.3, +while Windows version is based on LLVM 3.4rc3. LLVM 3.4 significantly improves +stability on Win32 platform, so we've decided not to wait for official LLVM 3.4 +release. + +The list of the most significant changes is: + +* New avx1-i32x4 target was added. It may play well for you, if you are focused + on integer computations or FP unit in your hardware is 128 bit wide. + +* Support for calculations in double precision was extended with two new + targets avx1.1-i64x4 and avx2-i64x4. + +* Language support for overloaded operators was added. + +* New library shift() function was added, which is similar to rotate(), but is + non-circular. + +* The language was extended to accept 3 dimensional tasking - a syntactic sugar, + which may facilitate programming of some tasks. + +* Regression, which broke --opt=force-aligned-memory is fixed. + +If you are not using pre-built binaries, you may notice the following changes: + +* VS2012/VS2013 are supported. + +* alloy.py (with -b switch) can build LLVM for you on any platform now + (except MacOS 10.9, but we know about the problem and working on it). + This is a preferred way to build LLVM for ISPC, as all required patches for + better performance and stability will automatically apply. + +* LLVM 3.5 (current trunk) is supported. + +There are also multiple fixes for better performance and stability, most +notable are: + +* Fixed performance problem for x2 targets. + +* Fixed a problem with incorrect vzeroupper insertion on AVX target on Win32. + === v1.5.0 === (27 September 2013) A major new version of ISPC with several new targets and important bug fixes. From 7bf64bc4900564c3098b7acd5a90a170d35da626 Mon Sep 17 00:00:00 2001 From: Ilia Filippov Date: Thu, 19 Dec 2013 17:57:29 +0400 Subject: [PATCH 07/14] changes in examples (windows) --- examples/aobench/aobench.vcxproj | 18 ++++++++++++ examples/common.props | 28 +++++++++---------- examples/deferred/deferred_shading.vcxproj | 18 ++++++++++++ examples/mandelbrot/mandelbrot.vcxproj | 18 ++++++++++++ .../mandelbrot_tasks/mandelbrot_tasks.vcxproj | 18 ++++++++++++ examples/noise/noise.vcxproj | 18 ++++++++++++ examples/options/options.vcxproj | 18 ++++++++++++ examples/rt/rt.vcxproj | 18 ++++++++++++ examples/sort/sort.vcxproj | 18 ++++++++++++ examples/stencil/stencil.vcxproj | 18 ++++++++++++ examples/tasksys.cpp | 9 +++++- examples/volume_rendering/volume.vcxproj | 18 ++++++++++++ 12 files changed, 202 insertions(+), 15 deletions(-) diff --git a/examples/aobench/aobench.vcxproj b/examples/aobench/aobench.vcxproj index c46ee41a..298be2cb 100644 --- a/examples/aobench/aobench.vcxproj +++ b/examples/aobench/aobench.vcxproj @@ -1,5 +1,23 @@  + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB} Win32Proj diff --git a/examples/common.props b/examples/common.props index 7bf37005..3769330b 100644 --- a/examples/common.props +++ b/examples/common.props @@ -146,24 +146,24 @@ ispc $(default_targets) - $(TargetDir)$(ISPC_file).obj - $(Target_out);$(TargetDir)$(ISPC_file)_sse2.obj - $(Target_out);$(TargetDir)$(ISPC_file)_sse4.obj - $(Target_out);$(TargetDir)$(ISPC_file)_avx.obj - $(Target_out);$(TargetDir)$(ISPC_file)_avx11.obj - $(Target_out);$(TargetDir)$(ISPC_file)_avx2.obj + $(ISPC_file).obj + $(Target_out);$(ISPC_file)_sse2.obj + $(Target_out);$(ISPC_file)_sse4.obj + $(Target_out);$(ISPC_file)_avx.obj + $(Target_out);$(ISPC_file)_avx11.obj + $(Target_out);$(ISPC_file)_avx2.obj Document - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=$(Target_str) - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=$(Target_str) - $(Target_out);$(TargetDir)%(Filename)_ispc.h - $(Target_out);$(TargetDir)%(Filename)_ispc.h - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=$(Target_str) - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=$(Target_str) - $(Target_out);$(TargetDir)%(Filename)_ispc.h - $(Target_out);$(TargetDir)%(Filename)_ispc.h + $(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) + $(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str) + $(Target_out) + $(Target_out) + $(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) + $(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str) + $(Target_out) + $(Target_out) diff --git a/examples/deferred/deferred_shading.vcxproj b/examples/deferred/deferred_shading.vcxproj index cd361b26..974e870b 100755 --- a/examples/deferred/deferred_shading.vcxproj +++ b/examples/deferred/deferred_shading.vcxproj @@ -1,5 +1,23 @@ + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {87f53c53-957e-4e91-878a-bc27828fb9eb} Win32Proj diff --git a/examples/mandelbrot/mandelbrot.vcxproj b/examples/mandelbrot/mandelbrot.vcxproj index e7703ad0..7a5f6e03 100644 --- a/examples/mandelbrot/mandelbrot.vcxproj +++ b/examples/mandelbrot/mandelbrot.vcxproj @@ -1,5 +1,23 @@  + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1} Win32Proj diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj index f8b8cfcb..113fc4e8 100644 --- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj @@ -1,5 +1,23 @@ + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {E80DA7D4-AB22-4648-A068-327307156BE6} Win32Proj diff --git a/examples/noise/noise.vcxproj b/examples/noise/noise.vcxproj index 7adc57f3..ff3953ae 100644 --- a/examples/noise/noise.vcxproj +++ b/examples/noise/noise.vcxproj @@ -1,5 +1,23 @@ + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD} Win32Proj diff --git a/examples/options/options.vcxproj b/examples/options/options.vcxproj index af336aa1..d48ac8bc 100644 --- a/examples/options/options.vcxproj +++ b/examples/options/options.vcxproj @@ -1,5 +1,23 @@ + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE} Win32Proj diff --git a/examples/rt/rt.vcxproj b/examples/rt/rt.vcxproj index ea34de56..00b6dd3a 100644 --- a/examples/rt/rt.vcxproj +++ b/examples/rt/rt.vcxproj @@ -1,5 +1,23 @@ + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9} Win32Proj diff --git a/examples/sort/sort.vcxproj b/examples/sort/sort.vcxproj index 43f2b439..b0bdc63d 100644 --- a/examples/sort/sort.vcxproj +++ b/examples/sort/sort.vcxproj @@ -1,5 +1,23 @@  + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2} Win32Proj diff --git a/examples/stencil/stencil.vcxproj b/examples/stencil/stencil.vcxproj index b5f5bb22..fd8564aa 100644 --- a/examples/stencil/stencil.vcxproj +++ b/examples/stencil/stencil.vcxproj @@ -1,5 +1,23 @@ + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {2ef070a1-f62f-4e6a-944b-88d140945c3c} Win32Proj diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp index cfe0b17b..77269f9f 100644 --- a/examples/tasksys.cpp +++ b/examples/tasksys.cpp @@ -175,6 +175,9 @@ typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount, int taskCount0, int taskCount1, int taskCount2); // Small structure used to hold the data for each task +#ifdef _MSC_VER +__declspec(align(16)) +#endif struct TaskInfo { TaskFuncType func; void *data; @@ -200,7 +203,11 @@ struct TaskInfo { int taskCount1() const { return taskCount3d[1]; } int taskCount2() const { return taskCount3d[2]; } TaskInfo() { assert(sizeof(TaskInfo) % 32 == 0); } -} __attribute__((aligned(32))); +} +#ifndef _MSC_VER +__attribute__((aligned(32))); +#endif +; // ispc expects these functions to have C linkage / not be mangled extern "C" { diff --git a/examples/volume_rendering/volume.vcxproj b/examples/volume_rendering/volume.vcxproj index cc738a7e..a1fea5f1 100644 --- a/examples/volume_rendering/volume.vcxproj +++ b/examples/volume_rendering/volume.vcxproj @@ -1,5 +1,23 @@ + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {dee5733a-e93e-449d-9114-9bffcaeb4df9} Win32Proj From f936269a1e3898c0436b1589a093b956a26222af Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 19 Dec 2013 21:14:22 +0400 Subject: [PATCH 08/14] News update for 1.6.0 --- docs/news.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/news.rst b/docs/news.rst index 7d78a662..6a805e48 100644 --- a/docs/news.rst +++ b/docs/news.rst @@ -2,6 +2,16 @@ ispc News ========= +ispc 1.6.0 is Released +---------------------- + +A major update of ``ispc`` has been released. The main focus is on improved +performance and stability. Several new targets were added. There are also +a number of language and library extensions. Released binaries are based on +patched LLVM 3.3 on Linux and MacOS and LLVM 3.4rc3 on Windows. Please refer +to Release Notes for complete set of changes. + + ispc 1.5.0 is Released ---------------------- From 040605a83c3b92b0a3016c5185d86a37c0d0b35b Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 19 Dec 2013 21:17:42 +0400 Subject: [PATCH 09/14] Bumping up ispc version to 1.6.0 --- doxygen.cfg | 2 +- ispc.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doxygen.cfg b/doxygen.cfg index a0ad3176..a1a0b91b 100644 --- a/doxygen.cfg +++ b/doxygen.cfg @@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler" # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 1.5.1dev +PROJECT_NUMBER = 1.6.0 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. diff --git a/ispc.h b/ispc.h index 4b333861..b9b7db17 100644 --- a/ispc.h +++ b/ispc.h @@ -38,7 +38,7 @@ #ifndef ISPC_H #define ISPC_H -#define ISPC_VERSION "1.5.1dev" +#define ISPC_VERSION "1.6.0" #if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5) #error "Only LLVM 3.1, 3.2, 3.3, 3.4 and the 3.5 development branch are supported" From 799e476b484d090263260a3202a3473ce85a21c7 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Thu, 19 Dec 2013 22:29:02 +0400 Subject: [PATCH 10/14] Bumping ISPC version to 1.6.1dev --- doxygen.cfg | 2 +- ispc.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doxygen.cfg b/doxygen.cfg index a1a0b91b..9a8f88e5 100644 --- a/doxygen.cfg +++ b/doxygen.cfg @@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler" # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 1.6.0 +PROJECT_NUMBER = 1.6.1dev # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. diff --git a/ispc.h b/ispc.h index b9b7db17..88eb8353 100644 --- a/ispc.h +++ b/ispc.h @@ -38,7 +38,7 @@ #ifndef ISPC_H #define ISPC_H -#define ISPC_VERSION "1.6.0" +#define ISPC_VERSION "1.6.1dev" #if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5) #error "Only LLVM 3.1, 3.2, 3.3, 3.4 and the 3.5 development branch are supported" From 9f933b500b59192194e212333f416e90596742d4 Mon Sep 17 00:00:00 2001 From: "james.brodman" Date: Fri, 20 Dec 2013 16:45:27 -0500 Subject: [PATCH 11/14] Add missing __cast_sext(__vec16_i32,__vec16_i1) --- examples/intrinsics/knc.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index b0782b6e..0077ad88 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1260,6 +1260,13 @@ static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i3 return __vec16_i64(val.v, _mm512_setzero_epi32()); } +static FORCEINLINE __vec16_i32 __cast_sext(const __vec16_i32 &, const __vec16_i1 &val) +{ + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(-1); + return _mm512_mask_mov_epi32(ret, val, one); +} + static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val) { __vec16_i32 ret = _mm512_setzero_epi32(); From 949984db185a832acafad2565326c2bdf0def4de Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Mon, 23 Dec 2013 16:31:33 +0400 Subject: [PATCH 12/14] Don't do sext+and optimization for generic targets --- opt.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/opt.cpp b/opt.cpp index c75d4225..ff7ee410 100644 --- a/opt.cpp +++ b/opt.cpp @@ -5153,6 +5153,11 @@ FixBooleanSelectPass::runOnFunction(llvm::Function &F) { // LLVM 3.3 only #if defined(LLVM_3_3) + // Don't optimize generic targets. + if (g->target->getISA() == Target::GENERIC) { + return false; + } + for (llvm::Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { llvm::BasicBlock* bb = &*I; From 34a588511f53517d9ee9a559f1fef492f2c378f5 Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 24 Dec 2013 18:38:25 +0400 Subject: [PATCH 13/14] Checkout and install with clang standard library headers on MacOS 10.9 --- alloy.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/alloy.py b/alloy.py index 01ce4453..ee7a1acf 100755 --- a/alloy.py +++ b/alloy.py @@ -129,8 +129,23 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, try_do_LLVM("load clang from http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " ", "svn co " + revision + " http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " clang", from_validation) + os.chdir("..") + if current_OS == "MacOS" and int(current_OS_version.split(".")[0]) >= 13: + # Starting with MacOS 10.9 Maverics, the system doesn't contain headers for standard C++ library and + # the default library is libc++, bit libstdc++. The headers are part of XCode now. But we are checking out + # headers as part of LLVM source tree, so they will be installed in clang location and clang will be able + # to find them. Though they may not match to the library installed in the system, but seems that this should + # not happen. + # Note, that we can also build a libc++ library, but it must be on system default location or should be passed + # to the linker explicitly (either through command line or environment variables). So we are not doing it + # currently to make the build process easier. + os.chdir("projects") + try_do_LLVM("load libcxx http://llvm.org/svn/llvm-project/libcxx/" + SVN_PATH + " ", + "svn co " + revision + " http://llvm.org/svn/llvm-project/libcxx/" + SVN_PATH + " libcxx", + from_validation) + os.chdir("..") if extra == True: - os.chdir("./clang/tools") + os.chdir("tools/clang/tools") try_do_LLVM("load extra clang extra tools ", "svn co " + revision + " http://llvm.org/svn/llvm-project/clang-tools-extra/" + SVN_PATH + " extra", from_validation) @@ -138,7 +153,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, try_do_LLVM("load extra clang compiler-rt ", "svn co " + revision + " http://llvm.org/svn/llvm-project/compiler-rt/" + SVN_PATH + " compiler-rt", from_validation) - os.chdir("../") + os.chdir("..") else: tar = tarball.split(" ") os.makedirs(LLVM_SRC) @@ -563,6 +578,8 @@ def validation_run(only, only_targets, reference_branch, number, notify, update, def Main(): global current_OS + global current_OS_version + current_OS_version = platform.release() if (platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system()) == True: current_OS = "Windows" else: From a69c4527a1b30a783b4fc90d617f56c74fd7d6af Mon Sep 17 00:00:00 2001 From: Dmitry Babokin Date: Tue, 24 Dec 2013 18:39:17 +0400 Subject: [PATCH 14/14] Bumping up 3.4 version from rc3 to final --- alloy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alloy.py b/alloy.py index ee7a1acf..57d8df1e 100755 --- a/alloy.py +++ b/alloy.py @@ -89,7 +89,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, if version_LLVM == "trunk": SVN_PATH="trunk" if version_LLVM == "3.4": - SVN_PATH="tags/RELEASE_34/rc3" + SVN_PATH="tags/RELEASE_34/final" version_LLVM = "3_4" if version_LLVM == "3.3": SVN_PATH="tags/RELEASE_33/final"