diff --git a/alloy.py b/alloy.py index 0055842c..57d8df1e 100755 --- a/alloy.py +++ b/alloy.py @@ -89,7 +89,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, if version_LLVM == "trunk": SVN_PATH="trunk" if version_LLVM == "3.4": - SVN_PATH="tags/RELEASE_34/rc2" + SVN_PATH="tags/RELEASE_34/final" version_LLVM = "3_4" if version_LLVM == "3.3": SVN_PATH="tags/RELEASE_33/final" @@ -129,8 +129,23 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, try_do_LLVM("load clang from http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " ", "svn co " + revision + " http://llvm.org/svn/llvm-project/cfe/" + SVN_PATH + " clang", from_validation) + os.chdir("..") + if current_OS == "MacOS" and int(current_OS_version.split(".")[0]) >= 13: + # Starting with MacOS 10.9 Maverics, the system doesn't contain headers for standard C++ library and + # the default library is libc++, bit libstdc++. The headers are part of XCode now. But we are checking out + # headers as part of LLVM source tree, so they will be installed in clang location and clang will be able + # to find them. Though they may not match to the library installed in the system, but seems that this should + # not happen. + # Note, that we can also build a libc++ library, but it must be on system default location or should be passed + # to the linker explicitly (either through command line or environment variables). So we are not doing it + # currently to make the build process easier. + os.chdir("projects") + try_do_LLVM("load libcxx http://llvm.org/svn/llvm-project/libcxx/" + SVN_PATH + " ", + "svn co " + revision + " http://llvm.org/svn/llvm-project/libcxx/" + SVN_PATH + " libcxx", + from_validation) + os.chdir("..") if extra == True: - os.chdir("./clang/tools") + os.chdir("tools/clang/tools") try_do_LLVM("load extra clang extra tools ", "svn co " + revision + " http://llvm.org/svn/llvm-project/clang-tools-extra/" + SVN_PATH + " extra", from_validation) @@ -138,7 +153,7 @@ def build_LLVM(version_LLVM, revision, folder, tarball, debug, selfbuild, extra, try_do_LLVM("load extra clang compiler-rt ", "svn co " + revision + " http://llvm.org/svn/llvm-project/compiler-rt/" + SVN_PATH + " compiler-rt", from_validation) - os.chdir("../") + os.chdir("..") else: tar = tarball.split(" ") os.makedirs(LLVM_SRC) @@ -563,6 +578,8 @@ def validation_run(only, only_targets, reference_branch, number, notify, update, def Main(): global current_OS + global current_OS_version + current_OS_version = platform.release() if (platform.system() == 'Windows' or 'CYGWIN_NT' in platform.system()) == True: current_OS = "Windows" else: diff --git a/builtins/target-generic-1.ll b/builtins/target-generic-1.ll index 43609b33..c43a12a7 100644 --- a/builtins/target-generic-1.ll +++ b/builtins/target-generic-1.ll @@ -3,13 +3,13 @@ define(`MASK',`i32') define(`WIDTH',`1') include(`util.m4') +rdrand_decls() ; Define some basics for a 1-wide target stdlib_core() packed_load_and_store() scans() int64minmax() aossoa() -rdrand_decls() ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; masked store @@ -653,10 +653,121 @@ define <1 x float> @__rsqrt_varying_float(<1 x float> %v) nounwind readonly alw ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; svml stuff -include(`svml.m4') -svml_stubs(float,f,WIDTH) -svml_stubs(double,d,WIDTH) +declare <1 x float> @__svml_sind(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_asind(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_cosd(<1 x float>) nounwind readnone alwaysinline +declare void @__svml_sincosd(<1 x float>, <1 x double> *, <1 x double> *) nounwind readnone alwaysinline +declare <1 x float> @__svml_tand(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_atand(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_atan2d(<1 x float>, <1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_expd(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_logd(<1 x float>) nounwind readnone alwaysinline +declare <1 x float> @__svml_powd(<1 x float>, <1 x float>) nounwind readnone alwaysinline +define <1 x float> @__svml_sinf(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_sinf4(<1 x float> %0) + ;ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm.sin.f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + unary1to1(float,@llvm.sin.f32) + +} + +define <1 x float> @__svml_asinf(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_asinf4(<1 x float> %0) + ;ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm.asin.f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + unary1to1(float,@llvm.asin.f32) + +} + +define <1 x float> @__svml_cosf(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_cosf4(<1 x float> %0) + ;ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm.cos.f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + unary1to1(float, @llvm.cos.f32) + +} + +define void @__svml_sincosf(<1 x float>, <1 x float> *, <1 x float> *) nounwind readnone alwaysinline { +; %s = call <1 x float> @__svml_sincosf4(<1 x float> * %2, <1 x float> %0) +; store <1 x float> %s, <1 x float> * %1 +; ret void + %sin = call <1 x float> @__svml_sinf(<1 x float> %0) + %cos = call <1 x float> @__svml_cosf(<1 x float> %0) + store <1 x float> %sin, <1 x float> * %1 + store <1 x float> %cos, <1 x float> * %2 + ret void +} + +define <1 x float> @__svml_tanf(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_tanf4(<1 x float> %0) + ;ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm_tan_f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + ;unasry1to1(float, @llvm.tan.f32) + ; UNSUPPORTED! + ret <1 x float > %0 +} + +define <1 x float> @__svml_atanf(<1 x float>) nounwind readnone alwaysinline { +; %ret = call <1 x float> @__svml_atanf4(<1 x float> %0) +; ret <1 x float> %ret + ;%r = extractelement <1 x float> %0, i32 0 + ;%s = call float @llvm_atan_f32(float %r) + ;%rv = insertelement <1 x float> undef, float %r, i32 0 + ;ret <1 x float> %rv + ;unsary1to1(float,@llvm.atan.f32) + ;UNSUPPORTED! + ret <1 x float > %0 + +} + +define <1 x float> @__svml_atan2f(<1 x float>, <1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_atan2f4(<1 x float> %0, <1 x float> %1) + ;ret <1 x float> %ret + ;%y = extractelement <1 x float> %0, i32 0 + ;%x = extractelement <1 x float> %1, i32 0 + ;%q = fdiv float %y, %x + ;%a = call float @llvm.atan.f32 (float %q) + ;%rv = insertelement <1 x float> undef, float %a, i32 0 + ;ret <1 x float> %rv + ; UNSUPPORTED! + ret <1 x float > %0 +} + +define <1 x float> @__svml_expf(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_expf4(<1 x float> %0) + ;ret <1 x float> %ret + unary1to1(float, @llvm.exp.f32) +} + +define <1 x float> @__svml_logf(<1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_logf4(<1 x float> %0) + ;ret <1 x float> %ret + unary1to1(float, @llvm.log.f32) +} + +define <1 x float> @__svml_powf(<1 x float>, <1 x float>) nounwind readnone alwaysinline { + ;%ret = call <1 x float> @__svml_powf4(<1 x float> %0, <1 x float> %1) + ;ret <1 x float> %ret + %r = extractelement <1 x float> %0, i32 0 + %e = extractelement <1 x float> %1, i32 0 + %s = call float @llvm.pow.f32(float %r,float %e) + %rv = insertelement <1 x float> undef, float %s, i32 0 + ret <1 x float> %rv + +} ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; float min/max @@ -881,14 +992,3 @@ declare @__float_to_half_varying( %v) nounwind read define_avgs() -;;;;;;; nvptx64 - -declare i32 @__tid_x() nounwind readnone alwaysinline -declare i32 @__warpsize() nounwind readnone alwaysinline -declare i32 @__ctaid_x() nounwind readnone alwaysinline -declare i32 @__ctaid_y() nounwind readnone alwaysinline -declare i32 @__ctaid_z() nounwind readnone alwaysinline -declare i32 @__nctaid_x() nounwind readnone alwaysinline -declare i32 @__nctaid_y() nounwind readnone alwaysinline -declare i32 @__nctaid_z() nounwind readnone alwaysinline - diff --git a/builtins/target-generic-common.ll b/builtins/target-generic-common.ll index e57b9036..0a51f44f 100644 --- a/builtins/target-generic-common.ll +++ b/builtins/target-generic-common.ll @@ -371,6 +371,8 @@ declare i32 @__packed_load_active(i32 * nocapture, * nocapture, ) nounwind declare i32 @__packed_store_active(i32 * nocapture, %vals, ) nounwind +declare i32 @__packed_store_active2(i32 * nocapture, %vals, + ) nounwind ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/contrib/ispc.vim b/contrib/ispc.vim index f3cb413b..11808658 100644 --- a/contrib/ispc.vim +++ b/contrib/ispc.vim @@ -18,6 +18,7 @@ syn keyword ispcConditional cif syn keyword ispcRepeat cdo cfor cwhile syn keyword ispcBuiltin programCount programIndex syn keyword ispcType export uniform varying int8 int16 int32 int64 +syn keyword ispcOperator operator "double precision floating point number, with dot, optional exponent syn match cFloat display contained "\d\+\.\d*d[-+]\=\d*\>" @@ -33,6 +34,7 @@ HiLink ispcConditional Conditional HiLink ispcRepeat Repeat HiLink ispcBuiltin Statement HiLink ispcType Type +HiLink ispcOperator Operator delcommand HiLink let b:current_syntax = "ispc" diff --git a/docs/ReleaseNotes.txt b/docs/ReleaseNotes.txt index a8575ea0..b7d0bb17 100644 --- a/docs/ReleaseNotes.txt +++ b/docs/ReleaseNotes.txt @@ -1,3 +1,47 @@ +=== v1.6.0 === (19 December 2013) + +A major new version of ISPC with major improvements in performance and +stability. Linux and MacOS binaries are based on patched version of LLVM 3.3, +while Windows version is based on LLVM 3.4rc3. LLVM 3.4 significantly improves +stability on Win32 platform, so we've decided not to wait for official LLVM 3.4 +release. + +The list of the most significant changes is: + +* New avx1-i32x4 target was added. It may play well for you, if you are focused + on integer computations or FP unit in your hardware is 128 bit wide. + +* Support for calculations in double precision was extended with two new + targets avx1.1-i64x4 and avx2-i64x4. + +* Language support for overloaded operators was added. + +* New library shift() function was added, which is similar to rotate(), but is + non-circular. + +* The language was extended to accept 3 dimensional tasking - a syntactic sugar, + which may facilitate programming of some tasks. + +* Regression, which broke --opt=force-aligned-memory is fixed. + +If you are not using pre-built binaries, you may notice the following changes: + +* VS2012/VS2013 are supported. + +* alloy.py (with -b switch) can build LLVM for you on any platform now + (except MacOS 10.9, but we know about the problem and working on it). + This is a preferred way to build LLVM for ISPC, as all required patches for + better performance and stability will automatically apply. + +* LLVM 3.5 (current trunk) is supported. + +There are also multiple fixes for better performance and stability, most +notable are: + +* Fixed performance problem for x2 targets. + +* Fixed a problem with incorrect vzeroupper insertion on AVX target on Win32. + === v1.5.0 === (27 September 2013) A major new version of ISPC with several new targets and important bug fixes. diff --git a/docs/ispc.rst b/docs/ispc.rst index 7e76f433..9464dcde 100644 --- a/docs/ispc.rst +++ b/docs/ispc.rst @@ -48,6 +48,8 @@ Contents: + `Updating ISPC Programs For Changes In ISPC 1.1`_ + `Updating ISPC Programs For Changes In ISPC 1.2`_ + `Updating ISPC Programs For Changes In ISPC 1.3`_ + + `Updating ISPC Programs For Changes In ISPC 1.5.0`_ + + `Updating ISPC Programs For Changes In ISPC 1.6.0`_ * `Getting Started with ISPC`_ @@ -97,6 +99,9 @@ Contents: * `Short Vector Types`_ * `Array Types`_ * `Struct Types`_ + + + `Operators Overloading`_ + * `Structure of Array Types`_ + `Declarations and Initializers`_ @@ -279,6 +284,15 @@ Double precision floating point constants are floating point number with 31.4d-1, 1.d, 1.0d, 1d-2. Note that floating point number without suffix is treated as single precision constant. +Updating ISPC Programs For Changes In ISPC 1.6.0 +------------------------------------------------ + +This release adds support for `Operators Overloading`_, so a word ``operator`` +becomes a keyword and it potentially creates a conflict with existing user +function. Also a new library function packed_store_active2() was introduced, +which also may create a conflict with existing user functions. + + Getting Started with ISPC ========================= @@ -1325,6 +1339,7 @@ in C: * Function overloading by parameter type * Hexadecimal floating-point constants * Dynamic memory allocation with ``new`` and ``delete``. +* Limited support for overloaded operators (`Operators Overloading`_). ``ispc`` also adds a number of new features that aren't in C89, C99, or C++: @@ -2122,7 +2137,35 @@ above code, the value of ``f[index]`` needs to be able to store a different value of ``Foo::a`` for each program instance. However, a ``varying Foo`` still has only a single ``a`` member, since ``a`` was declared with ``uniform`` variability in the declaration of ``Foo``. Therefore, the -indexing operation in the last line results in an error. +indexing operation in the last line results in an error. + + +Operators Overloading +--------------------- + +ISPC has limited support for overloaded operators for ``struct`` types. Only +binary operators are supported currently, namely they are: ``*, /, %, +, -, >> +and <<``. Operators overloading support is similar to the one in C++ language. +To overload an operator for ``struct S``, you need to declare and implement a +function using keyword ``operator``, which accepts two parameters of type +``struct S`` or ``struct S&`` and returns either of these types. For example: + +:: + + struct S { float re, im;}; + struct S operator*(struct S a, struct S b) { + struct S result; + result.re = a.re * b.re - a.im * b.im; + result.im = a.re * b.im + a.im * b.re; + return result; + } + + void foo(struct S a, struct S b) { + struct S mul = a*b; + print("a.re: %\na.im: %\n", a.re, a.im); + print("b.re: %\nb.im: %\n", b.re, b.im); + print("mul.re: %\nmul.im: %\n", mul.re, mul.im); + } Structure of Array Types @@ -4050,6 +4093,14 @@ They return the total number of values stored. unsigned int val) +There are also ``packed_store_active2()`` functions with exactly the same +signatures and the same semantic except that they may write one extra +element to the output array (but still returning the same value as +``packed_store_active()``). These functions suggest different branch free +implementation on most of supported targets, which usually (but not always) +performs better than ``packed_store_active()``. It's advised to test function +performance on user's scenarios on particular target hardware before using it. + As an example of how these functions can be used, the following code shows the use of ``packed_store_active()``. diff --git a/docs/news.rst b/docs/news.rst index 7d78a662..6a805e48 100644 --- a/docs/news.rst +++ b/docs/news.rst @@ -2,6 +2,16 @@ ispc News ========= +ispc 1.6.0 is Released +---------------------- + +A major update of ``ispc`` has been released. The main focus is on improved +performance and stability. Several new targets were added. There are also +a number of language and library extensions. Released binaries are based on +patched LLVM 3.3 on Linux and MacOS and LLVM 3.4rc3 on Windows. Please refer +to Release Notes for complete set of changes. + + ispc 1.5.0 is Released ---------------------- diff --git a/doxygen.cfg b/doxygen.cfg index a0ad3176..9a8f88e5 100644 --- a/doxygen.cfg +++ b/doxygen.cfg @@ -31,7 +31,7 @@ PROJECT_NAME = "Intel SPMD Program Compiler" # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 1.5.1dev +PROJECT_NUMBER = 1.6.1dev # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. diff --git a/examples/aobench/aobench.vcxproj b/examples/aobench/aobench.vcxproj index c46ee41a..298be2cb 100644 --- a/examples/aobench/aobench.vcxproj +++ b/examples/aobench/aobench.vcxproj @@ -1,5 +1,23 @@  + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {F29204CA-19DF-4F3C-87D5-03F4EEDAAFEB} Win32Proj diff --git a/examples/common.props b/examples/common.props index 7bf37005..3769330b 100644 --- a/examples/common.props +++ b/examples/common.props @@ -146,24 +146,24 @@ ispc $(default_targets) - $(TargetDir)$(ISPC_file).obj - $(Target_out);$(TargetDir)$(ISPC_file)_sse2.obj - $(Target_out);$(TargetDir)$(ISPC_file)_sse4.obj - $(Target_out);$(TargetDir)$(ISPC_file)_avx.obj - $(Target_out);$(TargetDir)$(ISPC_file)_avx11.obj - $(Target_out);$(TargetDir)$(ISPC_file)_avx2.obj + $(ISPC_file).obj + $(Target_out);$(ISPC_file)_sse2.obj + $(Target_out);$(ISPC_file)_sse4.obj + $(Target_out);$(ISPC_file)_avx.obj + $(Target_out);$(ISPC_file)_avx11.obj + $(Target_out);$(ISPC_file)_avx2.obj Document - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=$(Target_str) - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=$(Target_str) - $(Target_out);$(TargetDir)%(Filename)_ispc.h - $(Target_out);$(TargetDir)%(Filename)_ispc.h - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --arch=x86 --target=$(Target_str) - $(ISPC_compiler) -O2 %(Filename).ispc -o $(TargetDir)%(Filename).obj -h $(TargetDir)%(Filename)_ispc.h --target=$(Target_str) - $(Target_out);$(TargetDir)%(Filename)_ispc.h - $(Target_out);$(TargetDir)%(Filename)_ispc.h + $(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) + $(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str) + $(Target_out) + $(Target_out) + $(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --arch=x86 --target=$(Target_str) + $(ISPC_compiler) -O2 %(Filename).ispc -o %(Filename).obj -h %(Filename)_ispc.h --target=$(Target_str) + $(Target_out) + $(Target_out) diff --git a/examples/deferred/deferred_shading.vcxproj b/examples/deferred/deferred_shading.vcxproj index cd361b26..974e870b 100755 --- a/examples/deferred/deferred_shading.vcxproj +++ b/examples/deferred/deferred_shading.vcxproj @@ -1,5 +1,23 @@ + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {87f53c53-957e-4e91-878a-bc27828fb9eb} Win32Proj diff --git a/examples/intrinsics/generic-16.h b/examples/intrinsics/generic-16.h index fa794276..0aa8a3f6 100644 --- a/examples/intrinsics/generic-16.h +++ b/examples/intrinsics/generic-16.h @@ -1472,31 +1472,38 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec16_i32 val, return count; } + +static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec16_i32 val, + __vec16_i1 mask) { + int count = 0; + int32_t *ptr_ = ptr; + for (int i = 0; i < 16; ++i) { + *ptr = val.v[i]; + ptr += mask.v & 1; + mask.v = mask.v >> 1; + } + return ptr - ptr_; +} + + static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, __vec16_i32 *val, __vec16_i1 mask) { - int count = 0; - for (int i = 0; i < 16; ++i) { - if ((mask.v & (1 << i)) != 0) { - val->v[i] = *ptr++; - ++count; - } - } - return count; + return __packed_load_active((int32_t *)ptr, val, mask); } static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, __vec16_i32 val, __vec16_i1 mask) { - int count = 0; - for (int i = 0; i < 16; ++i) { - if ((mask.v & (1 << i)) != 0) { - *ptr++ = val.v[i]; - ++count; - } - } - return count; + return __packed_store_active((int32_t *)ptr, val, mask); +} + + +static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, + __vec16_i32 val, + __vec16_i1 mask) { + return __packed_store_active2((int32_t *)ptr, val, mask); } diff --git a/examples/intrinsics/generic-32.h b/examples/intrinsics/generic-32.h index 531ed215..924b049d 100644 --- a/examples/intrinsics/generic-32.h +++ b/examples/intrinsics/generic-32.h @@ -1523,31 +1523,38 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec32_i32 val, return count; } + +static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec32_i32 val, + __vec32_i1 mask) { + int count = 0; + int32_t *ptr_ = ptr; + for (int i = 0; i < 32; ++i) { + *ptr = val.v[i]; + ptr += mask.v & 1; + mask.v = mask.v >> 1; + } + return ptr - ptr_; +} + + static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, __vec32_i32 *val, __vec32_i1 mask) { - int count = 0; - for (int i = 0; i < 32; ++i) { - if ((mask.v & (1 << i)) != 0) { - val->v[i] = *ptr++; - ++count; - } - } - return count; + return __packed_load_active((int32_t *)ptr, val, mask); } static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, __vec32_i32 val, __vec32_i1 mask) { - int count = 0; - for (int i = 0; i < 32; ++i) { - if ((mask.v & (1 << i)) != 0) { - *ptr++ = val.v[i]; - ++count; - } - } - return count; + return __packed_store_active((int32_t *)ptr, val, mask); +} + + +static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, + __vec32_i32 val, + __vec32_i1 mask) { + return __packed_store_active2((int32_t *)ptr, val, mask); } diff --git a/examples/intrinsics/generic-64.h b/examples/intrinsics/generic-64.h index bbeb007a..b1451c96 100644 --- a/examples/intrinsics/generic-64.h +++ b/examples/intrinsics/generic-64.h @@ -1656,31 +1656,38 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec64_i32 val, return count; } + +static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec64_i32 val, + __vec64_i1 mask) { + int count = 0; + int32_t *ptr_ = ptr; + for (int i = 0; i < 64; ++i) { + *ptr = val.v[i]; + ptr += mask.v & 1; + mask.v = mask.v >> 1; + } + return ptr - ptr_; +} + + static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, __vec64_i32 *val, __vec64_i1 mask) { - int count = 0; - for (int i = 0; i < 64; ++i) { - if ((mask.v & (1ull << i)) != 0) { - val->v[i] = *ptr++; - ++count; - } - } - return count; + return __packed_load_active((int32_t *) ptr, val, mask); } static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, __vec64_i32 val, __vec64_i1 mask) { - int count = 0; - for (int i = 0; i < 64; ++i) { - if ((mask.v & (1ull << i)) != 0) { - *ptr++ = val.v[i]; - ++count; - } - } - return count; + return __packed_store_active((int32_t *) ptr, val, mask); +} + + +static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, + __vec64_i32 val, + __vec64_i1 mask) { + return __packed_store_active2((int32_t *) ptr, val, mask); } diff --git a/examples/intrinsics/knc-i1x16.h b/examples/intrinsics/knc-i1x16.h index ef14d26e..141c47bb 100644 --- a/examples/intrinsics/knc-i1x16.h +++ b/examples/intrinsics/knc-i1x16.h @@ -2451,20 +2451,24 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, _ return _mm_countbits_32(uint32_t(mask)); } +static FORCEINLINE int32_t __packed_store_active2(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + return __packed_store_active(p, val, mask); +} + static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec16_i32 *val, __vec16_i1 mask) { - __vec16_i32 v = __load<64>(val); - v = _mm512_mask_extloadunpacklo_epi32(v, mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v = _mm512_mask_extloadunpackhi_epi32(v, mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - __store<64>(val, v); - return _mm_countbits_32(uint32_t(mask)); + return __packed_load_active((uint32_t *)p, val, mask); } static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec16_i32 val, __vec16_i1 mask) { - _mm512_mask_extpackstorelo_epi32(p, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - return _mm_countbits_32(uint32_t(mask)); + return __packed_store_active((uint32_t *)p, val, mask); +} + +static FORCEINLINE int32_t __packed_store_active2(int32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + return __packed_store_active(p, val, mask); } /////////////////////////////////////////////////////////////////////////// diff --git a/examples/intrinsics/knc-i1x8.h b/examples/intrinsics/knc-i1x8.h index d7696117..32f39c4a 100644 --- a/examples/intrinsics/knc-i1x8.h +++ b/examples/intrinsics/knc-i1x8.h @@ -2496,20 +2496,23 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec8_i32 val, _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); return _mm_countbits_32(uint32_t(0xFF & mask)); } +static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec4_i32 val, + __vec4_i1 mask) { + return __packed_store_active(ptr, val, mask); +} static FORCEINLINE int32_t __packed_load_active(int32_t *p, __vec8_i32 *val, __vec8_i1 mask) { - __vec8_i32 v = __load<64>(val); - v = _mm512_mask_extloadunpacklo_epi32(v, 0xFF & mask, p, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - v = _mm512_mask_extloadunpackhi_epi32(v, 0xFF & mask, (uint8_t*)p+64, _MM_UPCONV_EPI32_NONE, _MM_HINT_NONE); - __store<64>(val, v); - return _mm_countbits_32(uint32_t(0xFF & mask)); + return __packed_load_active((uint32_t *)p, val, mask); } static FORCEINLINE int32_t __packed_store_active(int32_t *p, __vec8_i32 val, __vec8_i1 mask) { - _mm512_mask_extpackstorelo_epi32(p, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - _mm512_mask_extpackstorehi_epi32((uint8_t*)p+64, 0xFF & mask, val, _MM_DOWNCONV_EPI32_NONE, _MM_HINT_NONE); - return _mm_countbits_32(uint32_t(0xFF & mask)); + return __packed_store_active((uint32_t *)p, val, mask); } +static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec4_i32 val, + __vec4_i1 mask) { + return __packed_store_active(ptr, val, mask); +} + #endif /////////////////////////////////////////////////////////////////////////// diff --git a/examples/intrinsics/knc.h b/examples/intrinsics/knc.h index 8baef8cb..0077ad88 100644 --- a/examples/intrinsics/knc.h +++ b/examples/intrinsics/knc.h @@ -1260,6 +1260,13 @@ static FORCEINLINE __vec16_i64 __cast_zext(const __vec16_i64 &, const __vec16_i3 return __vec16_i64(val.v, _mm512_setzero_epi32()); } +static FORCEINLINE __vec16_i32 __cast_sext(const __vec16_i32 &, const __vec16_i1 &val) +{ + __vec16_i32 ret = _mm512_setzero_epi32(); + __vec16_i32 one = _mm512_set1_epi32(-1); + return _mm512_mask_mov_epi32(ret, val, one); +} + static FORCEINLINE __vec16_i32 __cast_zext(const __vec16_i32 &, const __vec16_i1 &val) { __vec16_i32 ret = _mm512_setzero_epi32(); @@ -1878,6 +1885,11 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *p, __vec16_i32 val, return _mm_countbits_32(uint32_t(mask)); } +static FORCEINLINE int32_t __packed_store_active2(uint32_t *p, __vec16_i32 val, __vec16_i1 mask) +{ + return __packed_store_active(p, val, mask); +} + /////////////////////////////////////////////////////////////////////////// // prefetch /////////////////////////////////////////////////////////////////////////// diff --git a/examples/intrinsics/sse4.h b/examples/intrinsics/sse4.h index 919716be..5dd424d9 100644 --- a/examples/intrinsics/sse4.h +++ b/examples/intrinsics/sse4.h @@ -3798,6 +3798,25 @@ static FORCEINLINE int32_t __packed_store_active(int32_t *ptr, __vec4_i32 val, return count; } +static FORCEINLINE int32_t __packed_store_active2(int32_t *ptr, __vec4_i32 val, + __vec4_i1 mask) { + int count = 0; + + ptr[count] = _mm_extract_epi32(val.v, 0); + count -= _mm_extract_ps(mask.v, 0); + + ptr[count] = _mm_extract_epi32(val.v, 1); + count -= _mm_extract_ps(mask.v, 1); + + ptr[count] = _mm_extract_epi32(val.v, 2); + count -= _mm_extract_ps(mask.v, 2); + + ptr[count] = _mm_extract_epi32(val.v, 3); + count -= _mm_extract_ps(mask.v, 3); + + return count; +} + static FORCEINLINE int32_t __packed_load_active(uint32_t *ptr, __vec4_i32 *val, __vec4_i1 mask) { return __packed_load_active((int32_t *)ptr, val, mask); @@ -3808,6 +3827,11 @@ static FORCEINLINE int32_t __packed_store_active(uint32_t *ptr, __vec4_i32 val, return __packed_store_active((int32_t *)ptr, val, mask); } +static FORCEINLINE int32_t __packed_store_active2(uint32_t *ptr, __vec4_i32 val, + __vec4_i1 mask) { + return __packed_store_active2((int32_t *)ptr, val, mask); +} + /////////////////////////////////////////////////////////////////////////// // aos/soa diff --git a/examples/mandelbrot/mandelbrot.vcxproj b/examples/mandelbrot/mandelbrot.vcxproj index e7703ad0..7a5f6e03 100644 --- a/examples/mandelbrot/mandelbrot.vcxproj +++ b/examples/mandelbrot/mandelbrot.vcxproj @@ -1,5 +1,23 @@  + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C1} Win32Proj diff --git a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj index f8b8cfcb..113fc4e8 100644 --- a/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj +++ b/examples/mandelbrot_tasks/mandelbrot_tasks.vcxproj @@ -1,5 +1,23 @@ + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {E80DA7D4-AB22-4648-A068-327307156BE6} Win32Proj diff --git a/examples/noise/noise.vcxproj b/examples/noise/noise.vcxproj index 7adc57f3..ff3953ae 100644 --- a/examples/noise/noise.vcxproj +++ b/examples/noise/noise.vcxproj @@ -1,5 +1,23 @@ + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {0E0886D8-8B5E-4EAF-9A21-91E63DAF81FD} Win32Proj diff --git a/examples/options/options.vcxproj b/examples/options/options.vcxproj index af336aa1..d48ac8bc 100644 --- a/examples/options/options.vcxproj +++ b/examples/options/options.vcxproj @@ -1,5 +1,23 @@ + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {8C7B5D29-1E76-44E6-BBB8-09830E5DEEAE} Win32Proj diff --git a/examples/rt/rt.vcxproj b/examples/rt/rt.vcxproj index ea34de56..00b6dd3a 100644 --- a/examples/rt/rt.vcxproj +++ b/examples/rt/rt.vcxproj @@ -1,5 +1,23 @@ + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {E787BC3F-2D2E-425E-A64D-4721E2FF3DC9} Win32Proj diff --git a/examples/sort/sort.vcxproj b/examples/sort/sort.vcxproj index 43f2b439..b0bdc63d 100644 --- a/examples/sort/sort.vcxproj +++ b/examples/sort/sort.vcxproj @@ -1,5 +1,23 @@  + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {6D3EF8C5-AE26-407B-9ECE-C27CB988D9C2} Win32Proj diff --git a/examples/stencil/stencil.vcxproj b/examples/stencil/stencil.vcxproj index b5f5bb22..fd8564aa 100644 --- a/examples/stencil/stencil.vcxproj +++ b/examples/stencil/stencil.vcxproj @@ -1,5 +1,23 @@ + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {2ef070a1-f62f-4e6a-944b-88d140945c3c} Win32Proj diff --git a/examples/tasksys.cpp b/examples/tasksys.cpp index f8b2dfec..77269f9f 100644 --- a/examples/tasksys.cpp +++ b/examples/tasksys.cpp @@ -170,17 +170,44 @@ // Signature of ispc-generated 'task' functions typedef void (*TaskFuncType)(void *data, int threadIndex, int threadCount, - int taskIndex, int taskCount); + int taskIndex, int taskCount, + int taskIndex0, int taskIndex1, int taskIndex2, + int taskCount0, int taskCount1, int taskCount2); // Small structure used to hold the data for each task +#ifdef _MSC_VER +__declspec(align(16)) +#endif struct TaskInfo { TaskFuncType func; void *data; - int taskIndex, taskCount; + int taskIndex; + int taskCount3d[3]; #if defined(ISPC_IS_WINDOWS) event taskEvent; #endif -}; + int taskCount() const { return taskCount3d[0]*taskCount3d[1]*taskCount3d[2]; } + int taskIndex0() const + { + return taskIndex % taskCount3d[0]; + } + int taskIndex1() const + { + return ( taskIndex / taskCount3d[0] ) % taskCount3d[1]; + } + int taskIndex2() const + { + return taskIndex / ( taskCount3d[0]*taskCount3d[1] ); + } + int taskCount0() const { return taskCount3d[0]; } + int taskCount1() const { return taskCount3d[1]; } + int taskCount2() const { return taskCount3d[2]; } + TaskInfo() { assert(sizeof(TaskInfo) % 32 == 0); } +} +#ifndef _MSC_VER +__attribute__((aligned(32))); +#endif +; // ispc expects these functions to have C linkage / not be mangled extern "C" { @@ -518,7 +545,9 @@ lRunTask(void *ti) { // Actually run the task taskInfo->func(taskInfo->data, threadIndex, threadCount, - taskInfo->taskIndex, taskInfo->taskCount); + taskInfo->taskIndex, taskInfo->taskCount(), + taskInfo->taskIndex0(), taskInfo->taskIndex1(), taskInfo->taskIndex2(), + taskInfo->taskCount0(), taskInfo->taskCount1(), taskInfo->taskCount2()); } @@ -559,7 +588,9 @@ lRunTask(LPVOID param) { // will cause bugs in code that uses those. int threadIndex = 0; int threadCount = 1; - ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount); + ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), + ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), + ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); // Signal the event that this task is done ti->taskEvent.set(); @@ -660,7 +691,9 @@ lTaskEntry(void *arg) { DBG(fprintf(stderr, "running task %d from group %p\n", taskNumber, tg)); TaskInfo *myTask = tg->GetTaskInfo(taskNumber); myTask->func(myTask->data, threadIndex, threadCount, myTask->taskIndex, - myTask->taskCount); + myTask->taskCount(), + myTask->taskIndex0(), myTask->taskIndex1(), myTask->taskIndex2(), + myTask->taskCount0(), myTask->taskCount1(), myTask->taskCount2()); // // Decrement the "number of unfinished tasks" counter in the task @@ -871,7 +904,9 @@ TaskGroup::Sync() { // Do work for _myTask_ // // FIXME: bogus values for thread index/thread count here as well.. - myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount); + myTask->func(myTask->data, 0, 1, myTask->taskIndex, myTask->taskCount(), + myTask->taskIndex0(), myTask->taskIndex1(), myTask->taskIndex2(), + myTask->taskCount0(), myTask->taskCount1(), myTask->taskCount2()); // // Decrement the number of unfinished tasks counter @@ -901,7 +936,9 @@ TaskGroup::Launch(int baseIndex, int count) { // Actually run the task. // Cilk does not expose the task -> thread mapping so we pretend it's 1:1 - ti->func(ti->data, ti->taskIndex, ti->taskCount, ti->taskIndex, ti->taskCount); + ti->func(ti->data, ti->taskIndex, ti->taskCount(), + ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), + ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); } } @@ -930,7 +967,9 @@ TaskGroup::Launch(int baseIndex, int count) { // Actually run the task. int threadIndex = omp_get_thread_num(); int threadCount = omp_get_num_threads(); - ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount); + ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), + ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), + ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); } } @@ -961,7 +1000,9 @@ TaskGroup::Launch(int baseIndex, int count) { int threadIndex = ti->taskIndex; int threadCount = ti->taskCount; - ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount); + ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), + ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), + ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); }); } @@ -988,7 +1029,9 @@ TaskGroup::Launch(int baseIndex, int count) { // TBB does not expose the task -> thread mapping so we pretend it's 1:1 int threadIndex = ti->taskIndex; int threadCount = ti->taskCount; - ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount); + ti->func(ti->data, threadIndex, threadCount, ti->taskIndex, ti->taskCount(), + ti->taskIndex0(), ti->taskIndex1(), ti->taskIndex2(), + ti->taskCount0(), ti->taskCount1(), ti->taskCount2()); }); } } @@ -1041,7 +1084,8 @@ FreeTaskGroup(TaskGroup *tg) { /////////////////////////////////////////////////////////////////////////// void -ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) { +ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count0, int count1, int count2) { + const int count = count0*count1*count2; TaskGroup *taskGroup; if (*taskGroupPtr == NULL) { InitTaskSystem(); @@ -1057,7 +1101,9 @@ ISPCLaunch(void **taskGroupPtr, void *func, void *data, int count) { ti->func = (TaskFuncType)func; ti->data = data; ti->taskIndex = i; - ti->taskCount = count; + ti->taskCount3d[0] = count0; + ti->taskCount3d[1] = count1; + ti->taskCount3d[2] = count2; } taskGroup->Launch(baseIndex, count); } diff --git a/examples/volume_rendering/volume.vcxproj b/examples/volume_rendering/volume.vcxproj index cc738a7e..a1fea5f1 100644 --- a/examples/volume_rendering/volume.vcxproj +++ b/examples/volume_rendering/volume.vcxproj @@ -1,5 +1,23 @@ + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + {dee5733a-e93e-449d-9114-9bffcaeb4df9} Win32Proj diff --git a/ispc.h b/ispc.h index 652390d2..d649b6cd 100644 --- a/ispc.h +++ b/ispc.h @@ -38,7 +38,7 @@ #ifndef ISPC_H #define ISPC_H -#define ISPC_VERSION "1.5.1dev" +#define ISPC_VERSION "1.6.1dev" #if !defined(LLVM_3_1) && !defined(LLVM_3_2) && !defined(LLVM_3_3) && !defined(LLVM_3_4) && !defined(LLVM_3_5) #error "Only LLVM 3.1, 3.2, 3.3, 3.4 and the 3.5 development branch are supported" diff --git a/opt.cpp b/opt.cpp index c75d4225..ff7ee410 100644 --- a/opt.cpp +++ b/opt.cpp @@ -5153,6 +5153,11 @@ FixBooleanSelectPass::runOnFunction(llvm::Function &F) { // LLVM 3.3 only #if defined(LLVM_3_3) + // Don't optimize generic targets. + if (g->target->getISA() == Target::GENERIC) { + return false; + } + for (llvm::Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { llvm::BasicBlock* bb = &*I;